/*
 * NLPFeaturesOfDoc.java
 *
 * Yaoyong Li 22/03/2007
 *
 * $Id: NLPFeaturesOfDoc.java, v 1.0 2007-03-22 12:58:16 +0000 yaoyong $
 */
package gate.learning;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import gate.Annotation;
import gate.AnnotationSet;
import gate.FeatureMap;
import gate.util.OffsetComparator;
import gate.learning.Ngram;

/**
 * Obtain the NLP (linguistic) features from the GATE annotations of one
 * document.
 */
public class NLPFeaturesOfDoc {
  /** One component stores all the features for one instance. */
  StringBuffer[] featuresInLine;
  /** Feature names. */
  StringBuffer featuresName;
  /** Document id. */
  private String docId = null;
  /** Number of instances in the document. */
  int numInstances = 0;
  /** Total number of GATE types of NLP features. */
  int totalnumTypes = 0;
  /** Number of features counted for each instance. */
  int[] featuresCounted;
  /** Stores the class name(s) for each instance. */
  String[] classNames;

  /** Constructor with no parameters. */
  public NLPFeaturesOfDoc() {
  }

  /**
   * Constructor; obtains NLP features from GATE annotations for each instance
   * in the document.
   */
  public NLPFeaturesOfDoc(AnnotationSet annotations, String instanceType,
    String docName) {
    // Number of instances (tokens) in the document
    numInstances = annotations.get(instanceType).size();
    featuresInLine = new StringBuffer[numInstances];
    featuresName = new StringBuffer();
    totalnumTypes = 0;
    featuresCounted = new int[numInstances];
    classNames = new String[numInstances];
    docId = docName;
  }

  /**
   * Entry method for getting the NLP features according to the specifications
   * in the dataset definition files.
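   * <p>
   * A minimal usage sketch (an illustration, not code from this plugin; it
   * assumes {@code dsd} has already been loaded from the dataset definition
   * file and {@code annotations} is the annotation set of the document
   * {@code doc}):
   * <pre>
   * NLPFeaturesOfDoc nlpDoc = new NLPFeaturesOfDoc(annotations,
   *   dsd.getInstanceType(), doc.getName());
   * nlpDoc.obtainDocNLPFeatures(annotations, dsd);
   * </pre>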
   */
  public void obtainDocNLPFeatures(AnnotationSet annotations,
    DataSetDefinition dsd) {
    if(dsd.dataType == DataSetDefinition.RelationData) {
      // get the features for the relation data
      int initialPosition = 0;
      if(dsd.arg1 != null) {
        ArgOfRelation arg = dsd.arg1;
        // match arg1 instances against the relation's first-argument feature
        boolean[][] isArgInRel = matchArgInstanceWithInst(annotations, dsd
          .getInstanceType(), arg.type, dsd.arg1Feat, arg.feat);
        if(arg.arrs.numTypes > 0)
          gatedoc2NLPFeaturesArg(annotations, arg.type,
            arg.arrs.typesInDataSetDef, arg.arrs.featuresInDataSetDef,
            arg.arrs.namesInDataSetDef, arg.arrs.featurePosition, isArgInRel,
            initialPosition);
        if(arg.arrs.numNgrams > 0)
          gatedoc2NgramFeaturesArg(annotations, arg.type, arg.ngrams,
            isArgInRel, initialPosition);
        initialPosition += arg.maxTotalPosition + 1;
      }
      if(dsd.arg2 != null) {
        ArgOfRelation arg = dsd.arg2;
        // match arg2 instances against the relation's second-argument feature
        boolean[][] isArgInRel = matchArgInstanceWithInst(annotations, dsd
          .getInstanceType(), arg.type, dsd.arg2Feat, arg.feat);
        if(arg.arrs.numTypes > 0)
          gatedoc2NLPFeaturesArg(annotations, arg.type,
            arg.arrs.typesInDataSetDef, arg.arrs.featuresInDataSetDef,
            arg.arrs.namesInDataSetDef, arg.arrs.featurePosition, isArgInRel,
            initialPosition);
        if(arg.arrs.numNgrams > 0)
          gatedoc2NgramFeaturesArg(annotations, arg.type, arg.ngrams,
            isArgInRel, initialPosition);
      }
      if(dsd.relAttributes != null)
        gatedoc2NLPFeaturesRel(annotations, dsd.getInstanceType(), dsd.arg1Feat,
          dsd.arg2Feat, dsd.arrs.typesInDataSetDef,
          dsd.arrs.featuresInDataSetDef, dsd.arrs.namesInDataSetDef,
          dsd.arrs.arg1s, dsd.arrs.arg2s, dsd.arrs.featurePosition);
      // get the label from the class attribute
      gatedoc2LabelsCompleteRel(annotations, dsd.getInstanceType(),
        dsd.arg1Feat, dsd.arg2Feat, dsd.arrs.classType, dsd.arrs.classFeature,
        dsd.arrs.classArg1, dsd.arrs.classArg2);
    } else {
      // get the NLP features from the attributes
      if(dsd.arrs.numTypes > 0)
        gatedoc2NLPFeatures(annotations, dsd.getInstanceType(),
          dsd.arrs.typesInDataSetDef, dsd.arrs.featuresInDataSetDef,
          dsd.arrs.namesInDataSetDef, dsd.arrs.featurePosition);
      if(dsd.arrs.numNgrams > 0)
        gatedoc2NgramFeatures(annotations, dsd.getInstanceType(), dsd
          .getNgrams());
      // get the label from the class attribute
      gatedoc2LabelsComplete(annotations, dsd.getInstanceType(),
        dsd.arrs.classType, dsd.arrs.classFeature);
    }
  }

  /** Get the N-gram features from the GATE document. */
  public void gatedoc2NgramFeatures(AnnotationSet annotations,
    String instanceType, java.util.List ngrams) {
    AnnotationSet anns = annotations.get(instanceType);
    ArrayList annotationArray = (anns == null || anns.isEmpty()) ?
new ArrayList() : new ArrayList(anns); Collections.sort(annotationArray, new OffsetComparator()); if(numInstances != annotationArray.size()) { System.out.println("!!Warning: the number of instances " + new Integer(numInstances) + " in the document " + docId + " is not right!!!"); return; } int numNgrams = ngrams.size(); // For each ngram for(int i1 = 0; i1 < numNgrams; ++i1) { Ngram ngram = (Ngram)ngrams.get(i1); String nameOfNgram = ngram.getName(); int ngramPosition = ngram.position; String positionStr = obtainPositionStr(ngramPosition); featuresName.append(nameOfNgram + ConstantParameters.ITEMSEPARATOR); int consNum= ngram.getConsnum(); String [] typeGateNgram = new String[consNum]; String [] featureGateNgram = new String[consNum]; for(int j=0; j<consNum; ++j) { typeGateNgram[j] = (ngram.getTypessGate())[j]; featureGateNgram[j] = (ngram.getFeaturesGate())[j]; } AnnotationSet [] annsArray = new AnnotationSet[consNum]; for(int j=0; j<consNum; ++j) { annsArray[j] = (AnnotationSet)annotations.get(typeGateNgram[j]); } for(int i = 0; i < numInstances; ++i) { Annotation annToken = (Annotation)annotationArray.get(i); Long tokenStartOffset = annToken.getStartNode().getOffset(); Long tokenEndOffset = annToken.getEndNode().getOffset(); //AnnotationSet annsNgramType = annotations.get(typeGateNgram, // tokenStartOffset, tokenEndOffset); AnnotationSet annsNgramType = annsArray[0].get(tokenStartOffset, tokenEndOffset); String[] features; features = obtainNgramFeatures(annsNgramType, featureGateNgram[0]); int numFeats = features.length; int number = ngram.getNumber(); if(numFeats>=number) { //if the instance has enough number of features for the defined ngram for(int j = 1; j < consNum; j++) { String[] features1; if(typeGateNgram[j].equals(typeGateNgram[0])) features1 = obtainNgramFeatures(annsNgramType, featureGateNgram[j]); else features1 = obtainNgramFeaturesFromDifferentType(annsNgramType, annsArray[j].get(tokenStartOffset, tokenEndOffset), featureGateNgram[j]); for(int j1 = 0; j1 < features.length; ++j1) features[j1] = features[j1] + "_" + features1[j1]; } // get the ngram features StringBuffer[] featuresNgram = new StringBuffer[numFeats - number + 1]; for(int j = 0; j < featuresNgram.length; ++j) featuresNgram[j] = new StringBuffer(); for(int j = 0; j < number; ++j) { for(int j1 = j; j1 < numFeats - number + 1 + j; ++j1) { featuresNgram[j1 - j].append(features[j1] + NLPFeaturesList.SYMBOLNGARM); } } Hashtable ngramTerms = new Hashtable(); for(int j = 0; j < featuresNgram.length; ++j) if(!ngramTerms.containsKey(featuresNgram[j].toString())) ngramTerms.put(featuresNgram[j].toString(), "1"); else ngramTerms.put(featuresNgram[j].toString(), new Integer((new Integer(ngramTerms .get(featuresNgram[j].toString()).toString())).intValue() + 1)); List keys = new ArrayList(ngramTerms.keySet()); Collections.sort(keys); Iterator iterator = keys.iterator(); if(featuresInLine[i] == null) featuresInLine[i] = new StringBuffer(); while(iterator.hasNext()) { Object key = iterator.next(); if(ngramPosition != 0) this.featuresInLine[i].append(obtainFeatureName(nameOfNgram, key .toString() + NLPFeaturesList.SYMBOLNGARM + ngramTerms.get(key).toString()) + positionStr + ConstantParameters.ITEMSEPARATOR); else this.featuresInLine[i].append(obtainFeatureName(nameOfNgram, key .toString() + NLPFeaturesList.SYMBOLNGARM + ngramTerms.get(key).toString()) + ConstantParameters.ITEMSEPARATOR); ++featuresCounted[i]; } }//if the number of features is not less than the n of the n-gram }// end of the loop on instances } // end of the 
loop on number of ngrams } /** * Obtain the string for the position, which is attached at the end of the nlp * feature. */ String obtainPositionStr(int ngramPosition) { return "[" + (new Integer(ngramPosition)).toString() + "]"; } /** Obtain the N-gram features from an annotation set. */ private String[] obtainNgramFeatures(AnnotationSet annsNgramType, String gateFeature) { int num = annsNgramType.size(); String[] feats = new String[num]; ArrayList annotationArray = (annsNgramType == null || annsNgramType .isEmpty()) ? new ArrayList() : new ArrayList(annsNgramType); Collections.sort(annotationArray, new OffsetComparator()); for(int i = 0; i < num; ++i) { feats[i] = (String)((Annotation)annotationArray.get(i)).getFeatures() .get(gateFeature); if(feats[i]==null) feats[i] = ConstantParameters.NAMENONFEATURE; feats[i] = feats[i].trim().replaceAll(ConstantParameters.ITEMSEPARATOR, ConstantParameters.ITEMSEPREPLACEMENT); } return feats; } /** * Obtain the N-gram features from an annotation set for the Annotation type * which is different from the instance's type. */ private String[] obtainNgramFeaturesFromDifferentType( AnnotationSet annsNgramType, AnnotationSet annsCurrent, String gateFeature) { int num = annsNgramType.size(); String[] feats = new String[num]; ArrayList annotationArray = (annsNgramType == null || annsNgramType .isEmpty()) ? new ArrayList() : new ArrayList(annsNgramType); Collections.sort(annotationArray, new OffsetComparator()); for(int i = 0; i < num; ++i) { feats[i] = obtainAnnotationForTypeAndFeature(annsCurrent, gateFeature, ((Annotation)(annotationArray.get(i))).getStartNode().getOffset(), ((Annotation)(annotationArray.get(i))).getEndNode().getOffset()); if(feats[i] != null) feats[i] = feats[i].trim().replaceAll(ConstantParameters.ITEMSEPARATOR, ConstantParameters.ITEMSEPREPLACEMENT); } return feats; } /** Get the labels of each instance in the document. */ public void gatedoc2LabelsComplete(AnnotationSet annotations, String instanceType, String classType, String classFeature) { AnnotationSet anns = annotations.get(instanceType); ArrayList annotationArray = (anns == null || anns.isEmpty()) ? 
new ArrayList() : new ArrayList(anns); Collections.sort(annotationArray, new OffsetComparator()); if(numInstances != annotationArray.size()) { System.out.println("!!Warning: the number of instances " + new Integer(numInstances) + " in the document " + docId + " is not right!!!"); return; } // For each of entity AnnotationSet annsEntity = annotations.get(classType); for(Object obj : annsEntity) { Annotation annEntity = (Annotation)obj; if(annEntity.getFeatures().get(classFeature) == null) continue; String featName = annEntity.getFeatures().get(classFeature).toString(); featName = featName.trim(); featName = featName.replaceAll(ConstantParameters.SUFFIXSTARTTOKEN, ConstantParameters.SUFFIXSTARTTOKEN + "_"); featName = featName.replaceAll(ConstantParameters.ITEMSEPARATOR, "_"); //Get the multilabel from one instance String [] featNameArray = featName.split(ConstantParameters.MULTILABELSEPARATOR); boolean isStart = true; for(int i = 0; i < numInstances; ++i) { Annotation annToken = (Annotation)annotationArray.get(i); if(annToken.overlaps(annEntity)) { String featName0 = ""; if(isStart) { for(int j=0; j<featNameArray.length; ++j) { if(j>0) featName0 += ConstantParameters.ITEMSEPARATOR; featName0 += featNameArray[j]+ConstantParameters.SUFFIXSTARTTOKEN; } isStart = false; } else for(int j=0; j<featNameArray.length; ++j) { if(j>0) featName0 += ConstantParameters.ITEMSEPARATOR; featName0 += featNameArray[j]; } if(featName0.length() > 0) { if(this.classNames[i] != null) this.classNames[i] += ConstantParameters.ITEMSEPARATOR + featName0; else this.classNames[i] = featName0; } } } } } /** Get the Attribute feature for each instance of the document. */ public void gatedoc2NLPFeatures(AnnotationSet annotations, String instanceType, String[] typesGate, String[] featuresGate, String[] namesGate, int[] featurePosition) { int numTypes = typesGate.length; this.totalnumTypes += numTypes; for(int i = 0; i < numTypes; ++i) { this.featuresName.append(namesGate[i] + ConstantParameters.ITEMSEPARATOR); } String[] positionArrStr = new String[numTypes]; for(int i = 0; i < numTypes; ++i) { if(featurePosition[i] != 0) positionArrStr[i] = obtainPositionStr(featurePosition[i]); } AnnotationSet anns = annotations.get(instanceType); ArrayList<Annotation>annotationArray = (anns == null || anns.isEmpty()) ? 
      new ArrayList<Annotation>() : new ArrayList<Annotation>(anns);
    Collections.sort(annotationArray, new OffsetComparator());
    String[] features = new String[numTypes];
    int numInstances0 = annotationArray.size();
    AnnotationSet[] annsArray = new AnnotationSet[numTypes];
    for(int j = 0; j < numTypes; ++j) {
      annsArray[j] = (AnnotationSet)annotations.get(typesGate[j]);
    }
    for(int i = 0; i < numInstances0; ++i) { // for class
      Annotation annToken;
      for(int j = 0; j < numTypes; j++) {
        // for each attribute in different positions, get the token in
        // the corresponding position
        if(featurePosition[j] == 0)
          annToken = (Annotation)annotationArray.get(i);
        else if((featurePosition[j] < 0 && i + featurePosition[j] >= 0)
          || (featurePosition[j] > 0 && i + featurePosition[j] < numInstances0))
          annToken = (Annotation)annotationArray.get(i + featurePosition[j]);
        else continue;
        if(typesGate[j].equals(instanceType)) {
          features[j] = (String)annToken.getFeatures().get(featuresGate[j]);
        } else { // if not belongs to token
          Long tokenStartOffset = annToken.getStartNode().getOffset();
          Long tokenEndOffset = annToken.getEndNode().getOffset();
          features[j] = obtainAnnotationForTypeAndFeature(annsArray[j],
            featuresGate[j], tokenStartOffset, tokenEndOffset);
        }
        // put the name into the feature name
        if(features[j] != null) {
          features[j] = features[j].trim().replaceAll(
            ConstantParameters.ITEMSEPARATOR,
            ConstantParameters.ITEMSEPREPLACEMENT);
          features[j] = obtainFeatureName(namesGate[j], features[j]);
        }
      }// end of the loop on the types
      int numCounted = 0;
      if(featuresInLine[i] == null) featuresInLine[i] = new StringBuffer();
      for(int j = 0; j < numTypes; ++j) {
        if(features[j] != null) {
          ++numCounted;
          if(featurePosition[j] != 0)
            this.featuresInLine[i].append(features[j] + positionArrStr[j]
              + ConstantParameters.ITEMSEPARATOR);
          else this.featuresInLine[i].append(features[j]
            + ConstantParameters.ITEMSEPARATOR);
        } else {
          if(featurePosition[j] != 0)
            this.featuresInLine[i].append(ConstantParameters.NAMENONFEATURE
              + positionArrStr[j] + ConstantParameters.ITEMSEPARATOR);
          else this.featuresInLine[i].append(ConstantParameters.NAMENONFEATURE
            + ConstantParameters.ITEMSEPARATOR);
        }
      }
      // record how many features were found for this instance
      featuresCounted[i] += numCounted;
    }// end of the loop on instances
  }

  /** Get the N-gram features from the GATE document. */
  public void gatedoc2NgramFeaturesArg(AnnotationSet annotations,
    String instanceType, java.util.List ngrams, boolean[][] isArgInRel,
    int initialPosition) {
    AnnotationSet anns = annotations.get(instanceType);
    ArrayList<Annotation> annotationArray = (anns == null || anns.isEmpty()) ?
new ArrayList<Annotation>() : new ArrayList<Annotation>(anns); Collections.sort(annotationArray, new OffsetComparator()); int numInstances0 = annotationArray.size(); int numNgrams = ngrams.size(); // For each ngram for(int i1 = 0; i1 < numNgrams; ++i1) { Ngram ngram = (Ngram)ngrams.get(i1); String nameOfNgram = ngram.getName(); int ngramPosition = ngram.position; if(ngramPosition>=0) ngramPosition += initialPosition; else ngramPosition -= initialPosition; String positionStr = obtainPositionStr(ngramPosition); featuresName.append(nameOfNgram + ConstantParameters.ITEMSEPARATOR); int consNum= ngram.getConsnum(); //String typeGateNgram = (ngram.getTypessGate())[0]; String [] typeGateNgram = new String[consNum]; String [] featureGateNgram = new String[consNum]; for(int j=0; j<consNum; ++j) { typeGateNgram[j] = (ngram.getTypessGate())[j]; featureGateNgram[j] = (ngram.getFeaturesGate())[j]; } AnnotationSet [] annsArray = new AnnotationSet[consNum]; for(int j=0; j<consNum; ++j) { annsArray[j] = (AnnotationSet)annotations.get(typeGateNgram[j]); } for(int i = 0; i < numInstances0; ++i) { Annotation annToken = annotationArray.get(i); Long tokenStartOffset = annToken.getStartNode().getOffset(); Long tokenEndOffset = annToken.getEndNode().getOffset(); AnnotationSet annsNgramType = annsArray[0].get(tokenStartOffset, tokenEndOffset); String[] features = obtainNgramFeatures(annsNgramType, featureGateNgram[0]); int numFeats = features.length; int number = ngram.getNumber(); if(numFeats>=number) { for(int j = 1; j < consNum; j++) { String[] features1; if(typeGateNgram[j].equals(typeGateNgram[0])) features1 = obtainNgramFeatures(annsNgramType, featureGateNgram[j]); else features1 = obtainNgramFeaturesFromDifferentType(annsNgramType, annsArray[j].get(tokenStartOffset, tokenEndOffset), featureGateNgram[j]); for(int j1 = 0; j1 < features.length; ++j1) features[j1] = features[j1] + "_" + features1[j1]; } // get the ngram features StringBuffer[] featuresNgram = new StringBuffer[numFeats - number + 1]; for(int j = 0; j < featuresNgram.length; ++j) featuresNgram[j] = new StringBuffer(); for(int j = 0; j < number; ++j) { for(int j1 = j; j1 < numFeats - number + 1 + j; ++j1) { featuresNgram[j1 - j].append(features[j1] + NLPFeaturesList.SYMBOLNGARM); } } Hashtable<String,Integer>ngramTerms = new Hashtable<String,Integer>(); for(int j = 0; j < featuresNgram.length; ++j) if(!ngramTerms.containsKey(featuresNgram[j].toString())) ngramTerms.put(featuresNgram[j].toString(), new Integer(1)); else ngramTerms.put(featuresNgram[j].toString(), new Integer(ngramTerms .get(featuresNgram[j].toString()).intValue() + 1)); List<String>keys = new ArrayList<String>(ngramTerms.keySet()); Collections.sort(keys); //Iterator iterator = keys.iterator(); //while(iterator.hasNext()) { for(int iK=0; iK<keys.size(); ++iK) { //Object key = iterator.next(); String key = keys.get(iK); //For each relation data with the current one as its argument for(int ii = 0; ii < numInstances; ++ii) { if(isArgInRel[i][ii]) { if(featuresInLine[ii] == null) featuresInLine[ii] = new StringBuffer(); if(ngramPosition != 0) this.featuresInLine[ii].append(obtainFeatureName(nameOfNgram, key + NLPFeaturesList.SYMBOLNGARM + ngramTerms.get(key)) + positionStr + ConstantParameters.ITEMSEPARATOR); else this.featuresInLine[ii].append(obtainFeatureName(nameOfNgram, key + NLPFeaturesList.SYMBOLNGARM + ngramTerms.get(key)) + ConstantParameters.ITEMSEPARATOR); ++featuresCounted[ii]; } }//for each instance } } }// end of the loop on instances } // end of the loop on number of 
ngrams } /** Get the NLP feature for the argument feature of relation data. */ public void gatedoc2NLPFeaturesArg(AnnotationSet annotations, String instanceType, String[] typesGate, String[] featuresGate, String[] namesGate, int[] featurePosition, boolean[][] isArgInRel, int initialPosition) { int numTypes = typesGate.length; this.totalnumTypes += numTypes; for(int i = 0; i < numTypes; ++i) { this.featuresName.append(namesGate[i] + ConstantParameters.ITEMSEPARATOR); } int [] positionNum = new int[numTypes]; String[] positionArrStr = new String[numTypes]; for(int i = 0; i < numTypes; ++i) { if(featurePosition[i]>=0) positionNum[i] = featurePosition[i] + initialPosition; else positionNum[i] = featurePosition[i] - initialPosition; if(positionNum[i] != 0) positionArrStr[i] = obtainPositionStr(featurePosition[i]); } AnnotationSet anns = annotations.get(instanceType); ArrayList annotationArray = (anns == null || anns.isEmpty()) ? new ArrayList() : new ArrayList(anns); Collections.sort(annotationArray, new OffsetComparator()); String[] features = new String[numTypes]; int numInstances0 = annotationArray.size(); AnnotationSet [] annsArray = new AnnotationSet[numTypes]; for(int j=0; j<numTypes; ++j) { annsArray[j] = (AnnotationSet)annotations .get(typesGate[j]); } for(int i = 0; i < numInstances0; ++i) { // for class Annotation annToken; for(int j = 0; j < numTypes; j++) { // for each attribute in different positions, get the token in // the corresponding position if(featurePosition[j] == 0) annToken = (Annotation)annotationArray.get(i); else if((featurePosition[j] < 0 && i + featurePosition[j] >= 0) || (featurePosition[j] > 0 && i + featurePosition[j] < numInstances0)) annToken = (Annotation)annotationArray.get(i + featurePosition[j]); else continue; if(typesGate[j].equals(instanceType)) { features[j] = (String)annToken.getFeatures().get(featuresGate[j]);// types[i]; // //(String)annToken.getFeatures().get(attr.getFeature()); } else { // if not belongs to token Long tokenStartOffset = annToken.getStartNode().getOffset(); Long tokenEndOffset = annToken.getEndNode().getOffset(); features[j] = obtainAnnotationForTypeAndFeature(annsArray[j], featuresGate[j], tokenStartOffset, tokenEndOffset); } // put the name into the feature name if(features[j] != null) { features[j] = features[j].trim().replaceAll( ConstantParameters.ITEMSEPARATOR, ConstantParameters.ITEMSEPREPLACEMENT); features[j] = obtainFeatureName(namesGate[j], features[j]); } }// end of the loop on the types // For each relation data with the current one as its argument for(int ii = 0; ii < numInstances; ++ii) { if(isArgInRel[i][ii]) { int numCounted = 0; if(featuresInLine[ii] == null) featuresInLine[ii] = new StringBuffer(); for(int j = 0; j < numTypes; ++j) { if(features[j] instanceof String) { ++numCounted; if(positionNum[j]!=0) this.featuresInLine[ii].append(features[j] + positionArrStr[j]+ ConstantParameters.ITEMSEPARATOR); else this.featuresInLine[ii].append(features[j] + ConstantParameters.ITEMSEPARATOR); } else if(positionNum[j]!=0) this.featuresInLine[ii].append(ConstantParameters.NAMENONFEATURE + positionArrStr[j]+ConstantParameters.ITEMSEPARATOR); else this.featuresInLine[ii].append(ConstantParameters.NAMENONFEATURE + ConstantParameters.ITEMSEPARATOR); } featuresCounted[ii] += numCounted; } } }// end of the loop on instances } /** Match the argument instance with the relation instance. 
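   * <p>
   * The returned matrix records which argument annotation fills which relation
   * annotation (both lists sorted by offset): {@code result[i][j]} is
   * {@code true} iff the value of the feature {@code argF} on the i-th
   * annotation of type {@code instanceType} equals the value of the feature
   * {@code relArgF} on the j-th annotation of type {@code relInstanceType}.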
   */
  boolean[][] matchArgInstanceWithInst(AnnotationSet annotations,
    String relInstanceType, String instanceType, String relArgF, String argF) {
    // Get the instance array
    AnnotationSet anns = annotations.get(instanceType);
    ArrayList annotationArray = (anns == null || anns.isEmpty())
      ? new ArrayList()
      : new ArrayList(anns);
    Collections.sort(annotationArray, new OffsetComparator());
    // Get the relation instance array
    AnnotationSet relAnns = annotations.get(relInstanceType);
    ArrayList relAnnotationArray = (relAnns == null || relAnns.isEmpty())
      ? new ArrayList()
      : new ArrayList(relAnns);
    Collections.sort(relAnnotationArray, new OffsetComparator());
    // Assign the match
    boolean[][] isArgInRel = new boolean[annotationArray.size()][relAnnotationArray
      .size()];
    for(int i = 0; i < annotationArray.size(); ++i) {
      Annotation ann = (Annotation)annotationArray.get(i);
      String argV = ann.getFeatures().get(argF).toString();
      for(int ii = 0; ii < relAnnotationArray.size(); ++ii) {
        String argRelV = ((Annotation)relAnnotationArray.get(ii)).getFeatures()
          .get(relArgF).toString();
        if(argV.equals(argRelV)) isArgInRel[i][ii] = true;
        else isArgInRel[i][ii] = false;
      }
    }
    return isArgInRel;
  }

  /** Get the annotation of a type different from the instance type. */
  String obtainAnnotationForTypeAndFeature(AnnotationSet singleAnnSet,
    String gateFeature, Long tokenStartOffset, Long tokenEndOffset) {
    if(singleAnnSet instanceof AnnotationSet) {
      AnnotationSet coverAnnSet = (AnnotationSet)singleAnnSet.get(
        tokenStartOffset, tokenEndOffset);
      Iterator overlappingIterator = coverAnnSet.iterator();
      if(overlappingIterator.hasNext()) {
        Annotation superannotation = (Annotation)overlappingIterator.next();
        return (String)superannotation.getFeatures().get(gateFeature);
      }
    }
    return null;
  }

  /**
   * Get the annotation of a type different from the instance type, for
   * relation learning.
   */
  String obtainAnnotationForTypeAndFeatureRel(String arg1V, String arg2V,
    AnnotationSet singleAnnSet, String gateFeature, String arg1F, String arg2F) {
    if(singleAnnSet instanceof AnnotationSet) {
      Iterator overlappingIterator = singleAnnSet.iterator();
      if(overlappingIterator.hasNext()) {
        Annotation superannotation = (Annotation)overlappingIterator.next();
        FeatureMap feat0 = superannotation.getFeatures();
        if(arg1V.equals(feat0.get(arg1F)) && arg2V.equals(feat0.get(arg2F))) {
          String feat = feat0.get(gateFeature).toString();
          return feat;
        }
      }
    }
    return null;
  }

  /**
   * Get the Attribute-Rel features from annotations for relation learning.
   */
  public void gatedoc2NLPFeaturesRel(AnnotationSet annotations,
    String instanceType, String arg1Inst, String arg2Inst, String[] typesGate,
    String[] featuresGate, String[] namesGate, String[] arg1s, String[] arg2s,
    int[] featurePosition) {
    int numTypes = typesGate.length;
    this.totalnumTypes += numTypes;
    for(int i = 0; i < numTypes; ++i) {
      this.featuresName.append(namesGate[i] + ConstantParameters.ITEMSEPARATOR);
    }
    String[] strPosition = new String[numTypes];
    for(int i = 0; i < numTypes; ++i) {
      if(featurePosition[i] != 0)
        strPosition[i] = obtainPositionStr(featurePosition[i]);
    }
    AnnotationSet anns = annotations.get(instanceType);
    ArrayList annotationArray = (anns == null || anns.isEmpty()) ?
new ArrayList() : new ArrayList(anns); Collections.sort(annotationArray, new OffsetComparator()); if(numInstances != annotationArray.size()) { System.out.println("!!Warning: the number of instances " + new Integer(numInstances) + " in the document " + docId + " is not right!!!"); return; } AnnotationSet [] annsArray = new AnnotationSet[numTypes]; for(int j=0; j<numTypes; ++j) { annsArray[j] = (AnnotationSet)annotations .get(typesGate[j]); } String[] features = new String[numTypes]; for(int i = 0; i < numInstances; ++i) { // for class Annotation annToken; for(int j = 0; j < numTypes; j++) { // for each attribute in different positions, get the token in // the corresponding position if(featurePosition[j] == 0) annToken = (Annotation)annotationArray.get(i); else if((featurePosition[j] < 0 && i + featurePosition[j] >= 0) || (featurePosition[j] > 0 && i + featurePosition[j] < numInstances)) annToken = (Annotation)annotationArray.get(i + featurePosition[j]); else continue; FeatureMap feat = annToken.getFeatures(); String arg1Value = feat.get(arg1s[j]).toString(); String arg2Value = feat.get(arg2s[j]).toString(); if(typesGate[j].equals(instanceType)) { if(arg1Value.equals(feat.get(arg1Inst)) && arg2Value.equals(feat.get(arg2Inst))) features[j] = feat.get(featuresGate[j]).toString();// types[i]; // //(String)annToken.getFeatures().get(attr.getFeature()); } else { // if not belongs to token features[j] = obtainAnnotationForTypeAndFeatureRel(arg1Value, arg2Value, annsArray[j], featuresGate[j], arg1s[j], arg2s[j]); } // put the name into the feature name if(features[j] != null) { features[j] = features[j].trim().replaceAll( ConstantParameters.ITEMSEPARATOR, ConstantParameters.ITEMSEPREPLACEMENT); features[j] = obtainFeatureName(namesGate[j], features[j]); } }// end of the loop on the types int numCounted = 0; if(featuresInLine[i] == null) featuresInLine[i] = new StringBuffer(); for(int j = 0; j < numTypes; ++j) if(features[j] instanceof String) { ++numCounted; if(featurePosition[j]!=0) this.featuresInLine[i].append(features[j] +strPosition[j]+ConstantParameters.ITEMSEPARATOR); else this.featuresInLine[i].append(features[j] + ConstantParameters.ITEMSEPARATOR); } else if(featurePosition[j]!=0) this.featuresInLine[i].append(ConstantParameters.NAMENONFEATURE +strPosition[j] + ConstantParameters.ITEMSEPARATOR); else this.featuresInLine[i].append(ConstantParameters.NAMENONFEATURE + ConstantParameters.ITEMSEPARATOR); featuresCounted[i] = numCounted; }// end of the loop on instances } /** Get the label for the relation learning. */ public void gatedoc2LabelsCompleteRel(AnnotationSet annotations, String instanceType, String arg1Inst, String arg2Inst, String classType, String classFeature, String arg1C, String arg2C) { AnnotationSet anns = annotations.get(instanceType); ArrayList annotationArray = (anns == null || anns.isEmpty()) ? 
      new ArrayList() : new ArrayList(anns);
    Collections.sort(annotationArray, new OffsetComparator());
    if(numInstances != annotationArray.size()) {
      System.out.println("!!Warning: the number of instances "
        + new Integer(numInstances) + " in the document " + docId
        + " is not right!!!");
      return;
    }
    // For each entity
    AnnotationSet annsEntity = annotations.get(classType);
    for(Object obj : annsEntity) {
      Annotation annEntity = (Annotation)obj;
      if(annEntity.getFeatures().get(classFeature) == null) continue;
      String featName = annEntity.getFeatures().get(classFeature).toString();
      featName = featName.trim();
      featName = featName.replaceAll(ConstantParameters.SUFFIXSTARTTOKEN,
        ConstantParameters.SUFFIXSTARTTOKEN + "_");
      // Get the values of the entity args
      String arg1CV = annEntity.getFeatures().get(arg1C).toString();
      String arg2CV = annEntity.getFeatures().get(arg2C).toString();
      boolean isStart = true;
      for(int i = 0; i < numInstances; ++i) {
        Annotation annToken = (Annotation)annotationArray.get(i);
        FeatureMap feats = annToken.getFeatures();
        if(arg1CV.equals(feats.get(arg1Inst))
          && arg2CV.equals(feats.get(arg2Inst))) {
          String featName0 = featName;
          if(isStart) {
            featName0 += ConstantParameters.SUFFIXSTARTTOKEN;
            isStart = false;
          }
          if(featName0.length() > 0) {
            if(this.classNames[i] instanceof String)
              this.classNames[i] += ConstantParameters.ITEMSEPARATOR
                + featName0;
            else this.classNames[i] = featName0;
          }
        }
      }
    }
  }

  /** Write the NLP data into a file. */
  public void writeNLPFeaturesToFile(BufferedWriter out, String docId,
    int docIndex, int[] featurePosition) {
    if(LogService.minVerbosityLevel > 1)
      System.out.println("number=" + new Integer(numInstances));
    try {
      if(docIndex == 0) {
        StringBuffer sline = new StringBuffer("Class(es)");
        String[] featNs = this.featuresName.toString().split(
          ConstantParameters.ITEMSEPARATOR);
        for(int i = 0; i < featNs.length; ++i)
          if(featurePosition.length > i)
            sline.append(ConstantParameters.ITEMSEPARATOR + featNs[i] + "("
              + featurePosition[i] + ")");
          else sline.append(ConstantParameters.ITEMSEPARATOR + featNs[i]);
        out.write(sline.toString());
        out.newLine();
      }
      out.write(new Integer(docIndex) + ConstantParameters.ITEMSEPARATOR
        + docId + ConstantParameters.ITEMSEPARATOR + new Integer(numInstances));
      out.newLine();
      for(int i = 0; i < numInstances; ++i) {
        if(classNames[i] instanceof String) {
          int num = classNames[i].split(ConstantParameters.ITEMSEPARATOR).length;
          out.write(num + ConstantParameters.ITEMSEPARATOR + classNames[i]
            + ConstantParameters.ITEMSEPARATOR
            + this.featuresInLine[i].toString().trim());
        } else out.write("0" + ConstantParameters.ITEMSEPARATOR
          + this.featuresInLine[i].toString().trim());
        out.newLine();
      }
    } catch(IOException e) {
      System.out.println("Error occurred in writing the NLP data to a file!");
    }
  }

  /**
   * Read the NLP data of one document from the NLP feature file.
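   * <p>
   * The record layout expected here mirrors what
   * {@link #writeNLPFeaturesToFile} produces, with
   * {@code ConstantParameters.ITEMSEPARATOR} separating the items:
   * <pre>
   * docIndex  docId  numInstances
   * numLabels  label_1 ... label_numLabels  feature_1  feature_2 ...
   * ...        (one such line per instance; numLabels is 0 if unlabelled)
   * </pre>
   * The header line written before the first document (starting with
   * "Class(es)") is not consumed by this method, so the caller is expected to
   * position the reader past it.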
   */
  public void readNLPFeaturesFromFile(BufferedReader in) {
    try {
      String[] lineItems = in.readLine().split(ConstantParameters.ITEMSEPARATOR);
      numInstances = Integer.parseInt(lineItems[2]);
      docId = lineItems[1];
      featuresInLine = new StringBuffer[numInstances];
      classNames = new String[numInstances];
      int num;
      for(int i = 0; i < numInstances; ++i) {
        String[] lineItems1 = in.readLine().split(
          ConstantParameters.ITEMSEPARATOR);
        num = Integer.parseInt(lineItems1[0]);
        if(num > 0) {
          StringBuffer classNs = new StringBuffer();
          for(int j = 1; j < num; ++j)
            classNs.append(lineItems1[j] + ConstantParameters.ITEMSEPARATOR);
          classNs.append(lineItems1[num]);
          classNames[i] = classNs.toString();
        }
        featuresInLine[i] = new StringBuffer();
        if(num + 1 < lineItems1.length)
          featuresInLine[i].append(lineItems1[num + 1]);
        for(int j = num + 2; j < lineItems1.length; ++j)
          featuresInLine[i].append(ConstantParameters.ITEMSEPARATOR
            + lineItems1[j]);
      }
    } catch(IOException e) {
      System.out
        .println("**Error occurred in reading the NLP data from file for converting to FVs!");
    }
  }

  public void setDocId(String docId) {
    this.docId = docId;
  }

  public String getDocId() {
    return this.docId;
  }

  /** Put the type and feature together. */
  static String obtainFeatureName(String type, String feat) {
    return ConstantParameters.ITEMSEPREPLACEMENT + type
      + ConstantParameters.ITEMSEPREPLACEMENT + feat;
  }
}
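/*
 * A minimal round-trip sketch for the two file methods above (an illustration
 * only, exception handling omitted; it assumes the hypothetical variables
 * nlpDoc, dsd, docIndex and featureFile have been set up by the caller):
 *
 *   BufferedWriter out = new BufferedWriter(new FileWriter(featureFile));
 *   nlpDoc.writeNLPFeaturesToFile(out, nlpDoc.getDocId(), docIndex,
 *     dsd.arrs.featurePosition);
 *   out.close();
 *
 *   BufferedReader in = new BufferedReader(new FileReader(featureFile));
 *   if(docIndex == 0) in.readLine(); // skip the "Class(es)" header line
 *   NLPFeaturesOfDoc nlpDocCopy = new NLPFeaturesOfDoc();
 *   nlpDocCopy.readNLPFeaturesFromFile(in);
 *   in.close();
 */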