GATE.ac.uk - releases/gate-5.1-beta2-build3402-ALL/plugins/Learning/src/gate/learning/DocFeatureVectors.java

/*
 *  DocFeatureVectors.java
 * 
 *  Yaoyong Li 22/03/2007
 *
 *  $Id: DocFeatureVectors.java, v 1.0 2007-03-22 12:58:16 +0000 yaoyong $
 */
package gate.learning;

import gate.util.GateException;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Hashtable;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Convert the NLP features into a sparse feature vector for each instance of
 * one document.
 */
public class DocFeatureVectors {
  /** Document ID. */
  public String docId = null;
  /** Number of instance in the document. */
  public int numInstances;
  /**
   * Array containing all sparse feature vectors of instances in the documents.
   */
  public SparseFeatureVector[] fvs;
  /** Default value of one component of feature vector. */
  final static float DEFAULTVALUE = 1.0f;

  /** Constructor in trival case. */
  public DocFeatureVectors() {
    numInstances = 0;
  }

  /**
   * The main method for obtaining the sparse feature vector and label(s) from
   * the NLP features of each instance in the documents.
   */
  public void obtainFVsFromNLPFeatures(NLPFeaturesOfDoc nlpDoc,
    NLPFeaturesList featList, int[] featurePosition, int maxNegPosition,
    int numDocs, float ngramWeight, int valueType) {
    numInstances = nlpDoc.numInstances;
    docId = new String(nlpDoc.getDocId());
    fvs = new SparseFeatureVector[numInstances];
    // For each istance
    for(int i = 0; i < numInstances; ++i) {
      Hashtable indexValues = new Hashtable();
      int n = 0;
      String[] feat = nlpDoc.featuresInLine[i].toString().split(
        ConstantParameters.ITEMSEPARATOR);
      // Some variables for normalising the feature values of the same ngram
      // boolean sameNgram=true;
      int prevPosition = -99999;
      float[] tempVals = new float[9999];
      long[] tempInds = new long[9999];
      int tempSize = 0;
      int positionCurr = 0;
      for(int j = 0; j < feat.length; ++j) {
        // First get the position information for the current NLP feature
        positionCurr = 0;
        if(feat[j] != null && Pattern.matches((".+\\[[-0-9]+\\]$"), feat[j])) {
          int ind = feat[j].lastIndexOf('[');
          String positionStr = feat[j].substring(ind + 1, feat[j].length() - 1);
          positionCurr = Integer.parseInt(positionStr);
          feat[j] = feat[j].substring(0, ind);
        }
        if(prevPosition != positionCurr && tempSize > 0) {
          double sum = 0.0;
          for(int ii = 0; ii < tempSize; ++ii)
            sum += tempVals[ii] * tempVals[ii];
          sum = Math.sqrt(sum);
          for(int ii = 0; ii < tempSize; ++ii) {
            tempVals[ii] /= sum;
            indexValues.put(new Long(tempInds[ii]), new Float(tempVals[ii]));
          }
          tempSize = 0;
        }
        String featCur = feat[j];
        String featVal = null;
        int kk = -3;
        if(featCur.contains(NLPFeaturesList.SYMBOLNGARM)) {
          kk = feat[j].lastIndexOf(NLPFeaturesList.SYMBOLNGARM);
          featCur = feat[j].substring(0, kk);
          featVal = feat[j].substring(kk + 2);
        }
        if(featCur.length() > 0) { // if there is any feature
          if(featList.featuresList.containsKey(featCur)) {
            if(featCur.contains(NLPFeaturesList.SYMBOLNGARM)) {
              int shiftNum = 0;
              if(positionCurr > 0)
                shiftNum = maxNegPosition + positionCurr;
              else shiftNum = -positionCurr;
              long featInd = Long.parseLong(featList.featuresList.get(featCur)
                .toString())
                + shiftNum * ConstantParameters.MAXIMUMFEATURES;
              double val = 0.0;
              switch(valueType){
                case 1: // for only the presence of Ngram in the sentence
                  val = 1.0;
                  break;
                case 2: // for tf representation
                  val = Long.parseLong(featVal);
                  break;
                case 3: // for tf*idf representation
                  val = (Long.parseLong(featVal) + 1)
                    * Math.log((double)numDocs
                      / (Long.parseLong(featList.idfFeatures.get(featCur)
                        .toString())));
                  break;
                default:
                  try {
                    throw new GateException(
                      "The value type for ngram is not defined!");
                  } catch(GateException e) {
                    e.printStackTrace();
                  }
              }
              // indexValues.put(featInd, new Float(val));
              tempInds[tempSize] = featInd;
              tempVals[tempSize] = (float)val;
              ++tempSize;
            } else {
              if(positionCurr == 0)
                indexValues.put(featList.featuresList.get(feat[j]), "1");
              else if(positionCurr < 0)
                indexValues.put(new Long((Long.parseLong(featList.featuresList
                  .get(feat[j]).toString()) - positionCurr
                  * ConstantParameters.MAXIMUMFEATURES)), new Float(-1.0
                  / (double)positionCurr));
              else indexValues.put(
                new Long((Long.parseLong(featList.featuresList.get(feat[j])
                  .toString()) + (positionCurr + maxNegPosition)
                  * ConstantParameters.MAXIMUMFEATURES)), new Float(
                  1.0 / (double)positionCurr));
            }
            ++n;
          }
        }
        prevPosition = positionCurr;
      } // end of the loop on the features of one instances
      // For the last ngram features
      if(tempSize > 0) {
        if(valueType == 3) {
          double sum = 0.0;
          for(int ii = 0; ii < tempSize; ++ii)
            sum += tempVals[ii] * tempVals[ii];
          sum = Math.sqrt(sum);
          for(int ii = 0; ii < tempSize; ++ii) {
            tempVals[ii] /= sum;
            tempVals[ii] *= ngramWeight;
            indexValues.put(new Long(tempInds[ii]), new Float(tempVals[ii]));
          }
        } else {
          for(int ii = 0; ii < tempSize; ++ii) {
            indexValues.put(new Long(tempInds[ii]), new Integer(
              (int)tempVals[ii]));
          }
        }
        tempSize = 0;
      }
      // if(LogService.minVerbosityLevel > 1)
      // if(n != nlpDoc.featuresCounted[i]) {
      // System.out.println("Error: the number of features (" + n
      // + ") is not the same as the number recorded ("
      // + nlpDoc.featuresCounted[i] + ")in document " + docId);
      // }
      // sort the indexes in ascending order
      List indexes = new ArrayList(indexValues.keySet());
      Collections.sort(indexes, new LongCompactor());
      // Iterator iterator = indexes.iterator();
      // n = 0;
      // while(iterator.hasNext()) {
      // Object key = iterator.next();
      // fvs[i].indexes[n] = ((Long)key).longValue();
      fvs[i] = new SparseFeatureVector(indexes.size());
      // for(int j=0; j<n; ++j) {
      for(int j = 0; j < indexes.size(); ++j) {
        // System.out.println(new Integer(j) +" index=*"+
        // indexes.get(j)+"*");
        fvs[i].nodes[j].index = Integer.parseInt(indexes.get(j).toString());
        // for the constant value 1
        // fvs[i].values[j] = DEFAULTVALUE;
        // for the tf or tf*idf value
        fvs[i].nodes[j].value = Double.parseDouble(indexValues.get(indexes.get(j))
          .toString());
      }
    } // end of the loop on the instances
  }

  /** A static class for comparing two long numbers. */
  public static class LongCompactor implements java.util.Comparator {
    public int compare(Object l1, Object l2) {
      // return (new Long((new Long(l1.toString()).longValue()- new
      // Long(l2.toString()).longValue()))).intValue();
      return (int)(Long.parseLong(l1.toString()) - Long
        .parseLong(l2.toString()));
    }
  }

  /** Read the feature vectors of a document from the feature vector file. */
  public void readDocFVFromFile(BufferedReader dataFile, int num,
    LabelsOfFeatureVectorDoc labelsDoc) {
    numInstances = num;
    fvs = new SparseFeatureVector[numInstances];
    labelsDoc.multiLabels = new LabelsOfFV[numInstances];
    try {
      String line;
      for(int i = 0; i < num; ++i) {
        line = dataFile.readLine();
        // System.out.println("i="+i+"line="+line);
        String[] items = line.split(ConstantParameters.ITEMSEPARATOR);
        // get the label from the line
        int iEndLabel;
        // get the multilabel directly
        iEndLabel = obtainMultiLabels(items, labelsDoc.multiLabels, i);
        // get the feature vector
        int len = items.length - iEndLabel;
        if(len == 0) {
          // If there is no feature vector, creat one for it
          fvs[i] = new SparseFeatureVector(1);
          fvs[i].nodes[0].index = 1;
          fvs[i].nodes[0].value = 0;
        } else {
          fvs[i] = new SparseFeatureVector(len);
          obtainFVs(items, iEndLabel, len, fvs[i]);
        }
      }
    } catch(IOException e) {
      e.printStackTrace();
    }
    return;
  }

  /** Get the multi label(s) from one line of feature vector. */
  private int obtainMultiLabels(String[] items, LabelsOfFV[] multiLabels, int i) {
    int num;
    int kk = 1;
    num = Integer.valueOf(items[kk++]);
    multiLabels[i] = new LabelsOfFV(num);
    if(num > 0) {
      multiLabels[i].labels = new int[num];
      for(int j = 0; j < num; ++j)
        multiLabels[i].labels[j] = Integer.valueOf(items[kk++]);
    }
    return kk;
  }

  /** Get the feature vector in parse format from a String arrays. */
  private void obtainFVs(String[] items, int iEndLabel, int len,
    SparseFeatureVector fv) {
    String[] indexValue;
    for(int i = 0; i < len; ++i) {
      indexValue = items[i + iEndLabel]
        .split(ConstantParameters.INDEXVALUESEPARATOR);
      if(indexValue.length <= 1) {
        System.out.println("i=" + i + " item=" + items[i + iEndLabel]);
      }
      fv.nodes[i].index = (new Integer(indexValue[0])).intValue();
      fv.nodes[i].value = (new Float(indexValue[1])).floatValue();
    }
    return;
  }

  /** Get number of instances in the document. */
  public int getNumInstances() {
    return numInstances;
  }

  /** Get the fv array. */
  public SparseFeatureVector[] getFvs() {
    return fvs;
  }
  
  /** Delete the fv array. */
  public void deleteFvs() {
    for(int i=0; i<fvs.length; ++i)
      fvs[i] = null;
    //fvs= null;
  }

  /** Set the DocID. */
  public void setDocID(String docI) {
    this.docId = docI;
  }

  /** Expand the feature vector to including the context tokens. */
  public void expandFV(int winSizeLeft, int winSizeRight) {
    SparseFeatureVector[] fvsExpand = new SparseFeatureVector[fvs.length];
    for(int i = 0; i < fvs.length; ++i) {
      int lenT0 = fvs[i].len;
      for(int j = -1; j >= -winSizeLeft; --j) {
        if(j + i >= 0) lenT0 += fvs[j + i].len;
      }
      for(int j = 1; j <= winSizeRight; ++j)
        if(j + i < fvs.length) lenT0 += fvs[j + i].len;
      fvsExpand[i] = new SparseFeatureVector(lenT0);
      // System.out.println("lent0="+lenT0);
      for(int j1 = 0; j1 < fvs[i].len; ++j1) {
        fvsExpand[i].nodes[j1].index = fvs[i].nodes[j1].index;
        fvsExpand[i].nodes[j1].value = fvs[i].nodes[j1].value;
      }
      int lenTotal = fvs[i].len;
      for(int j = -1; j >= -winSizeLeft; --j) {
        int kk = j + i;
        if(kk >= 0) {
          int gapLen = -j * (int)ConstantParameters.MAXIMUMFEATURES;
          for(int j1 = 0; j1 < fvs[kk].len; ++j1) {
            if(j1 + lenTotal >= lenT0)
              System.out.println("i=" + i + ", j=" + j + ",j1=" + j1
                + ", newlen=" + lenTotal);
            fvsExpand[i].nodes[j1 + lenTotal].index = fvs[kk].nodes[j1].index + gapLen;
            fvsExpand[i].nodes[j1 + lenTotal].value = fvs[kk].nodes[j1].value / (-j);
          }
          lenTotal += fvs[kk].len;
        }
      }
      for(int j = 1; j <= winSizeRight; ++j) {
        int kk = j + i;
        if(kk < fvs.length) {
          int gapLen = (j + winSizeLeft)
            * (int)ConstantParameters.MAXIMUMFEATURES;
          for(int j1 = 0; j1 < fvs[kk].len; ++j1) {
            fvsExpand[i].nodes[j1 + lenTotal].index = fvs[kk].nodes[j1].index + gapLen;
            fvsExpand[i].nodes[j1 + lenTotal].value = fvs[kk].nodes[j1].value / j;
          }
          lenTotal += fvs[kk].len;
        }
      }
    }// end of the loop for each fv
    fvs = fvsExpand;
  }
  
  /** Write the FVs of one document into file. */
  public void addDocFVsToFile(int index, BufferedWriter out, int[] labels) {
    try {
      out.write(new Integer(index) + ConstantParameters.ITEMSEPARATOR
        + new Integer(numInstances) + ConstantParameters.ITEMSEPARATOR
        + docId);
      out.newLine();
      for(int i = 0; i < numInstances; ++i) {
        StringBuffer line = new StringBuffer();
        line.append(new Integer(i + 1) + ConstantParameters.ITEMSEPARATOR
          + new Integer(labels[i]));
        for(int j = 0; j < fvs[i].len; ++j)
          line.append(ConstantParameters.ITEMSEPARATOR
            + fvs[i].nodes[j].index + ConstantParameters.INDEXVALUESEPARATOR
            + fvs[i].nodes[j].value);
        out.write(line.toString());
        out.newLine();
      }
    } catch(IOException e) {
    }
  }

  /** Write the FVs with labels of one document into file. */
  public void addDocFVsMultiLabelToFile(int index, BufferedWriter out,
    LabelsOfFV[] multiLabels) {
    try {
      out.write(new Integer(index) + ConstantParameters.ITEMSEPARATOR
        + new Integer(numInstances) + ConstantParameters.ITEMSEPARATOR
        + docId);
      out.newLine();
      for(int i = 0; i < numInstances; ++i) {
        StringBuffer line = new StringBuffer();
        line.append(new Integer(i + 1) + ConstantParameters.ITEMSEPARATOR
          + multiLabels[i].num);
        for(int j = 0; j < multiLabels[i].num; ++j)
          line.append(ConstantParameters.ITEMSEPARATOR
            + multiLabels[i].labels[j]);
        for(int j = 0; j < fvs[i].len; ++j)
          line.append(ConstantParameters.ITEMSEPARATOR
            + fvs[i].nodes[j].index + ConstantParameters.INDEXVALUESEPARATOR
            + fvs[i].nodes[j].value);
        out.write(line.toString());
        out.newLine();
      }
    } catch(IOException e) {
    }
  }

}