GATE.ac.uk - releases/gate-5.1-beta2-build3402-ALL/plugins/Learning/src/gate/learning/NLPFeaturesList.java

/*
 *  NLPFeaturesList.java
 * 
 *  Yaoyong Li 22/03/2007
 *
 *  $Id: NLPFeaturesList.java, v 1.0 2007-03-22 12:58:16 +0000 yaoyong $
 */
package gate.learning;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

/**
 * List of NLP features, each associated with a numeric id and an occurrence
 * count. The list can be read from a file, updated from new documents, and
 * written back into the file.
 */
public class NLPFeaturesList {
  /**
   * The features with ids, can be accessed by multiple threads. Maps a
   * feature name to its id: ids assigned by
   * {@link #addFeaturesFromDoc(NLPFeaturesOfDoc)} are {@code Long}s starting
   * from 1, while entries read by
   * {@link #loadFromFile(File, String, String)} keep the {@code String} token
   * found in the file. The raw type is kept so that existing callers are
   * unaffected.
   */
  public Hashtable featuresList = null;
  /**
   * Document frequency of each term, useful for document or passage
   * classification. Maps a feature name to a count, stored as a {@code Long}
   * when updated in memory and as a {@code String} when read from a file.
   */
  public Hashtable idfFeatures = null;
  /** Total number of documents used for forming the list. */
  int totalNumDocs;
  /** The unique symbol used for the N-gram feature. */
  public final static String SYMBOLNGARM = "<>";
  /**
   * Matches a feature that ends with a bracketed (possibly negative) number,
   * e.g. {@code name[-1]}. Compiled once because it is applied to every
   * feature of every instance.
   */
  private static final Pattern BRACKETED_NUMBER_SUFFIX = Pattern
    .compile(".+\\[[-0-9]+\\]$");

  /** Constructor, creates the two (empty) hashtables. */
  public NLPFeaturesList() {
    featuresList = new Hashtable();
    idfFeatures = new Hashtable();
    totalNumDocs = 0;
  }

  /**
   * Loads the list from a file, adding its entries to the current tables.
   * The first line is expected to end with {@code =<number of documents>};
   * every following line holds three space-separated tokens: feature name,
   * feature id and count. Blank lines are skipped silently and malformed
   * lines are skipped with a warning. If the file does not exist nothing is
   * loaded.
   *
   * @param parentDir directory containing the file
   * @param filename name of the feature list file
   * @param tcode name of the character encoding used to read the file
   */
  public void loadFromFile(File parentDir, String filename, String tcode) {
    File fileFeaturesList = new File(parentDir, filename);
    if(!fileFeaturesList.exists()) {
      if(LogService.minVerbosityLevel > 0)
        System.out.println("No feature list file in initialisation phrase.");
      return;
    }
    // The raw stream is opened on its own so that it can be closed in the
    // finally block even if creating the reader fails (e.g. unknown encoding).
    FileInputStream stream = null;
    try {
      stream = new FileInputStream(fileFeaturesList);
      BufferedReader in = new BufferedReader(new InputStreamReader(stream,
        tcode));
      String line = in.readLine();
      if(line != null) {
        String number = line.substring(line.lastIndexOf("=") + 1).trim();
        try {
          totalNumDocs = Integer.parseInt(number);
        } catch(NumberFormatException e) {
          System.err.println("Warning: cannot read the number of documents"
            + " from the first line of " + fileFeaturesList + ": " + line);
        }
      }
      while((line = in.readLine()) != null) {
        String[] st = line.split(" ");
        if(st.length < 3) {
          if(line.trim().length() > 0)
            System.err.println("Warning: skipping malformed line in "
              + fileFeaturesList + ": " + line);
          continue;
        }
        featuresList.put(st[0], st[1]);
        idfFeatures.put(st[0], st[2]);
      }
    } catch(IOException e) {
      System.err.println("Warning: cannot read the feature list file "
        + fileFeaturesList + ": " + e);
    } finally {
      closeQuietly(stream);
    }
  }

  /**
   * Writes the list back into the file, with updated information. The first
   * line is {@code totalNumDocs=<n>}; it is followed by one line per feature,
   * in sorted key order, holding the feature name, its id and its count
   * separated by single spaces.
   *
   * @param parentDir directory containing the file
   * @param filename name of the feature list file
   * @param tcode name of the character encoding used to write the file
   */
  public void writeListIntoFile(File parentDir, String filename, String tcode) {
    File fileFeaturesList = new File(parentDir, filename);
    if(LogService.minVerbosityLevel > 1)
      System.out.println("Lengh of List = " + featuresList.size());
    FileOutputStream stream = null;
    try {
      stream = new FileOutputStream(fileFeaturesList);
      PrintWriter out = new PrintWriter(new OutputStreamWriter(stream, tcode));
      // for the total number of docs
      out.println("totalNumDocs=" + totalNumDocs);
      List keys = new ArrayList(featuresList.keySet());
      Collections.sort(keys);
      // write the features list into the output file
      for(Object key : keys) {
        out.println(key + " " + featuresList.get(key) + " "
          + idfFeatures.get(key));
      }
      // PrintWriter never throws on write errors; checkError() flushes and
      // reports whether any write failed.
      if(out.checkError())
        System.err.println("Warning: error while writing the feature list file "
          + fileFeaturesList);
      out.close();
    } catch(IOException e) {
      System.err.println("Warning: cannot write the feature list file "
        + fileFeaturesList + ": " + e);
    } finally {
      closeQuietly(stream);
    }
  }

  /**
   * Updates the NLP features from new documents. Every feature of every
   * instance in {@code fd} is normalised (a trailing bracketed number and the
   * part from the last n-gram symbol onwards are removed) and then either
   * gets a new id, if it is not yet in the list, or has its count increased.
   * <p>
   * When a new feature cannot be stored because the list already holds
   * {@code ConstantParameters.MAXIMUMFEATURES} entries, a message is printed
   * and the method returns immediately, without updating the total number of
   * documents. Features already in the list are still counted when the list
   * is full.
   *
   * @param fd the NLP features of the instances to add
   */
  public void addFeaturesFromDoc(NLPFeaturesOfDoc fd) {
    long size = featuresList.size();
    for(int i = 0; i < fd.numInstances; ++i) {
      String[] features = fd.featuresInLine[i].toString().trim().split(
        ConstantParameters.ITEMSEPARATOR);
      for(int j = 0; j < features.length; ++j) {
        String feat = features[j];
        if(BRACKETED_NUMBER_SUFFIX.matcher(feat).matches())
          feat = feat.substring(0, feat.lastIndexOf('['));
        // if the feature is empty, don't count it as a feature at all.
        if(feat.equals(""))
          continue;
        if(feat.contains(SYMBOLNGARM))
          feat = feat.substring(0, feat.lastIndexOf(SYMBOLNGARM));
        if(feat.equals(ConstantParameters.NAMENONFEATURE))
          continue;
        if(featuresList.containsKey(feat)) {
          // Known feature: increase its count. The stored value may be a
          // Long (set here) or a String (read from file), hence toString().
          Object count = idfFeatures.get(feat);
          long previous = count == null ? 0 : Long.parseLong(count.toString());
          idfFeatures.put(feat, Long.valueOf(previous + 1));
        } else if(size < ConstantParameters.MAXIMUMFEATURES) {
          ++size;
          // features is from 1 (not zero), in the SVM-light format
          featuresList.put(feat, Long.valueOf(size));
          idfFeatures.put(feat, Long.valueOf(1));
        } else {
          System.out
            .println("There are more NLP features from the training docuemnts");
          System.out.println(" than the pre-defined maximal number"
            + ConstantParameters.MAXIMUMFEATURES);
          return;
        }
      }
    }// end of the loop on the instances
    // update the total number of docs
    totalNumDocs += fd.numInstances;
  }

  /** Clear the feature list object for another run in evaluation. */
  public void clearAllData() {
    featuresList.clear();
    idfFeatures.clear();
  }

  /**
   * Converts the NLP list into an N-gram language model and writes it into a
   * UTF-8 file. After a header line, one line is written for each feature
   * whose name contains the n-gram symbol: the part of the name between the
   * first {@code _} found from index 1 and the last n-gram symbol, followed
   * by its count. Lines are written in the order computed by
   * {@code LightWeightLearningApi.sortFloatAscIndex} from the counts.
   *
   * @param parentDir directory containing the file
   * @param filename name of the language model file
   * @param nGram the N of the N-gram, only used in the header line
   */
  public void writeToLM(File parentDir, String filename, int nGram) {
    File ngramList = new File(parentDir, filename);
    if(LogService.minVerbosityLevel > 1)
      System.out.println("Lengh of List = " + featuresList.size());
    FileOutputStream stream = null;
    try {
      stream = new FileOutputStream(ngramList);
      PrintWriter out = new PrintWriter(new OutputStreamWriter(stream,
        "UTF-8"));
      out.println("## The following "+nGram+"-gram were obtained from " + totalNumDocs+ " documents or examples");
      List keys = new ArrayList(featuresList.keySet());
      int numT = keys.size();
      float[] freqs = new float[numT];
      for(int i = 0; i < numT; ++i)
        freqs[i] = Float.parseFloat(idfFeatures.get(keys.get(i)).toString());
      int[] indexSort = new int[numT];
      LightWeightLearningApi.sortFloatAscIndex(freqs, indexSort, numT, numT);
      // write the n-gram features into the output file
      for(int i = 0; i < numT; ++i) {
        String str = keys.get(indexSort[i]).toString();
        if(!str.contains(SYMBOLNGARM))
          continue; // not an n-gram feature
        int start = str.indexOf("_", 1) + 1;
        int end = str.lastIndexOf(SYMBOLNGARM);
        if(start > end)
          continue; // no n-gram text between the prefix and the symbol
        out.println(str.substring(start, end) + " "
          + (int)freqs[indexSort[i]]);
      }
      // PrintWriter never throws on write errors; checkError() flushes and
      // reports whether any write failed.
      if(out.checkError())
        System.err.println("Warning: error while writing the language model file "
          + ngramList);
      out.close();
    } catch(IOException e) {
      System.err.println("Warning: cannot write the language model file "
        + ngramList + ": " + e);
    } finally {
      closeQuietly(stream);
    }
  }

  /** Closes the resource if it is not null, ignoring any failure to close. */
  private static void closeQuietly(java.io.Closeable resource) {
    if(resource == null)
      return;
    try {
      resource.close();
    } catch(IOException ignored) {
      // nothing useful can be done if closing fails
    }
  }
}