/*
 *  NLPFeaturesList.java
 *
 *  Yaoyong Li 22/03/2007
 *
 *  $Id: NLPFeaturesList.java, v 1.0 2007-03-22 12:58:16 +0000 yaoyong $
 */
package gate.learning;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

/**
 * NLP feature list. Read it from a file, update it using the new documents,
 * and write it back into the file.
 * <p>
 * The on-disk format is a header line {@code totalNumDocs=<n>} followed by one
 * line per feature: {@code <feature> <id> <frequency>}, separated by single
 * spaces.
 */
public class NLPFeaturesList {
  /**
   * The features with their ids. Values are Strings when loaded from a file
   * and Longs when added by {@link #addFeaturesFromDoc(NLPFeaturesOfDoc)}.
   * Individual Hashtable calls are synchronized, but the compound updates made
   * by this class are not atomic.
   */
  public Hashtable featuresList = null;

  /**
   * Document frequency of each term, useful for document or passage
   * classification. Keyed like {@link #featuresList}.
   */
  public Hashtable idfFeatures = null;

  /** Total number of documents used for forming the list. */
  int totalNumDocs;

  /** The unique symbol used for the N-gram feature. */
  public final static String SYMBOLNGARM = "<>";

  /**
   * Matches a feature that ends with a bracketed number such as
   * {@code name[-1]}. Compiled once rather than for every feature.
   */
  private static final Pattern POSITION_SUFFIX = Pattern
    .compile(".+\\[[-0-9]+\\]$");

  /** Constructor, creates the two empty hashtables. */
  public NLPFeaturesList() {
    featuresList = new Hashtable();
    idfFeatures = new Hashtable();
    totalNumDocs = 0;
  }

  /**
   * Loads the list from a file, adding its entries to the current tables.
   * If the file does not exist nothing is loaded. I/O errors are reported on
   * standard error and leave whatever was read so far in place.
   *
   * @param parentDir directory containing the file
   * @param filename name of the feature list file
   * @param tcode name of the character encoding used to read the file
   * @throws NumberFormatException if the header line does not end in an
   *           integer
   */
  public void loadFromFile(File parentDir, String filename, String tcode) {
    File fileFeaturesList = new File(parentDir, filename);
    if(!fileFeaturesList.exists()) {
      if(LogService.minVerbosityLevel > 0)
        System.out.println("No feature list file in initialisation phrase.");
      return;
    }
    FileInputStream stream = null;
    BufferedReader in = null;
    try {
      stream = new FileInputStream(fileFeaturesList);
      in = new BufferedReader(new InputStreamReader(stream, tcode));
      // The first line carries the document count after the last '='.
      String line = in.readLine();
      if(line != null)
        totalNumDocs = Integer.parseInt(line
          .substring(line.lastIndexOf("=") + 1));
      while((line = in.readLine()) != null) {
        String[] st = line.split(" ");
        if(st.length < 3) {
          // A blank or truncated line cannot supply both id and frequency.
          if(line.trim().length() > 0)
            System.err.println("Skipping malformed line in feature list file "
              + fileFeaturesList + ": " + line);
          continue;
        }
        featuresList.put(st[0], st[1]);
        idfFeatures.put(st[0], st[2]);
      }
    } catch(IOException e) {
      System.err.println("Cannot read the feature list file "
        + fileFeaturesList + ": " + e);
    } finally {
      // If wrapping failed (e.g. unsupported encoding) only the raw stream
      // exists, so close that instead.
      closeQuietly(in != null ? (Closeable)in : (Closeable)stream);
    }
  }

  /**
   * Writes the list back into the file, with updated information. Features
   * are written in sorted key order. I/O errors are reported on standard
   * error.
   *
   * @param parentDir directory containing the file
   * @param filename name of the feature list file
   * @param tcode name of the character encoding used to write the file
   */
  public void writeListIntoFile(File parentDir, String filename, String tcode) {
    File fileFeaturesList = new File(parentDir, filename);
    if(LogService.minVerbosityLevel > 1)
      System.out.println("Lengh of List = " + featuresList.size());
    FileOutputStream stream = null;
    PrintWriter out = null;
    try {
      stream = new FileOutputStream(fileFeaturesList);
      out = new PrintWriter(new OutputStreamWriter(stream, tcode));
      // for the total number of docs
      out.println("totalNumDocs=" + totalNumDocs);
      List keys = new ArrayList(featuresList.keySet());
      Collections.sort(keys);
      // write the features list into the output file
      for(Object key : keys) {
        out.println(key + " " + featuresList.get(key) + " "
          + idfFeatures.get(key));
      }
      // PrintWriter never throws on write failures; ask it explicitly.
      if(out.checkError())
        System.err.println("Error while writing the feature list file "
          + fileFeaturesList);
    } catch(IOException e) {
      System.err.println("Cannot write the feature list file "
        + fileFeaturesList + ": " + e);
    } finally {
      closeQuietly(out != null ? (Closeable)out : (Closeable)stream);
    }
  }

  /**
   * Updates the NLP features from new documents.
   * <p>
   * Each instance line of {@code fd} is split on
   * {@code ConstantParameters.ITEMSEPARATOR}. A trailing bracketed number and
   * everything from the last {@link #SYMBOLNGARM} onwards are removed from the
   * feature name. A new name gets the next id (ids start from 1, as in the
   * SVM-light format) and a frequency of 1; a known name has its frequency
   * incremented.
   * <p>
   * Once the list holds {@code ConstantParameters.MAXIMUMFEATURES} entries the
   * method prints a message and returns at the next countable feature, without
   * updating the total number of documents.
   *
   * @param fd the NLP features of the document to add
   */
  public void addFeaturesFromDoc(NLPFeaturesOfDoc fd) {
    long size = featuresList.size();
    for(int i = 0; i < fd.numInstances; ++i) {
      String[] features = fd.featuresInLine[i].toString().trim().split(
        ConstantParameters.ITEMSEPARATOR);
      for(int j = 0; j < features.length; ++j) {
        String feat = features[j];
        if(POSITION_SUFFIX.matcher(feat).matches())
          feat = feat.substring(0, feat.lastIndexOf('['));
        // if the feature is empty, don't count it as a feature at all.
        if(feat.equals("")) continue;
        if(feat.contains(SYMBOLNGARM))
          feat = feat.substring(0, feat.lastIndexOf(SYMBOLNGARM));
        if(feat.equals(ConstantParameters.NAMENONFEATURE)) continue;
        if(size >= ConstantParameters.MAXIMUMFEATURES) {
          System.out
            .println("There are more NLP features from the training docuemnts");
          System.out.println(" than the pre-defined maximal number"
            + ConstantParameters.MAXIMUMFEATURES);
          return;
        }
        if(!featuresList.containsKey(feat)) {
          ++size;
          // features is from 1 (not zero), in the SVM-light format
          featuresList.put(feat, Long.valueOf(size));
          idfFeatures.put(feat, Long.valueOf(1));
        } else {
          // The stored count may be a String (loaded) or a Long (added).
          long count = Long.parseLong(idfFeatures.get(feat).toString());
          idfFeatures.put(feat, Long.valueOf(count + 1));
        }
      }
    }// end of the loop on the instances
    // update the total number of docs
    totalNumDocs += fd.numInstances;
  }

  /**
   * Clears the feature list for another run in evaluation. The document count
   * is reset too, so that it stays consistent with the emptied tables.
   */
  public void clearAllData() {
    featuresList.clear();
    idfFeatures.clear();
    totalNumDocs = 0;
  }

  /**
   * Converts the NLP list into an N-gram language model and writes it into a
   * UTF-8 file. Only features whose key contains {@link #SYMBOLNGARM} are
   * written, one per line, as the n-gram text followed by its frequency
   * truncated to an integer. I/O errors are reported on standard error.
   *
   * @param parentDir directory containing the file
   * @param filename name of the language model file
   * @param nGram the N of the N-gram, used only in the header line
   */
  public void writeToLM(File parentDir, String filename, int nGram) {
    File ngramList = new File(parentDir, filename);
    if(LogService.minVerbosityLevel > 1)
      System.out.println("Lengh of List = " + featuresList.size());
    FileOutputStream stream = null;
    PrintWriter out = null;
    try {
      stream = new FileOutputStream(ngramList);
      out = new PrintWriter(new OutputStreamWriter(stream, "UTF-8"));
      out.println("## The following " + nGram + "-gram were obtained from "
        + totalNumDocs + " documents or examples");
      List keys = new ArrayList(featuresList.keySet());
      int numT = keys.size();
      float[] freqs = new float[numT];
      for(int i = 0; i < numT; ++i)
        freqs[i] = Float.parseFloat(idfFeatures.get(keys.get(i)).toString());
      int[] indexSort = new int[numT];
      // NOTE(review): presumably fills indexSort with the positions of freqs
      // in sorted order; the helper is defined elsewhere, confirm the order.
      LightWeightLearningApi.sortFloatAscIndex(freqs, indexSort, numT, numT);
      // write the n-gram features into the output file
      for(int i = 0; i < numT; ++i) {
        String str = keys.get(indexSort[i]).toString();
        if(str.contains(SYMBOLNGARM)) { // if it is a ngram feature
          // Keep the text between the first '_' (searched from index 1) and
          // the last n-gram symbol.
          str = str.substring(str.indexOf("_", 1) + 1, str
            .lastIndexOf(SYMBOLNGARM));
          out.println(str + " " + (int)freqs[indexSort[i]]);
        }
      }
      // PrintWriter never throws on write failures; ask it explicitly.
      if(out.checkError())
        System.err.println("Error while writing the language model file "
          + ngramList);
    } catch(IOException e) {
      System.err.println("Cannot write the language model file " + ngramList
        + ": " + e);
    } finally {
      closeQuietly(out != null ? (Closeable)out : (Closeable)stream);
    }
  }

  /** Closes a resource if it is non-null, reporting any failure to do so. */
  private static void closeQuietly(Closeable resource) {
    if(resource == null) return;
    try {
      resource.close();
    } catch(IOException e) {
      System.err.println("Failed to close a file: " + e);
    }
  }
}