/* * DocFeatureVectors.java * * Yaoyong Li 22/03/2007 * * $Id: DocFeatureVectors.java, v 1.0 2007-03-22 12:58:16 +0000 yaoyong $ */ package gate.learning; import gate.util.GateException; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Hashtable; import java.util.List; import java.util.regex.Pattern; /** * Convert the NLP features into a sparse feature vector for each instance of * one document. */ public class DocFeatureVectors { /** Document ID. */ public String docId = null; /** Number of instance in the document. */ public int numInstances; /** * Array containing all sparse feature vectors of instances in the documents. */ public SparseFeatureVector[] fvs; /** Default value of one component of feature vector. */ final static float DEFAULTVALUE = 1.0f; /** Constructor in trival case. */ public DocFeatureVectors() { numInstances = 0; } /** * The main method for obtaining the sparse feature vector and label(s) from * the NLP features of each instance in the documents. */ public void obtainFVsFromNLPFeatures(NLPFeaturesOfDoc nlpDoc, NLPFeaturesList featList, int[] featurePosition, int maxNegPosition, int numDocs, float ngramWeight, int valueType) { numInstances = nlpDoc.numInstances; docId = new String(nlpDoc.getDocId()); fvs = new SparseFeatureVector[numInstances]; // For each istance for(int i = 0; i < numInstances; ++i) { Hashtable indexValues = new Hashtable(); int n = 0; String[] feat = nlpDoc.featuresInLine[i].toString().split( ConstantParameters.ITEMSEPARATOR); // Some variables for normalising the feature values of the same ngram // boolean sameNgram=true; int prevPosition = -99999; float[] tempVals = new float[9999]; long[] tempInds = new long[9999]; int tempSize = 0; int positionCurr = 0; for(int j = 0; j < feat.length; ++j) { // First get the position information for the current NLP feature positionCurr = 0; if(feat[j] != null && Pattern.matches((".+\\[[-0-9]+\\]$"), feat[j])) { int ind = feat[j].lastIndexOf('['); String positionStr = feat[j].substring(ind + 1, feat[j].length() - 1); positionCurr = Integer.parseInt(positionStr); feat[j] = feat[j].substring(0, ind); } if(prevPosition != positionCurr && tempSize > 0) { double sum = 0.0; for(int ii = 0; ii < tempSize; ++ii) sum += tempVals[ii] * tempVals[ii]; sum = Math.sqrt(sum); for(int ii = 0; ii < tempSize; ++ii) { tempVals[ii] /= sum; indexValues.put(new Long(tempInds[ii]), new Float(tempVals[ii])); } tempSize = 0; } String featCur = feat[j]; String featVal = null; int kk = -3; if(featCur.contains(NLPFeaturesList.SYMBOLNGARM)) { kk = feat[j].lastIndexOf(NLPFeaturesList.SYMBOLNGARM); featCur = feat[j].substring(0, kk); featVal = feat[j].substring(kk + 2); } if(featCur.length() > 0) { // if there is any feature if(featList.featuresList.containsKey(featCur)) { if(featCur.contains(NLPFeaturesList.SYMBOLNGARM)) { int shiftNum = 0; if(positionCurr > 0) shiftNum = maxNegPosition + positionCurr; else shiftNum = -positionCurr; long featInd = Long.parseLong(featList.featuresList.get(featCur) .toString()) + shiftNum * ConstantParameters.MAXIMUMFEATURES; double val = 0.0; switch(valueType){ case 1: // for only the presence of Ngram in the sentence val = 1.0; break; case 2: // for tf representation val = Long.parseLong(featVal); break; case 3: // for tf*idf representation val = (Long.parseLong(featVal) + 1) * Math.log((double)numDocs / (Long.parseLong(featList.idfFeatures.get(featCur) .toString()))); break; default: try { throw new GateException( "The value type for ngram is not defined!"); } catch(GateException e) { e.printStackTrace(); } } // indexValues.put(featInd, new Float(val)); tempInds[tempSize] = featInd; tempVals[tempSize] = (float)val; ++tempSize; } else { if(positionCurr == 0) indexValues.put(featList.featuresList.get(feat[j]), "1"); else if(positionCurr < 0) indexValues.put(new Long((Long.parseLong(featList.featuresList .get(feat[j]).toString()) - positionCurr * ConstantParameters.MAXIMUMFEATURES)), new Float(-1.0 / (double)positionCurr)); else indexValues.put( new Long((Long.parseLong(featList.featuresList.get(feat[j]) .toString()) + (positionCurr + maxNegPosition) * ConstantParameters.MAXIMUMFEATURES)), new Float( 1.0 / (double)positionCurr)); } ++n; } } prevPosition = positionCurr; } // end of the loop on the features of one instances // For the last ngram features if(tempSize > 0) { if(valueType == 3) { double sum = 0.0; for(int ii = 0; ii < tempSize; ++ii) sum += tempVals[ii] * tempVals[ii]; sum = Math.sqrt(sum); for(int ii = 0; ii < tempSize; ++ii) { tempVals[ii] /= sum; tempVals[ii] *= ngramWeight; indexValues.put(new Long(tempInds[ii]), new Float(tempVals[ii])); } } else { for(int ii = 0; ii < tempSize; ++ii) { indexValues.put(new Long(tempInds[ii]), new Integer( (int)tempVals[ii])); } } tempSize = 0; } // if(LogService.minVerbosityLevel > 1) // if(n != nlpDoc.featuresCounted[i]) { // System.out.println("Error: the number of features (" + n // + ") is not the same as the number recorded (" // + nlpDoc.featuresCounted[i] + ")in document " + docId); // } // sort the indexes in ascending order List indexes = new ArrayList(indexValues.keySet()); Collections.sort(indexes, new LongCompactor()); // Iterator iterator = indexes.iterator(); // n = 0; // while(iterator.hasNext()) { // Object key = iterator.next(); // fvs[i].indexes[n] = ((Long)key).longValue(); fvs[i] = new SparseFeatureVector(indexes.size()); // for(int j=0; j<n; ++j) { for(int j = 0; j < indexes.size(); ++j) { // System.out.println(new Integer(j) +" index=*"+ // indexes.get(j)+"*"); fvs[i].nodes[j].index = Integer.parseInt(indexes.get(j).toString()); // for the constant value 1 // fvs[i].values[j] = DEFAULTVALUE; // for the tf or tf*idf value fvs[i].nodes[j].value = Double.parseDouble(indexValues.get(indexes.get(j)) .toString()); } } // end of the loop on the instances } /** A static class for comparing two long numbers. */ public static class LongCompactor implements java.util.Comparator { public int compare(Object l1, Object l2) { // return (new Long((new Long(l1.toString()).longValue()- new // Long(l2.toString()).longValue()))).intValue(); return (int)(Long.parseLong(l1.toString()) - Long .parseLong(l2.toString())); } } /** Read the feature vectors of a document from the feature vector file. */ public void readDocFVFromFile(BufferedReader dataFile, int num, LabelsOfFeatureVectorDoc labelsDoc) { numInstances = num; fvs = new SparseFeatureVector[numInstances]; labelsDoc.multiLabels = new LabelsOfFV[numInstances]; try { String line; for(int i = 0; i < num; ++i) { line = dataFile.readLine(); // System.out.println("i="+i+"line="+line); String[] items = line.split(ConstantParameters.ITEMSEPARATOR); // get the label from the line int iEndLabel; // get the multilabel directly iEndLabel = obtainMultiLabels(items, labelsDoc.multiLabels, i); // get the feature vector int len = items.length - iEndLabel; if(len == 0) { // If there is no feature vector, creat one for it fvs[i] = new SparseFeatureVector(1); fvs[i].nodes[0].index = 1; fvs[i].nodes[0].value = 0; } else { fvs[i] = new SparseFeatureVector(len); obtainFVs(items, iEndLabel, len, fvs[i]); } } } catch(IOException e) { e.printStackTrace(); } return; } /** Get the multi label(s) from one line of feature vector. */ private int obtainMultiLabels(String[] items, LabelsOfFV[] multiLabels, int i) { int num; int kk = 1; num = Integer.valueOf(items[kk++]); multiLabels[i] = new LabelsOfFV(num); if(num > 0) { multiLabels[i].labels = new int[num]; for(int j = 0; j < num; ++j) multiLabels[i].labels[j] = Integer.valueOf(items[kk++]); } return kk; } /** Get the feature vector in parse format from a String arrays. */ private void obtainFVs(String[] items, int iEndLabel, int len, SparseFeatureVector fv) { String[] indexValue; for(int i = 0; i < len; ++i) { indexValue = items[i + iEndLabel] .split(ConstantParameters.INDEXVALUESEPARATOR); if(indexValue.length <= 1) { System.out.println("i=" + i + " item=" + items[i + iEndLabel]); } fv.nodes[i].index = (new Integer(indexValue[0])).intValue(); fv.nodes[i].value = (new Float(indexValue[1])).floatValue(); } return; } /** Get number of instances in the document. */ public int getNumInstances() { return numInstances; } /** Get the fv array. */ public SparseFeatureVector[] getFvs() { return fvs; } /** Delete the fv array. */ public void deleteFvs() { for(int i=0; i<fvs.length; ++i) fvs[i] = null; //fvs= null; } /** Set the DocID. */ public void setDocID(String docI) { this.docId = docI; } /** Expand the feature vector to including the context tokens. */ public void expandFV(int winSizeLeft, int winSizeRight) { SparseFeatureVector[] fvsExpand = new SparseFeatureVector[fvs.length]; for(int i = 0; i < fvs.length; ++i) { int lenT0 = fvs[i].len; for(int j = -1; j >= -winSizeLeft; --j) { if(j + i >= 0) lenT0 += fvs[j + i].len; } for(int j = 1; j <= winSizeRight; ++j) if(j + i < fvs.length) lenT0 += fvs[j + i].len; fvsExpand[i] = new SparseFeatureVector(lenT0); // System.out.println("lent0="+lenT0); for(int j1 = 0; j1 < fvs[i].len; ++j1) { fvsExpand[i].nodes[j1].index = fvs[i].nodes[j1].index; fvsExpand[i].nodes[j1].value = fvs[i].nodes[j1].value; } int lenTotal = fvs[i].len; for(int j = -1; j >= -winSizeLeft; --j) { int kk = j + i; if(kk >= 0) { int gapLen = -j * (int)ConstantParameters.MAXIMUMFEATURES; for(int j1 = 0; j1 < fvs[kk].len; ++j1) { if(j1 + lenTotal >= lenT0) System.out.println("i=" + i + ", j=" + j + ",j1=" + j1 + ", newlen=" + lenTotal); fvsExpand[i].nodes[j1 + lenTotal].index = fvs[kk].nodes[j1].index + gapLen; fvsExpand[i].nodes[j1 + lenTotal].value = fvs[kk].nodes[j1].value / (-j); } lenTotal += fvs[kk].len; } } for(int j = 1; j <= winSizeRight; ++j) { int kk = j + i; if(kk < fvs.length) { int gapLen = (j + winSizeLeft) * (int)ConstantParameters.MAXIMUMFEATURES; for(int j1 = 0; j1 < fvs[kk].len; ++j1) { fvsExpand[i].nodes[j1 + lenTotal].index = fvs[kk].nodes[j1].index + gapLen; fvsExpand[i].nodes[j1 + lenTotal].value = fvs[kk].nodes[j1].value / j; } lenTotal += fvs[kk].len; } } }// end of the loop for each fv fvs = fvsExpand; } /** Write the FVs of one document into file. */ public void addDocFVsToFile(int index, BufferedWriter out, int[] labels) { try { out.write(new Integer(index) + ConstantParameters.ITEMSEPARATOR + new Integer(numInstances) + ConstantParameters.ITEMSEPARATOR + docId); out.newLine(); for(int i = 0; i < numInstances; ++i) { StringBuffer line = new StringBuffer(); line.append(new Integer(i + 1) + ConstantParameters.ITEMSEPARATOR + new Integer(labels[i])); for(int j = 0; j < fvs[i].len; ++j) line.append(ConstantParameters.ITEMSEPARATOR + fvs[i].nodes[j].index + ConstantParameters.INDEXVALUESEPARATOR + fvs[i].nodes[j].value); out.write(line.toString()); out.newLine(); } } catch(IOException e) { } } /** Write the FVs with labels of one document into file. */ public void addDocFVsMultiLabelToFile(int index, BufferedWriter out, LabelsOfFV[] multiLabels) { try { out.write(new Integer(index) + ConstantParameters.ITEMSEPARATOR + new Integer(numInstances) + ConstantParameters.ITEMSEPARATOR + docId); out.newLine(); for(int i = 0; i < numInstances; ++i) { StringBuffer line = new StringBuffer(); line.append(new Integer(i + 1) + ConstantParameters.ITEMSEPARATOR + multiLabels[i].num); for(int j = 0; j < multiLabels[i].num; ++j) line.append(ConstantParameters.ITEMSEPARATOR + multiLabels[i].labels[j]); for(int j = 0; j < fvs[i].len; ++j) line.append(ConstantParameters.ITEMSEPARATOR + fvs[i].nodes[j].index + ConstantParameters.INDEXVALUESEPARATOR + fvs[i].nodes[j].value); out.write(line.toString()); out.newLine(); } } catch(IOException e) { } } }