Log in Help
Print
Home 〉 releases 〉 gate-5.1-beta2-build3402-ALL 〉 plugins 〉 Learning 〉 src 〉 gate 〉 learning 〉 learners 〉 DataForLearning.java
 
/*
 *  DataForLearning.java
 * 
 *  Yaoyong Li 22/03/2007
 *
 *  $Id: DataForLearning.java, v 1.0 2007-03-22 12:58:16 +0000 yaoyong $
 */
package gate.learning.learners;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.logging.Level;
import java.util.logging.Logger;

import gate.learning.ConstantParameters;
import gate.learning.DocFeatureVectors;
import gate.learning.LabelsOfFeatureVectorDoc;
import gate.learning.SparseFeatureVector;
import gate.learning.learners.svm.svm_node;
/**
 * Data used for learning, read from the feature vector file.
 * <p>
 * An instance is created for a fixed number of documents and then filled by
 * one of the <code>readingFVs...</code> methods, which populate
 * {@link #trainingFVinDoc}, {@link #labelsFVDoc}, the total number of
 * instances and the (padded) largest feature index.
 * <p>
 * I/O failures while loading are logged and do not propagate; after such a
 * failure the per-document arrays may be only partially filled.
 */
public class DataForLearning {
  /** Reports I/O problems that the loading methods do not propagate. */
  private static final Logger LOGGER = Logger.getLogger(DataForLearning.class
    .getName());
  /** Character encoding of the feature vector files. */
  private static final String ENCODING = "UTF-8";
  /** Lines starting with this prefix are skipped when looking for a header. */
  private static final String COMMENT_PREFIX = "#";
  /**
   * Slack added to the largest feature index seen, because feature indexes
   * are counted from 1 rather than 0.
   */
  private static final int FEATURE_INDEX_MARGIN = 5;

  /** Number of training (or test) documents. */
  private final int numTrainingDocs;
  /** Training feature vectors, array for each document. */
  public DocFeatureVectors[] trainingFVinDoc = null;
  /** Training feature vectors in svm_node format, for libSVM. */
  public svm_node[][] svmNodeFVs = null;
  /** Labels for each feature vector, array for each document. */
  public LabelsOfFeatureVectorDoc[] labelsFVDoc = null;
  /** All the unique labels in the dataset */
  String[] allUniqueLabels;
  /** Total number of training examples. */
  int numTraining = 0;
  /** Total number of NLP features in FVs. */
  int totalNumFeatures = 0;

  /** Trivial constructor: an instance holding zero documents. */
  public DataForLearning() {
    this(0);
  }

  /**
   * Constructor with the number of documents.
   *
   * @param num number of documents the reading methods will expect to find
   *          in the feature vector file
   */
  public DataForLearning(int num) {
    this.numTrainingDocs = num;
  }

  /**
   * Read the feature vectors from data file for training or application.
   * <p>
   * For every document the header line is parsed, the feature vectors and
   * labels are read, and the largest feature index is tracked. When
   * <code>isUsingFile</code> is true the vectors are additionally written to
   * <code>tempFVDataFile</code> as <code>index:value</code> pairs, one
   * instance per line, and then dropped from memory.
   *
   * @param trainingData the feature vector file to read (UTF-8)
   * @param isUsingFile whether to spill the feature vectors to
   *          <code>tempFVDataFile</code> instead of keeping them in memory
   * @param tempFVDataFile destination of the spilled vectors; only used when
   *          <code>isUsingFile</code> is true
   * @throws IllegalStateException if the file ends before the expected
   *           number of documents has been read
   */
  public void readingFVsFromFile(File trainingData, boolean isUsingFile,
    File tempFVDataFile) {
    // the arrays to store the training data
    trainingFVinDoc = new DocFeatureVectors[numTrainingDocs];
    labelsFVDoc = new LabelsOfFeatureVectorDoc[numTrainingDocs];
    BufferedWriter fvTempWr = null;
    BufferedReader in = null;
    try {
      // write the fv data into a file if isUsingFile is true
      if(isUsingFile) {
        fvTempWr = new BufferedWriter(new OutputStreamWriter(
          new FileOutputStream(tempFVDataFile), ENCODING));
      }
      totalNumFeatures = 0;
      in = openReader(trainingData);
      for(int i = 0; i < numTrainingDocs; ++i) {
        String[] items = readDocHeader(in, trainingData, i);
        int num = Integer.parseInt(items[1]);
        trainingFVinDoc[i] = new DocFeatureVectors();
        trainingFVinDoc[i].setDocID(items[2]);
        labelsFVDoc[i] = new LabelsOfFeatureVectorDoc();
        trainingFVinDoc[i].readDocFVFromFile(in, num, labelsFVDoc[i]);

        int docMax = maxFeatureIndex(trainingFVinDoc[i]);
        if(totalNumFeatures < docMax) {
          totalNumFeatures = docMax;
        }

        if(isUsingFile) {
          writeFVs(fvTempWr, trainingFVinDoc[i].getFvs());
          // trying to remove the fv data from memory
          trainingFVinDoc[i].deleteFvs();
        }
      }// end of loop for each document
      // Close the writer on the success path so that a failing flush is
      // reported through the catch below rather than only as a warning.
      if(fvTempWr != null) {
        fvTempWr.close();
      }
      numTraining = countInstances();
      totalNumFeatures += FEATURE_INDEX_MARGIN;
    } catch(IOException e) {
      LOGGER.log(Level.SEVERE, "I/O error while loading feature vectors from "
        + trainingData, e);
    } finally {
      closeQuietly(in, trainingData);
      closeQuietly(fvTempWr, tempFVDataFile);
    }
  }

  /**
   * Read the feature vectors from data file for training or application,
   * keeping all vectors in memory. Unlike
   * {@link #readingFVsFromFile(File, boolean, File)} this variant does not
   * record the document ID from the header line.
   *
   * @param trainingData the feature vector file to read (UTF-8)
   * @throws IllegalStateException if the file ends before the expected
   *           number of documents has been read
   */
  public void readingFVsMultiLabelFromFile(File trainingData) {
    // the arrays to store the training data
    trainingFVinDoc = new DocFeatureVectors[numTrainingDocs];
    labelsFVDoc = new LabelsOfFeatureVectorDoc[numTrainingDocs];
    BufferedReader in = null;
    try {
      in = openReader(trainingData);
      for(int i = 0; i < numTrainingDocs; ++i) {
        String[] items = readDocHeader(in, trainingData, i);
        int num = Integer.parseInt(items[1]);
        trainingFVinDoc[i] = new DocFeatureVectors();
        labelsFVDoc[i] = new LabelsOfFeatureVectorDoc();
        trainingFVinDoc[i].readDocFVFromFile(in, num, labelsFVDoc[i]);
      }
      numTraining = countInstances();
      // compute the total number of features
      totalNumFeatures = 0;
      for(int i = 0; i < numTrainingDocs; ++i) {
        int docMax = maxFeatureIndex(trainingFVinDoc[i]);
        if(totalNumFeatures < docMax) {
          totalNumFeatures = docMax;
        }
      }
      totalNumFeatures += FEATURE_INDEX_MARGIN;
    } catch(IOException e) {
      LOGGER.log(Level.SEVERE, "I/O error while loading feature vectors from "
        + trainingData, e);
    } finally {
      closeQuietly(in, trainingData);
    }
  }

  /** @return the largest feature index seen plus a safety margin */
  public int getTotalNumFeatures() {
    return this.totalNumFeatures;
  }

  /** @return the number of documents this instance was created for */
  public int getNumTrainingDocs() {
    return this.numTrainingDocs;
  }

  /** @return the total number of instances over all documents */
  public int getNumTraining() {
    return this.numTraining;
  }

  /** Opens the given file as a buffered UTF-8 reader. */
  private static BufferedReader openReader(File file) throws IOException {
    return new BufferedReader(new InputStreamReader(new FileInputStream(file),
      ENCODING));
  }

  /**
   * Reads the next document header line, skipping comment lines, and splits
   * it into its items.
   */
  private static String[] readDocHeader(BufferedReader in, File source,
    int docIndex) throws IOException {
    String line = in.readLine();
    while(line != null && line.startsWith(COMMENT_PREFIX)) {
      line = in.readLine();
    }
    if(line == null) {
      // Previously surfaced as a bare NullPointerException.
      throw new IllegalStateException("Unexpected end of feature vector file "
        + source + " while reading the header of document " + docIndex);
    }
    return line.split(ConstantParameters.ITEMSEPARATOR);
  }

  /**
   * Returns the index of the last node of any instance in the document, or 0
   * if there is none.
   */
  private static int maxFeatureIndex(DocFeatureVectors doc) {
    SparseFeatureVector[] fvs = doc.getFvs();
    int max = 0;
    for(int j = 0; j < doc.getNumInstances(); ++j) {
      int len = fvs[j].nodes.length;
      if(len == 0) {
        continue; // an empty vector contributes no feature index
      }
      // Only the last node is inspected; presumably nodes are stored in
      // ascending index order -- TODO confirm against SparseFeatureVector.
      int lastIndex = fvs[j].nodes[len - 1].index;
      if(max < lastIndex) {
        max = lastIndex;
      }
    }
    return max;
  }

  /** Sums the number of instances over all documents. */
  private int countInstances() {
    int total = 0;
    for(int i = 0; i < numTrainingDocs; ++i) {
      total += trainingFVinDoc[i].getNumInstances();
    }
    return total;
  }

  /**
   * Writes each feature vector as space-separated index:value pairs, one
   * vector per line.
   */
  private static void writeFVs(BufferedWriter out, SparseFeatureVector[] fvs)
    throws IOException {
    for(int j = 0; j < fvs.length; ++j) {
      int last = fvs[j].getLen() - 1;
      // NOTE(review): the "> 0" test also skips vectors with exactly one
      // node, so they produce no line at all. Kept as in the original because
      // the reader of this file is not visible here -- confirm whether
      // ">= 0" was intended.
      if(last > 0) {
        for(int k = 0; k < last; ++k) {
          out.write(fvs[j].nodes[k].index + ":" + fvs[j].nodes[k].value + " ");
        }
        out.write(fvs[j].nodes[last].index + ":" + fvs[j].nodes[last].value
          + "\n");
      }
    }
  }

  /** Closes the resource if it is open, logging rather than propagating a failure. */
  private static void closeQuietly(Closeable resource, File file) {
    if(resource == null) {
      return;
    }
    try {
      resource.close();
    } catch(IOException e) {
      LOGGER.log(Level.WARNING, "Could not close " + file, e);
    }
  }
}