/*
* DataForLearning.java
*
* Yaoyong Li 22/03/2007
*
* $Id: DataForLearning.java, v 1.0 2007-03-22 12:58:16 +0000 yaoyong $
*/
package gate.learning.learners;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import gate.learning.ConstantParameters;
import gate.learning.DocFeatureVectors;
import gate.learning.LabelsOfFeatureVectorDoc;
import gate.learning.SparseFeatureVector;
import gate.learning.learners.svm.svm_node;
/**
* Data used for learning, read from the feature vector file.
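* <p>
* A minimal usage sketch (the feature-vector file name and the number of
* documents below are illustrative, not fixed by this class):
* <pre>
* // numDocs and "fvs.save" are example values; passing null for the
* // temporary FV file is fine because isUsingFile is false here
* DataForLearning dataFVinDoc = new DataForLearning(numDocs);
* dataFVinDoc.readingFVsFromFile(new File("fvs.save"), false, null);
* int numFeatures = dataFVinDoc.getTotalNumFeatures();
* int numInstances = dataFVinDoc.getNumTraining();
* </pre>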
*/
public class DataForLearning {
/** Number of training (or test) documents. */
private int numTrainingDocs;
/** Training feature vectors, array for each document. */
public DocFeatureVectors[] trainingFVinDoc = null;
/** Training feature vectors in svm_node format, for libSVM. */
public svm_node[][] svmNodeFVs = null;
/** Labels for each feature vector, array for each document. */
public LabelsOfFeatureVectorDoc[] labelsFVDoc = null;
/** All the unique labels in the dataset. */
String[] allUniqueLabels;
/** Total number of training examples. */
int numTraining = 0;
/** Total number of NLP features in FVs. */
int totalNumFeatures = 0;
/** Trivial constructor. */
public DataForLearning() {
}
/** Constructor with the number of documents. */
public DataForLearning(int num) {
this.numTrainingDocs = num;
}
/** Read the feature vectors from data file for training or application. */
public void readingFVsFromFile(File trainingData, boolean isUsingFile, File tempFVDataFile) {
// the array to store the training data
trainingFVinDoc = new DocFeatureVectors[numTrainingDocs];
labelsFVDoc = new LabelsOfFeatureVectorDoc[numTrainingDocs];
// Read the training data from the file: first open the training data file.
try {
// Write the FV data into a temporary file if isUsingFile is true.
BufferedWriter fvTempWr = null;
if(isUsingFile) {
fvTempWr = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream(tempFVDataFile), "UTF-8"));
}
// compute the total number of features
totalNumFeatures = 0;
BufferedReader in = new BufferedReader(new InputStreamReader(
new FileInputStream(trainingData), "UTF-8"));
String line;
String[] items;
for(int iCounter = 0; iCounter < numTrainingDocs; ++iCounter) {
final int i = iCounter;
line = in.readLine();
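// skip comment lines starting with '#'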
while(line.startsWith("#"))
line = in.readLine();
items = line
.split(gate.learning.ConstantParameters.ITEMSEPARATOR);
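// per-document header line: items[1] holds the number of instances, items[2] the document ID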
int num = Integer.parseInt(items[1]);
trainingFVinDoc[i] = new DocFeatureVectors();
trainingFVinDoc[i].setDocID(items[2]);
labelsFVDoc[i] = new LabelsOfFeatureVectorDoc();
trainingFVinDoc[i].readDocFVFromFile(in, num, labelsFVDoc[i]);
SparseFeatureVector[] fvs = trainingFVinDoc[i].getFvs();
for(int j = 0; j < trainingFVinDoc[i].getNumInstances(); ++j) {
//int[] indexes = fvs[j].getIndexes();
//if(totalNumFeatures < indexes[indexes.length - 1])
//totalNumFeatures = indexes[indexes.length - 1];
int len = fvs[j].nodes.length;
if(totalNumFeatures < fvs[j].nodes[len - 1].index)
totalNumFeatures = fvs[j].nodes[len - 1].index;
}
if(isUsingFile) {
SparseFeatureVector[] fvsInDoc = trainingFVinDoc[i].getFvs();
// For each instance
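// Each instance is written as one line of "index:value" pairs,
// e.g. "3:1.0 17:0.5 204:1.0" (values illustrative).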
for(int jC = 0; jC < fvsInDoc.length; ++jC) {
final int j = jC;
int lenM1 = fvsInDoc[j].getLen() - 1;
if(lenM1 > 0) {
for(int j1 = 0; j1 < lenM1; ++j1)
fvTempWr.append(fvsInDoc[j].nodes[j1].index + ":" + fvsInDoc[j].nodes[j1].value + " ");
fvTempWr.append(fvsInDoc[j].nodes[lenM1].index + ":" + fvsInDoc[j].nodes[lenM1].value + "\n");
}
//fvsInDoc[j] = null; //trying to remove the data from the memory
}
trainingFVinDoc[i].deleteFvs(); //trying to remove the fv data from memory
}
}//end of loop for each document
if(isUsingFile) fvTempWr.close();
in.close();
// compute the total number of training examples
numTraining = 0;
for(int i = 0; i < numTrainingDocs; ++i)
numTraining += trainingFVinDoc[i].getNumInstances();
// add a few extra for safety, because the feature index is counted from 1, not 0
totalNumFeatures += 5;
} catch(IOException e) {
e.printStackTrace();
}
return;
}
/** Read the feature vectors with multiple labels from data file for training or application. */
public void readingFVsMultiLabelFromFile(File trainingData) {
// the array to store the training data
trainingFVinDoc = new DocFeatureVectors[numTrainingDocs];
labelsFVDoc = new LabelsOfFeatureVectorDoc[numTrainingDocs];
// Read the training data from the file: first open the training data file.
try {
BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(
trainingData), "UTF-8"));
String line;
String[] items;
for(int i = 0; i < numTrainingDocs; ++i) {
line = in.readLine();
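// skip comment lines starting with '#'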
while(line.startsWith("#"))
line = in.readLine();
items = line
.split(gate.learning.ConstantParameters.ITEMSEPARATOR);
int num = Integer.parseInt(items[1]);
trainingFVinDoc[i] = new DocFeatureVectors();
labelsFVDoc[i] = new LabelsOfFeatureVectorDoc();
trainingFVinDoc[i].readDocFVFromFile(in, num, labelsFVDoc[i]);
}
// compute the total number of training examples
numTraining = 0;
for(int i = 0; i < numTrainingDocs; ++i)
numTraining += trainingFVinDoc[i].getNumInstances();
// compute the total number of features
totalNumFeatures = 0;
for(int i = 0; i < numTrainingDocs; ++i) {
SparseFeatureVector[] fvs = trainingFVinDoc[i].getFvs();
for(int j = 0; j < trainingFVinDoc[i].getNumInstances(); ++j) {
//int[] indexes = fvs[j].getIndexes();
//if(totalNumFeatures < indexes[indexes.length - 1])
//totalNumFeatures = indexes[indexes.length - 1];
int len = fvs[j].nodes.length;
if(totalNumFeatures < fvs[j].nodes[len - 1].index)
totalNumFeatures = fvs[j].nodes[len - 1].index;
}
}
// add a few extra for safety, because the feature index is counted from 1, not 0
totalNumFeatures += 5;
} catch(IOException e) {
e.printStackTrace();
}
return;
}
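/** Return the total number of NLP features in the feature vectors (plus a small safety margin). */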
public int getTotalNumFeatures() {
return this.totalNumFeatures;
}
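/** Return the number of training (or test) documents. */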
public int getNumTrainingDocs() {
return this.numTrainingDocs;
}
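/** Return the total number of training examples. */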
public int getNumTraining() {
return this.numTraining;
}
}