/*
* ChineseSegMain.java
*
* Yaoyong Li 23/04/2009
*
* $Id: ChineseSegMain.java, v 1.0 2009-04-23 12:58:16 +0000 yaoyong $
*/
package gate.chineseSeg;
/**
*
* Learn a model from the segmented Chinese text and apply the learned
* model to segment Chinese text.
*
*/
import gate.Document;
import gate.Factory;
import gate.ProcessingResource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.learning.DocFeatureVectors;
import gate.learning.Label2Id;
import gate.learning.LabelsOfFV;
import gate.learning.LabelsOfFeatureVectorDoc;
import gate.learning.LearningEngineSettings;
import gate.learning.LogService;
import gate.learning.NLPFeaturesList;
import gate.learning.SparseFeatureVector;
import gate.learning.learners.MultiClassLearning;
import gate.learning.learners.PostProcessing;
import gate.learning.learners.SupervisedLearner;
import gate.util.ExtensionFileFilter;
import gate.util.GateException;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.util.Arrays;
import java.util.Comparator;
public class ChineseSegMain extends AbstractLanguageAnalyser implements
ProcessingResource {
// Directory (as a URL) where the model file, feature list, label list and
// feature-vector data are stored (written when learning, read when segmenting).
URL modelURL = null;
// Directory (as a URL) containing the plain-text files to learn from or to segment.
URL textFilesURL = null;
// Character encoding of the text files, e.g. "UTF-8" or "GB18030".
String textCode = null;
// Learning algorithm specification, e.g. "SVM", "PAUM",
// or "PAUMExec <executable> <arg1> <arg2>" for an external PAUM learner.
String learningAlg = null;
// Run mode selected by the user: learning or segmenting.
private RunMode learningMode;
// Cached RunMode.SEGMENTING constant for comparisons; set in init().
private RunMode learningModeAppl;
// Cached RunMode.LEARNING constant for comparisons; set in init().
private RunMode learningModeTraining;
/**
 * Initialise this resource and return it. Caches the two {@code RunMode}
 * constants that {@code execute()} later compares against to decide whether
 * to learn a model or to apply one.
 */
public gate.Resource init() throws ResourceInstantiationException {
  learningModeAppl = RunMode.SEGMENTING;
  learningModeTraining = RunMode.LEARNING;
  return this;
}
/**
 * Run the resource: either learn a new model from already-segmented text
 * (LEARNING mode) or apply a learned model to segment text (SEGMENTING
 * mode). All files in the directory given by textFilesURL are processed in
 * a single call; segmented output is written next to the input files.
 *
 * @throws ExecutionException
 */
public void execute() throws ExecutionException {
System.out.println("\n\n------------ new session starts ------------\n");
// NOTE(review): when run inside a corpus pipeline, only the first document
// triggers the work below; for any later document execute() returns at
// once, because the file loop below already covers the whole directory.
if(corpus != null) {
if(corpus.size() != 0) if(corpus.indexOf(document) > 0) return;
}
boolean isUpdateFeatList = true;
// boolean isTraining = true;
boolean isTraining = false;
File wdResults = null;
File logFile = null;
int verbosityLogService = 1;
BufferedWriter outFeatureVectors = null;
// do the initialisation work before the first document
// Decide the run mode; the feature list is only updated while training.
isTraining = false;
if(this.learningMode.equals(this.learningModeTraining))
isTraining = true;
else isTraining = false;
isUpdateFeatList = isTraining;
//isUpdateFeatList = true;
if(isTraining) {
System.out.println("Learning a new model from the segmented text...");
System.out.println("Learning algorithm used is "+this.learningAlg);
System.out.println("the model files will be stored in "+modelURL.toString());
System.out.println("the text used for learning are in "+this.textFilesURL.toExternalForm());
} else {
System.out.println("Applying the learned model to segment Chinese text...");
System.out.println("Learning algorithm used is "+this.learningAlg);
System.out.println("the model files used are stored in "+modelURL.toString());
System.out.println("the text for segmenting are in "+this.textFilesURL.toExternalForm());
}
verbosityLogService = 1;
// load the existing terms and labels
// File wdResults = new
// File("C:\\yaoyong\\javawk\\chineseSeg\\test\\data\\",
// ConstantParameters.FILENAME_resultsDir);
// The model URL doubles as the working/results directory.
wdResults = new File(modelURL.getPath());
if(!wdResults.exists()) wdResults.mkdir();
logFile = new File(wdResults, ConstantParameters.FILENAMEOFLOGFILE);
try {
LogService.init(logFile, true, verbosityLogService);
// read the feature list from the file
NLPFeaturesList featuresList = null;
featuresList = new NLPFeaturesList();
featuresList.loadFromFile(wdResults, ConstantParameters.FILENAME_TERMS, this.textCode);
// Make sure the padding pseudo-feature has an id in the list.
if(!featuresList.featuresList.containsKey(ConstantParameters.NONFEATURE)) {
int size = featuresList.featuresList.size() + 1;
featuresList.featuresList.put(ConstantParameters.NONFEATURE,
new Integer(size));
featuresList.idfFeatures.put(ConstantParameters.NONFEATURE,
new Integer(1));
}
// read the label list
Label2Id labelsAndId;
labelsAndId = new Label2Id();
labelsAndId.loadLabelAndIdFromFile(wdResults,
ConstantParameters.FILENAMEOFLabelList);
// determine the text files.
//ExtensionFileFilter fileFilter = new ExtensionFileFilter();
//fileFilter.addExtension("txt");
// NOTE(review): fileFilter is null, so listFiles(null) accepts every file
// in the directory — the ".txt" filter above is effectively disabled.
ExtensionFileFilter fileFilter = null;
File[] xmlFiles = new File(this.textFilesURL.getPath())
.listFiles(fileFilter);
// Process the files in a deterministic (name) order.
Arrays.sort(xmlFiles, new Comparator<File>() {
public int compare(File a, File b) {
return a.getName().compareTo(b.getName());
}
});
// corpus.populate(new
// File("C:\\yaoyong\\javawk\\chineseSeg\\test\\data").toURI().toURL(),
// fileFilter, "UTF-8", false);
// corpus.populate(new File(
// "C:\\yaoyong\\javawk\\chineseSeg\\test\\data\\labels").toURI().toURL(),
// fileFilter, "UTF-8", false);
// corpus.populate(new
// File("C:\\yaoyong\\javawk\\chineseSeg\\test\\data\\gb2312").toURI().toURL(),
// fileFilter, "GB18030", false);
if(isTraining) { // open the feature vector file for storing the fvs
outFeatureVectors = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream(new File(wdResults,
ConstantParameters.FILENAMEOFFeatureVectorData)), "UTF-8"));
}
File dirSeg = null;
// When segmenting, output goes to a results sub-directory of the text dir.
if(!isTraining) {
dirSeg = new File(this.textFilesURL.getPath(), ConstantParameters.FILENAME_resultsDir);
if(!dirSeg.exists())
dirSeg.mkdir();
}
int numDocs = 0;
for(File f : xmlFiles) { //for each file in the directory
if(!f.isDirectory()) {
// for(; iIndex < corpus.size(); ++iIndex) {
++numDocs;
// read the text from a document
Document doc = Factory.newDocument(f.toURI().toURL(), this.textCode);
doc.setName(f.getName());
System.out.println(numDocs + ", docName=" + doc.getName());
//System.out.println(numDocs + ", content=" + doc.getContent() + "*");
String text = doc.getContent().toString();
// convert the document into an array of characters with replacements
// (letter/digit/newline runs collapsed; originals kept in letterNum)
char[] chs = new char[text.length()];
int num;
StringBuffer letterNum = new StringBuffer();
num = convert2Chs(text, chs, letterNum);
// get the labels
String[] labels = new String[chs.length];
if(isTraining) {
num = obtainLabels(num, chs, labels);
// update the labels
labelsAndId.updateMultiLabelFromDoc(labels);
}
/*for(int j = 0; j < num; ++j) {
System.out.println(j + ", ch=*" + chs[j] + "* type="
+ Character.getType(chs[j]) + ", labels=*" + labels[j] + "*");
}*/
//System.out.println("** LN=" + letterNum);
// get the features from the array
String[] termC1 = new String[num + 2]; // with begin and start char
// added
String[] termC12 = new String[num + 1]; // for the c0c1
String[] termC13 = new String[num]; // for the c-1c1 feature
obtainTerms(num, chs, termC1, termC12, termC13);
// update the feature list (training only)
if(isUpdateFeatList) {
updateFeatList(featuresList, termC1);
updateFeatList(featuresList, termC12);
updateFeatList(featuresList, termC13);
}
// get the real features from the terms
DocFeatureVectors docFV = new DocFeatureVectors();
docFV.docId = new String(doc.getName());
// System.out.println("888 feat=旅"+"*,
// id="+featuresList.featuresList.get("旅").toString()+"*");
// for(Object obj:featuresList.featuresList.keySet()) {
// System.out.println("feat="+obj.toString()+",
// id="+featuresList.featuresList.get(obj).toString());
// }
putFeatsIntoDocFV(featuresList, termC1, termC12, termC13, docFV);
// One label slot per character: the true label id when training,
// a dummy (-1) when applying the model.
LabelsOfFV[] multiLabels = new LabelsOfFV[num];
for(int j = 0; j < num; ++j) {
int[] labelsId = new int[1];
if(isTraining)
labelsId[0] = new Integer(labelsAndId.label2Id.get(labels[j])
.toString()).intValue();
else labelsId[0] = -1;
float[] labelPr = new float[1];
labelPr[0] = 1;
multiLabels[j] = new LabelsOfFV(1, labelsId, labelPr);
}
System.out.println("numInstance=" + docFV.numInstances);
//BufferedWriter outSegTextLabel = null;
BufferedWriter outSegText = null;
// When segmenting, the FV file is rewritten per document and a
// per-document output file is opened for the segmented text.
if(!isTraining) {
outFeatureVectors = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream(new File(wdResults,
ConstantParameters.FILENAMEOFFeatureVectorData)), "UTF-8"));
outSegText = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream(new File(dirSeg,
f.getName()+".seg.txt")), this.textCode));
//outSegTextLabel = new BufferedWriter(new OutputStreamWriter(
// new FileOutputStream(new File(dirSeg,
// f.getName()+".segLabel.txt")), "UTF-8"));
}
docFV.addDocFVsMultiLabelToFile(numDocs, outFeatureVectors,
multiLabels);
if(!isTraining) {
outFeatureVectors.flush();
outFeatureVectors.close();
}
if(!isTraining) { // for application of the models
int[] selectedLabels = null;
selectedLabels = segementText(wdResults, this.learningAlg);
// get back to the replacements for letter, number and newline.
String[] terms = letterNum.toString().split(
ConstantParameters.SEPARATTORLN);
int kk = 0;
StringBuffer textSeg = new StringBuffer();
for(int j = 0; j < num; ++j) { //for each character
// System.out.println(j + ", ch=*" + chs[j] + "* type="
// + Character.getType(chs[j]) + ", labels=*" + selectedLabels[j]
// +
// "*");
// Map the selected label id back to its label string (l/m/r/s).
String labelC = null;
String iObj = new Integer(selectedLabels[j]+1).toString();
if(labelsAndId.id2Label.containsKey(iObj))
labelC = labelsAndId.id2Label.get(iObj).toString();
// NOTE(review): labelC stays null if the id is not in id2Label;
// labelC.equals(...) below would then throw a NullPointerException.
if(chs[j] == ConstantParameters.REPLACEMENT_Digit
|| chs[j] == ConstantParameters.REPLACEMENT_Letter
|| chs[j] == ConstantParameters.NEWLINE_Char) {
// System.out.print(terms[kk] + "(" + labelC + ") ");
//outSegTextLabel.append(terms[kk] + "(" + labelC + ") ");
// Restore the original letter/digit/newline run from letterNum.
textSeg.append(terms[kk]);
if(labelC.equals(ConstantParameters.LABEL_R) ||
labelC.equals(ConstantParameters.LABEL_S))
textSeg.append(ConstantParameters.SEPARATTOR_BLANK);
++kk;
} else if(chs[j] == ConstantParameters.REPLACEMENT_BLANK) {
//System.out.print(ConstantParameters.SEPARATTOR_BLANK + "(null ) ");
//outSegTextLabel.append(ConstantParameters.SEPARATTOR_BLANK + "(null ) ");
textSeg.append(ConstantParameters.SEPARATTOR_BLANK);
} else {
// System.out.print(chs[j] + "(" + labelC + ") ");
//outSegTextLabel.append(chs[j] + "(" + labelC + ") ");
// An ordinary character: emit it, then a blank after a word-final
// (R) or single-character-word (S) label.
textSeg.append(chs[j]);
if(labelC.equals(ConstantParameters.LABEL_R) || labelC.equals(ConstantParameters.LABEL_S))
textSeg.append(ConstantParameters.SEPARATTOR_BLANK);
}
}//end of the loop for each character
outSegText.append(textSeg);
outSegText.flush();
outSegText.close();
//outSegTextLabel.flush();
//outSegTextLabel.close();
}
//unload the document from GATE
Factory.deleteResource(doc);
}
}// end of the loop for each document in the text dir
// In training mode the FV file collects all documents; close it now.
if(isTraining) {
outFeatureVectors.flush();
outFeatureVectors.close();
}
if(isUpdateFeatList) {
featuresList.writeListIntoFile(wdResults,
ConstantParameters.FILENAME_TERMS, this.textCode);
}
// write the labels into the file
if(isTraining) {
labelsAndId.writeLabelAndIdToFile(wdResults,
ConstantParameters.FILENAMEOFLabelList);
}
// Finally run the actual learner over the collected feature vectors.
if(isTraining) learningNewModel(wdResults, numDocs, this.learningAlg);
//System.out.println("totalN=" + totalN);
System.out.println("Number of documents used is " + numDocs);
System.out.println("Finished!");
}
catch(IOException e) {
e.printStackTrace();
}
catch(ResourceInstantiationException e) {
e.printStackTrace();
}
catch(GateException e) {
e.printStackTrace();
}
}
/**
 * Train a new model from the feature-vector file previously written into
 * the results directory, and store the model file there as well.
 *
 * @param wdResults the working/results directory holding data and model files
 * @param numDocs number of documents contributing feature vectors
 * @param learningAlg the algorithm specification ("SVM", "PAUM", or
 *          "PAUMExec &lt;exec&gt; &lt;arg1&gt; &lt;arg2&gt;")
 * @throws GateException if the PAUMExec specification lacks parameters or
 *           the underlying learner fails
 */
static void learningNewModel(File wdResults, int numDocs, String learningAlg)
    throws GateException {
  // Input feature vectors and output model both live in the results dir.
  File dataFile = new File(wdResults.toString() + File.separator
      + ConstantParameters.FILENAMEOFFeatureVectorData);
  File modelFile = new File(wdResults.toString() + File.separator
      + ConstantParameters.FILENAMEOFModels);
  String dataSetFile = null; // not used by the learners obtained here
  String execName;
  String params;
  String command;
  SupervisedLearner learner;
  if(learningAlg.equalsIgnoreCase("SVM")) {
    params = " -c 0.7 -t 0 -m 100 -tau 0.8 ";
    execName = " ";
    command = execName + " " + params;
    learner = MultiClassLearning.obtainLearnerFromName("SVMLibSvmJava",
        command, dataSetFile);
  } else if(learningAlg.startsWith("PAUMExec ")) {
    params = " -p 20 -n 1 ";
    // Expect "PAUMExec <executable> <arg1> <arg2>".
    String[] items = learningAlg.split(" ");
    if(items.length < 4) {
      throw new GateException("There is no enough parameter for the learning "
          + "algorithm PAUM");
    }
    execName = items[1];
    command = execName + " " + params + " " + items[2] + " " + items[3];
    learner = MultiClassLearning.obtainLearnerFromName("PAUMExec", command,
        dataSetFile);
  } else {
    // Default: the built-in PAUM learner.
    params = " -p 20 -n 1 ";
    execName = " ";
    command = execName + " " + params;
    learner = MultiClassLearning.obtainLearnerFromName("PAUM", command,
        dataSetFile);
  }
  learner.setLearnerExecutable(execName);
  learner.setLearnerParams(params);
  MultiClassLearning learning = new MultiClassLearning(
      LearningEngineSettings.OneVSOtherMode);
  // External (executable-based) learners read their data via a temp file.
  File tempDataFile = new File(wdResults,
      ConstantParameters.TempFILENAMEofFVData);
  boolean useTempDataFile = learner.getLearnerName().equals("SVMExec")
      || learner.getLearnerName().equals("PAUMExec");
  learning.getDataFromFile(numDocs, dataFile, useTempDataFile, tempDataFile);
  // Single-threaded training (the threaded variant is intentionally unused).
  learning.trainingNoThread(learner, modelFile, useTempDataFile, tempDataFile);
}
/**
 * Segment the text whose feature vectors were just written into the results
 * directory: apply the learned model to the vectors, then post-process the
 * per-character label scores into a single selected label per character.
 * (Method name keeps the original "segement" spelling — it is part of the
 * public interface.)
 *
 * @param wdResults the directory holding the model, label-list and data files
 * @param learningAlg the learning algorithm specification
 * @return the selected (0-based) label id for each character instance
 * @throws GateException
 */
static int[] segementText(File wdResults, String learningAlg) throws GateException {
// Application always works on exactly one document's feature vectors.
int numDocs = 1;
String fvFileName = wdResults.toString() + File.separator
+ ConstantParameters.FILENAMEOFFeatureVectorData;
File dataFile = new File(fvFileName);
String modelFileName = wdResults.toString() + File.separator
+ ConstantParameters.FILENAMEOFModels;
File modelFile = new File(modelFileName);
String learningCommand = " ";
String dataSetFile = null;
String learningParas = null;
String learningExec = null;
SupervisedLearner paumLearner = null;
// Build the same learner configuration that was used for training
// (plus -optB 0.0 for the PAUM variants at application time).
if(learningAlg.equalsIgnoreCase("SVM")) {
learningParas = " -c 0.7 -t 0 -m 100 -tau 0.8 ";
learningExec = " ";
learningCommand = learningExec + " "+ learningParas;
paumLearner = MultiClassLearning.obtainLearnerFromName(
"SVMLibSvmJava", learningCommand, dataSetFile);
} else if(learningAlg.startsWith("PAUMExec ")) {
learningParas = " -p 20 -n 1 -optB 0.0 ";
String[] items = learningAlg.split(" ");
if(items.length<4) {
//System.out.println();
throw new GateException("There is no enough parameter for the learning "
+"algorithm PAUM");
}
learningExec = items[1];
learningCommand = learningExec + " "+ learningParas + " "
+ items[2] + " "+items[3];
paumLearner = MultiClassLearning.obtainLearnerFromName(
"PAUMExec", learningCommand, dataSetFile);
} else {
learningParas = " -p 20 -n 1 -optB 0.0 ";
learningExec = " ";
learningCommand = learningExec + " "+ learningParas;
paumLearner = MultiClassLearning.obtainLearnerFromName(
"PAUM", learningCommand, dataSetFile);
}
paumLearner.setLearnerExecutable(learningExec);
paumLearner.setLearnerParams(learningParas);
MultiClassLearning chunkLearning = new MultiClassLearning(
LearningEngineSettings.OneVSOtherMode);
// read data
File tempDataFile = new File(wdResults,
ConstantParameters.TempFILENAMEofFVData);
boolean isUsingTempDataFile = false;
chunkLearning.getDataFromFile(numDocs, dataFile, isUsingTempDataFile,
tempDataFile);
// Apply the model: fills dataFVinDoc with per-instance label scores.
chunkLearning.applyNoThread(paumLearner, modelFile);
LabelsOfFeatureVectorDoc[] labelsFVDoc = null;
labelsFVDoc = chunkLearning.dataFVinDoc.labelsFVDoc;
//int numClasses = chunkLearning.numClasses;
// applying to text
// String featName = engineSettings.datasetDefinition.arrs.classFeature;
// String instanceType = engineSettings.datasetDefinition.getInstanceType();
Label2Id labelsAndId = new Label2Id();
labelsAndId.loadLabelAndIdFromFile(wdResults,
ConstantParameters.FILENAMEOFLabelList);
// post-processing and add new annotation to the text
// NOTE(review): thresholdClassificaton of -999 effectively disables the
// threshold, so every instance gets some label assigned.
float boundaryP = 0;
float entityP = 0;
float thresholdClassificaton = -999;
PostProcessing postPr = new PostProcessing(boundaryP, entityP,
thresholdClassificaton);
int[] selectedLabels = new int[labelsFVDoc[0].multiLabels.length];
float[] valuesLabels = new float[labelsFVDoc[0].multiLabels.length];
postPr.postProcessingClassification((short)3, labelsFVDoc[0].multiLabels,
selectedLabels, valuesLabels);
return selectedLabels;
}
/**
 * Convert a text into an array of characters with replacements: each run of
 * letters is collapsed to REPLACEMENT_Letter, each run of digits to
 * REPLACEMENT_Digit, each run of control (newline) characters to
 * NEWLINE_Char, and blanks become REPLACEMENT_BLANK. The original
 * characters of every collapsed run are appended to {@code letterNum},
 * with SEPARATTORLN between consecutive runs, so the text can be
 * reconstructed after segmentation. Format-category characters are skipped.
 *
 * @param text the input text
 * @param chs output array receiving the converted characters (must hold at
 *          least {@code text.length()} chars)
 * @param letterNum buffer collecting the original letter/digit/newline runs
 * @return the number of characters written into {@code chs}
 */
public static int convert2Chs(String text, char[] chs, StringBuffer letterNum) {
  int num = 0;
  boolean isL = false; // currently inside a run of letters
  boolean isN = false; // currently inside a run of digits
  boolean isR = false; // currently inside a run of control (newline) chars
  for(int ind = 0; ind < text.length(); ++ind) {
    char ch = text.charAt(ind); // no need to box into a Character
    if(isDelim(ch)) {// for the blank
      chs[num] = ConstantParameters.REPLACEMENT_BLANK;
      ++num;
      continue; // not use blank
    }
    int tc = Character.getType(ch);
    // BUGFIX: the original compared the general-category value returned by
    // Character.getType() against Character.DIRECTIONALITY_* constants,
    // which only worked by numeric coincidence:
    // DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING == 16 == Character.FORMAT and
    // DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE == 15 == Character.CONTROL.
    // Use the intended category constants; runtime behaviour is identical.
    if(tc == Character.FORMAT) continue; // skip format characters
    if(tc == Character.CONTROL) { // newline and other control characters
      letterNum.append(ch);
      if(!isR) {
        if(isL || isN) letterNum.append(ConstantParameters.SEPARATTORLN);
        chs[num] = ConstantParameters.NEWLINE_Char;
        ++num;
        isR = true;
        isN = false;
        isL = false;
      }
    }
    else if(tc == Character.LOWERCASE_LETTER
      || tc == Character.UPPERCASE_LETTER || tc == Character.TITLECASE_LETTER) {
      letterNum.append(ch);
      if(!isL) {
        if(isN || isR) letterNum.append(ConstantParameters.SEPARATTORLN);
        chs[num] = ConstantParameters.REPLACEMENT_Letter;
        ++num;
        isL = true;
        isN = false;
        isR = false;
      }
    }
    else if(Character.isDigit(ch)) {
      letterNum.append(ch);
      if(!isN) {
        chs[num] = ConstantParameters.REPLACEMENT_Digit;
        ++num;
        // NOTE(review): here the separator is appended after the first
        // digit rather than before it — preserved from the original.
        if(isL || isR) letterNum.append(ConstantParameters.SEPARATTORLN);
        isN = true;
        isL = false;
        isR = false;
      }
    }
    else {
      // An ordinary (e.g. Chinese) character closes any pending run.
      if(isL || isN || isR) {
        letterNum.append(ConstantParameters.SEPARATTORLN);
        isL = false;
        isN = false;
        isR = false;
      }
      chs[num] = ch;
      ++num;
    }
  }
  return num;
}
/**
 * Obtain the word-boundary labels for pre-segmented training text, i.e. one
 * of l (word-left), m (word-middle), r (word-right), s (single-character
 * word) per character. Non-alphanumeric characters are treated as
 * single-character words by surrounding them with blanks. The chs array is
 * compacted in place (blanks removed) and labels is filled in parallel.
 *
 * @param numkk number of valid characters in chs on entry
 * @param chs in: converted characters (with blank separators); out: the
 *          characters without blanks, aligned with labels
 * @param labels out: the label string for each character
 * @return the number of labelled characters
 */
public static int obtainLabels(int numkk, char[] chs, String[] labels) {
// first add the space to the non-letter
// (each non-alphanumeric char becomes its own blank-delimited token)
int num = 0;
char[] chsSim11 = new char[3 * numkk + 1];
for(int id = 0; id < numkk; ++id) {
if(Character.isLetterOrDigit(chs[id]) || isDelim(chs[id])) {
chsSim11[num++] = chs[id];
}
else {
chsSim11[num++] = ConstantParameters.REPLACEMENT_BLANK;
chsSim11[num++] = chs[id];
chsSim11[num++] = ConstantParameters.REPLACEMENT_BLANK;
}
}
// Then remove the duplicate spaces
// (isDelA marks which surviving positions are delimiters)
char[] chsSim = new char[3 * numkk + 1];
boolean[] isDelA = new boolean[3 * numkk + 1];
int num11 = 0;
boolean isD = false;
for(int id = 0; id < num; ++id) {
// if(Character.isWhitespace(chsSim11[id])) {
if(chsSim11[id] == ConstantParameters.REPLACEMENT_BLANK) {
if(!isD) {
isD = true;
chsSim[num11] = chsSim11[id];
isDelA[num11] = true;
++num11;
}
}
else {
isD = false;
chsSim[num11] = chsSim11[id];
isDelA[num11] = false;
++num11;
}
}
// Append a sentinel delimiter so the final word is flushed below.
isDelA[num11++] = true;
// then get the labels for each character
// wS marks the start of the current word; lenW its length.
int wS = 0;
int lenW = 0;
num = 0;
for(int id = 0; id < num11; ++id) {
if(isDelA[id]) { // if the current character is delimiter
lenW = id - wS;
if(lenW == 1) { // a word with single character
labels[num] = ConstantParameters.LABEL_S; // "4";
chs[num] = chsSim[id - 1];
++num;
wS = id + 1;
}
else if(lenW > 1) { // a word with multiple characters
labels[num] = ConstantParameters.LABEL_L; // "1";
chs[num] = chsSim[id - lenW];
for(int i = 1; i < lenW - 1; ++i) {
labels[num + i] = ConstantParameters.LABEL_M; // "2";
chs[num + i] = chsSim[id - lenW + i];
}
labels[num + lenW - 1] = ConstantParameters.LABEL_R; // "3";
chs[num + lenW - 1] = chsSim[id - 1];
num += lenW;
wS = id + 1;
}
}
}
return num;
}
/** True if ch is one of the blank separators (narrow or wide form). */
static boolean isDelim(char ch) {
  return ch == ConstantParameters.SEPARATTOR_BLANK
      || ch == ConstantParameters.SEPARATTOR_BLANK_wide;
}
/**
 * Obtain the n-gram terms (feature strings) from the character array.
 * {@code termC1} receives the single characters padded with begin/end
 * markers (length num00+2); {@code termC12} the adjacent bigrams c0c1
 * (length num00+1); {@code termC13} the skip bigrams c-1c1 (length num00).
 *
 * @param num00 number of valid characters in chs
 */
static void obtainTerms(int num00, char[] chs, String[] termC1,
  String[] termC12, String[] termC13) {
  // Single-character terms, padded with the begin and end markers.
  // (Character.toString/String.valueOf replace the deprecated
  // new Character(...) and the needless new String(char[],int,int).)
  termC1[0] = Character.toString(ConstantParameters.BEGIN_Char);
  for(int i = 1; i <= num00; ++i)
    termC1[i] = String.valueOf(chs[i - 1]);
  termC1[num00 + 1] = Character.toString(ConstantParameters.END_Char);
  // Adjacent two-character terms, one following another, like c1c2.
  for(int i = 0; i <= num00; ++i)
    termC12[i] = termC1[i] + termC1[i + 1];
  // Two characters separated by one term, like c-1c1.
  for(int i = 0; i < num00; ++i) {
    termC13[i] = termC1[i] + termC1[i + 2];
  }
}
/**
 * Use the terms of one document to update the global feature list: unseen
 * terms are assigned the next free id (ids start from 1, as required by the
 * SVM-light format) and their document frequency is initialised to 1; seen
 * terms get their frequency incremented. Stops (with a warning) once the
 * list reaches MAXIMUMFEATURES.
 */
static void updateFeatList(NLPFeaturesList featuresList, String[] terms) {
  int size = featuresList.featuresList.size();
  for(int ind = 0; ind < terms.length; ++ind) {
    // Stop adding/counting once the pre-defined capacity is reached.
    if(size >= ConstantParameters.MAXIMUMFEATURES) {
      // (typo "docuemnts" fixed in the warning message)
      System.out
        .println("There are more NLP features from the training documents");
      System.out.println(" than the pre-defined maximal number"
        + ConstantParameters.MAXIMUMFEATURES);
      return;
    }
    if(!featuresList.featuresList.containsKey(terms[ind])) {
      // New feature: ids are from 1 (not zero), in the SVM-light format.
      ++size;
      featuresList.featuresList.put(terms[ind], Long.valueOf(size));
      featuresList.idfFeatures.put(terms[ind], Long.valueOf(1));
    }
    else {
      // Known feature: bump its document-frequency count.
      long idf = Long.parseLong(featuresList.idfFeatures.get(terms[ind])
        .toString());
      featuresList.idfFeatures.put(terms[ind], Long.valueOf(idf + 1));
    }
  }
}
/**
 * Build the sparse feature vector for every character of the document from
 * the term arrays and put them into docFV. Each character gets ten context
 * features (c-2..c2, the four adjacent bigrams, and the skip bigram c-1c1);
 * each feature position i uses its own disjoint id range of width
 * MAXIMUMFEATURES. Positions outside the text are padded with NONFEATURE.
 */
static void putFeatsIntoDocFV(NLPFeaturesList featuresList, String[] termC1,
  String[] termC12, String[] termC13, DocFeatureVectors docFV) {
  // termC1 carries a begin and an end marker around the real characters.
  int num = termC1.length - 2;
  docFV.numInstances = num;
  docFV.fvs = new SparseFeatureVector[docFV.numInstances];
  for(int ind = 0; ind < num; ++ind) {
    String[] feats = new String[10];
    // Single-character features around position ind.
    feats[0] = termC1[ind + 1]; // c0
    feats[1] = termC1[ind]; // c-1
    feats[2] = (ind - 1 >= 0)
      ? termC1[ind - 1] // c-2
      : ConstantParameters.NONFEATURE;
    feats[3] = termC1[ind + 2]; // c1
    feats[4] = (ind + 3 < termC1.length)
      ? termC1[ind + 3] // c2
      : ConstantParameters.NONFEATURE;
    // Two-character features.
    feats[5] = termC12[ind + 1]; // c0c1
    feats[6] = termC12[ind]; // c-1c0
    feats[7] = (ind - 1 >= 0)
      ? termC12[ind - 1] // c-2c-1
      : ConstantParameters.NONFEATURE;
    feats[8] = (ind + 2 < termC12.length)
      ? termC12[ind + 2] // c1c2
      : ConstantParameters.NONFEATURE;
    feats[9] = termC13[ind]; // c-1c1
    // Look each feature up in the list; unknown features get a zero-valued
    // node at the base id of their position's range.
    docFV.fvs[ind] = new SparseFeatureVector(10);
    for(int i = 0; i < 10; i++) {
      if(featuresList.featuresList.containsKey(feats[i])) {
        docFV.fvs[ind].nodes[i].index = Integer.parseInt(
          featuresList.featuresList.get(feats[i]).toString())
          + i * ConstantParameters.MAXIMUMFEATURES;
        docFV.fvs[ind].nodes[i].value = 1;
      } else {
        docFV.fvs[ind].nodes[i].index = i * ConstantParameters.MAXIMUMFEATURES;
        docFV.fvs[ind].nodes[i].value = 0;
      }
    }
  }
}
/** Set the URL of the directory holding the model files. */
public void setModelURL(URL modelU) {
  modelURL = modelU;
}
/** @return the URL of the directory holding the model files */
public URL getModelURL() {
  return modelURL;
}
/** Set the URL of the directory containing the text files to process. */
public void setTextFilesURL(URL modelU) {
  textFilesURL = modelU;
}
/** @return the URL of the directory containing the text files to process */
public URL getTextFilesURL() {
  return textFilesURL;
}
/** @return the current run mode (learning or segmenting) */
public RunMode getLearningMode() {
  return learningMode;
}
/** Set the run mode (learning or segmenting). */
public void setLearningMode(RunMode learningM) {
  learningMode = learningM;
}
/** @return the character encoding used for the text files */
public String getTextCode() {
  return textCode;
}
/** Set the character encoding used for the text files. */
public void setTextCode(String tcode) {
  textCode = tcode;
}
/** @return the learning algorithm specification */
public String getLearningAlg() {
  return learningAlg;
}
/** Set the learning algorithm specification. */
public void setLearningAlg(String la) {
  learningAlg = la;
}
}