/*
 * LearningAPIMain.java
 *
 * Yaoyong Li 22/03/2007
 *
 * $Id: LearningAPIMain.java, v 1.0 2007-03-22 12:58:16 +0000 yaoyong $
 */
package gate.learning;

import gate.Document;
import gate.Factory;
import gate.ProcessingResource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.util.Benchmark;
import gate.util.Benchmarkable;
import gate.util.GateException;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Date;
import java.util.HashMap;
import org.apache.log4j.Logger;

/**
 * The main object of the ML API. It does the initialisation, reads parameter
 * values from the GUI, and runs the selected learning mode. It can also be
 * called from Java code, as an API (a GATE class), in order to use this
 * learning API.
 */
@SuppressWarnings("serial")
public class LearningAPIMain extends AbstractLanguageAnalyser implements
  ProcessingResource, Benchmarkable {
  /**
   * URL of the configuration file. The learned model(s) are saved in a
   * sub-directory of the directory containing this file.
   */
  private URL configFileURL;
  /**
   * Name of the AnnotationSet containing the annotations specified in the
   * DATASET element of the configuration file.
   */
  private String inputASName;
  /**
   * Name of the AnnotationSet for the annotations resulting from the
   * application of the models.
   */
  private String outputASName;
  /**
   * Run-time parameter learningMode; the principal modes are training,
   * application and evaluation.
   */
  private RunMode learningMode;
  private RunMode learningModeAppl;
  private RunMode learningModeMiTraining;
  private RunMode learningModeVIEWSVMMODEL;
  private RunMode learningModeSelectingDocs;
  /** Learning settings specified in the configuration file. */
  private LearningEngineSettings learningSettings;
  /**
   * The lightweight learning object used for obtaining the features, for
   * training and for application.
   */
  LightWeightLearningApi lightWeightApi = null;
  /** The file for the NLP learning log. */
  private File logFile;
  /** Used by lightWeightApi, specifying training or application. */
  private boolean isTraining;
  /** Sub-directory for storing the data files produced by the learning API. */
  private File wdResults = null;
  /** Object performing the evaluation. */
  private EvaluationBasedOnDocs evaluation;
  /** The MI learning information object. */
  MiLearningInformation miLearningInfor = null;
  /** The three counters for batch application. */
  int startDocIdApp;
  int endDocIdApp;
  int maxNumApp;

  /** Trivial constructor. */
  public LearningAPIMain() {
    // do nothing
  }

  /** Feature map used for exporting benchmark log messages. */
  protected java.util.Map<String, Object> benchmarkingFeatures =
    new HashMap<String, Object>();
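  /*
   * As the class comment notes, this PR can also be driven from Java code.
   * The sketch below is illustrative only, not part of this class: the
   * plugin path, configuration file path and TRAINING mode are placeholder
   * assumptions, and "corpus" is assumed to be an already-populated
   * gate.Corpus.
   *
   *   Gate.init();
   *   Gate.getCreoleRegister().registerDirectories(
   *     new File("plugins/learning").toURI().toURL());
   *   FeatureMap params = Factory.newFeatureMap();
   *   params.put("configFileURL",
   *     new File("myLearningProject/config.xml").toURI().toURL());
   *   LearningAPIMain learningPR = (LearningAPIMain)Factory.createResource(
   *     "gate.learning.LearningAPIMain", params);
   *   learningPR.setLearningMode(RunMode.TRAINING);
   *   SerialAnalyserController controller = (SerialAnalyserController)
   *     Factory.createResource("gate.creole.SerialAnalyserController");
   *   controller.add(learningPR);
   *   controller.setCorpus(corpus);
   *   controller.execute();
   */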
  /** Initialise this resource, and return it. */
  public gate.Resource init() throws ResourceInstantiationException {
    fireStatusChanged("Checking and reading learning settings!");
    // all parameters need to be checked here
    // check for the configuration file, which also determines where the
    // learned models are stored
    if(configFileURL == null)
      throw new ResourceInstantiationException(
        "configFileURL is required (it determines where the learned model "
          + "is stored) and cannot be null");
    // it is not null, check it is a file: URL
    if(!"file".equals(configFileURL.getProtocol())) {
      throw new ResourceInstantiationException(
        "configFileURL must be a file: URL");
    }
    // Get the working directory in which the configuration file resides.
    File wd = null;
    try {
      wd = new File(configFileURL.toURI()).getParentFile();
    } catch(URISyntaxException use) {
      wd = new File(configFileURL.getFile()).getParentFile();
    }
    // it must be a directory
    if(!wd.isDirectory()) {
      throw new ResourceInstantiationException(wd
        + " must be a reference to a directory");
    }
    if(LogService.minVerbosityLevel > 0)
      System.out.println("Configuration File=" + configFileURL.toString());
    try {
      if(!new File(configFileURL.toURI()).exists()) {
        throw new ResourceInstantiationException(
          "Error: the configuration file specified does not exist!");
      }
    } catch(URISyntaxException e1) {
      e1.printStackTrace();
      throw new ResourceInstantiationException(e1);
    }
    miLearningInfor = new MiLearningInformation();
    try {
      // Load the learning settings by reading the configuration file
      learningSettings =
        LearningEngineSettings.loadLearningSettingsFromFile(configFileURL);
    } catch(Exception e) {
      throw new ResourceInstantiationException(e);
    }
    try {
      // Create the sub-directory of the working directory in which the data
      // files will be stored
      if(LogService.minVerbosityLevel > 0) {
        System.out.println("\n\n*************************");
        System.out.println("A new session for NLP learning is starting.\n");
      }
      wdResults =
        new File(wd, gate.learning.ConstantParameters.SUBDIRFORRESULTS);
      wdResults.mkdir();
      logFile = new File(wdResults, ConstantParameters.FILENAMEOFLOGFILE);
      LogService.init(logFile, true, learningSettings.verbosityLogService);
      StringBuffer logMessage = new StringBuffer();
      logMessage.append("\n\n*************************\n");
      logMessage.append("A new session for NLP learning is starting.\n");
      // adding the WorkingDirectory parameter to the benchmarkingFeatures
      benchmarkingFeatures.put("workingDirectory", wd.getAbsolutePath());
      logMessage.append("NLP learning initialised at: "
        + new Date().toString() + "\n");
      logMessage.append("Working directory: " + wd.getAbsolutePath() + "\n");
      logMessage.append("The feature files and models are saved at: "
        + wdResults.getAbsolutePath() + "\n");
      // Create the lightWeightLearningApi object
      lightWeightApi = new LightWeightLearningApi(wd);
      // more initialisation
      lightWeightApi.furtherInit(wdResults, learningSettings);
      // benchmarkingFeatures.put("LearnerName",
      //   learningSettings.learnerSettings.getLearnerName());
      // benchmarkingFeatures.put("LearnerNickName",
      //   learningSettings.learnerSettings.getLearnerNickName());
      // benchmarkingFeatures.put("SurroundMode", learningSettings.surround);
      logMessage.append("Learner name: "
        + learningSettings.learnerSettings.getLearnerName() + "\n");
      logMessage.append("Learner nick name: "
        + learningSettings.learnerSettings.getLearnerNickName() + "\n");
      logMessage.append("Learner parameter settings: "
        + learningSettings.learnerSettings.learnerName + "\n");
      logMessage.append("Surround mode (or chunk learning): "
        + learningSettings.surround);
      LogService.logMessage(logMessage.toString(), 1);
      // LogService.close();
    } catch(Exception e) {
      throw new ResourceInstantiationException(e);
    }
    learningModeAppl = RunMode.APPLICATION;
    maxNumApp = learningSettings.docNumIntevalApp;
    learningModeMiTraining = RunMode.MITRAINING;
    learningModeVIEWSVMMODEL = RunMode.VIEWPRIMALFORMMODELS;
    learningModeSelectingDocs = RunMode.RankingDocsForAL;
    fireProcessFinished();
    return this;
  } // init()
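  /*
   * A note on the directory layout produced by init() (the actual file names
   * come from the ConstantParameters fields used above and are not spelled
   * out here): given a configuration file at <dir>/config.xml, <dir> becomes
   * the working directory, the sub-directory <dir>/<SUBDIRFORRESULTS> is
   * created to hold the feature files and models, and the log is written to
   * <dir>/<SUBDIRFORRESULTS>/<FILENAMEOFLOGFILE>.
   */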
  /**
   * Run the resource.
   *
   * @throws ExecutionException
   */
  public void execute() throws ExecutionException {
    // mode in which the PR is executed
    benchmarkingFeatures.put("learningMode", learningMode);
    if(learningMode.equals(learningModeVIEWSVMMODEL)) {
      if(corpus == null || corpus.size() == 0 || corpus.indexOf(document) == 0)
        lightWeightApi.viewSVMmodelsInNLPFeatures(new File(wdResults,
          ConstantParameters.FILENAMEOFModels), learningSettings);
      return;
    }
    if(learningMode.equals(learningModeSelectingDocs)) {
      // for ordering and selecting the documents for active learning
      if(corpus == null || corpus.size() == 0
        || corpus.indexOf(document) == 0) {
        // ranking the documents
        lightWeightApi.orderDocsWithModels(wdResults, learningSettings);
        // selecting the document
        // lightWeightApi.selectDocForAL()
      }
      return;
    }
    // now we need to see if the corpus is provided
    if(corpus == null)
      throw new ExecutionException("Provided corpus is null!");
    if(corpus.size() == 0)
      throw new ExecutionException("No Document found in corpus!");
    // set the benchmark ID on the lightWeightApi
    String oldLightWeightApiParentId = null;
    if(lightWeightApi instanceof Benchmarkable) {
      oldLightWeightApiParentId = lightWeightApi.getParentBenchmarkId();
      lightWeightApi.createBenchmarkId(getBenchmarkId());
    }
    // first, get the NLP features from the documents, according to the
    // feature types specified in the DataSetDefinition file
    int positionDoc = corpus.indexOf(document);
    // first document in the corpus
    if(positionDoc == 0) {
      lightWeightApi.inputASName = inputASName;
      lightWeightApi.outputASName = outputASName;
      // Obtain the MI learning information from the last learning session.
      if(learningMode.equals(this.learningModeMiTraining)) {
        miLearningInfor = new MiLearningInformation();
        File miLeFile = new File(wdResults,
          ConstantParameters.FILENAMEOFMILearningInfor);
        long startTime = Benchmark.startPoint();
        benchmarkingFeatures.put("miLearningInformationFile",
          miLeFile.getAbsolutePath());
        miLearningInfor.readDataFromFile(miLeFile);
        Benchmark.checkPoint(startTime, getBenchmarkId() + "."
          + Benchmark.READING_LEARNING_INFO, this, benchmarkingFeatures);
        benchmarkingFeatures.remove("miLearningInformationFile");
      }
      // Set the information for batch application.
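      /*
       * How batch application works (the numbers below are illustrative
       * only): execute() is called once per document; endDocIdApp is
       * incremented on each call, and once endDocIdApp - startDocIdApp
       * reaches maxNumApp (the docNumIntevalApp setting) the pending chunk
       * of documents is processed and startDocIdApp catches up. E.g. with
       * maxNumApp = 10 and a corpus of 25 documents, the chunks [0,10) and
       * [10,20) are applied in the batch branch below, and the remaining
       * [20,25) is handled by the APPLICATION case at the end of the corpus.
       */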
      startDocIdApp = 0;
      endDocIdApp = 0;
      if(LogService.minVerbosityLevel > 0)
        System.out.println("Pre-processing the " + corpus.size()
          + " documents...");
      try {
        // PrintWriter logFileIn = new PrintWriter(new FileWriter(logFile,
        //   true));
        LogService.init(logFile, true, learningSettings.verbosityLogService);
        LogService.logMessage("\n*** A new run starts.", 1);
        LogService.logMessage(
          "\nThe execution time (pre-processing the first document): "
            + new Date().toString(), 1);
        if(LogService.minVerbosityLevel > 0) {
          System.out.println("Learning starts.");
          System.out.println(
            "For information about this learning see the log file "
              + wdResults.getAbsolutePath() + File.separator
              + ConstantParameters.FILENAMEOFLOGFILE);
          System.out.println("The number of threads used is "
            + learningSettings.numThreadUsed);
        }
        // LogService.close();
        // logFileIn.println("EvaluationMode: " + evaluationMode);
        // logFileIn.println("TrainingMode: " + trainingMode);
        // logFileIn.println("InputAS: " + inputASName);
      } catch(IOException e) {
        e.printStackTrace();
      }
    }
    // Apply the model to a batch of documents
    if(learningMode.equals(learningModeAppl)) {
      ++endDocIdApp;
      if(endDocIdApp - startDocIdApp == maxNumApp) {
        try {
          // first check whether the model file is available
          String modelFileName = wdResults.toString() + File.separator
            + ConstantParameters.FILENAMEOFModels;
          if(!new File(modelFileName).exists()) {
            System.out.println(
              "Warning: the model is not available at the moment!");
            return;
          }
          BufferedWriter outNLPFeatures = null;
          BufferedReader inNLPFeatures = null;
          BufferedWriter outFeatureVectors = null;
          // EvaluationBasedOnDocs.emptyDatafile(wdResults, false);
          if(LogService.minVerbosityLevel > 0)
            System.out.println("** Application mode for documents from "
              + startDocIdApp + " to " + endDocIdApp + " (exclusive):");
          LogService.logMessage("** Application mode for documents from "
            + startDocIdApp + " to " + endDocIdApp + " (exclusive):", 1);
          isTraining = false;
          String classTypeOriginal =
            learningSettings.datasetDefinition.getClassAttribute().getType();
          outNLPFeatures = new BufferedWriter(new OutputStreamWriter(
            new FileOutputStream(new File(wdResults,
              ConstantParameters.FILENAMEOFNLPFeaturesData)), "UTF-8"));
          int numDoc;
          numDoc = endDocIdApp - startDocIdApp;
          long startTime = Benchmark.startPoint();
          benchmarkingFeatures.put("numDocs", "" + numDoc);
          for(int i = startDocIdApp; i < endDocIdApp; ++i) {
            Document toProcess = (Document)corpus.get(i);
            lightWeightApi.annotations2NLPFeatures(toProcess,
              i - startDocIdApp, outNLPFeatures, isTraining,
              learningSettings);
            if(toProcess.getDataStore() != null
              && corpus.getDataStore() != null) { // (isDatastore)
              corpus.getDataStore().sync(corpus);
              Factory.deleteResource(toProcess);
            }
          }
          outNLPFeatures.flush();
          outNLPFeatures.close();
          lightWeightApi.finishFVs(wdResults, numDoc, isTraining,
            learningSettings);
          Benchmark.checkPoint(startTime, getBenchmarkId() + "."
            + Benchmark.ANNOTS_TO_NLP_FEATURES, this, benchmarkingFeatures);
          startTime = Benchmark.startPoint();
          // Open the normal NLP feature file.
          inNLPFeatures = new BufferedReader(new InputStreamReader(
            new FileInputStream(new File(wdResults,
              ConstantParameters.FILENAMEOFNLPFeaturesData)), "UTF-8"));
          outFeatureVectors = new BufferedWriter(new OutputStreamWriter(
            new FileOutputStream(new File(wdResults,
              ConstantParameters.FILENAMEOFFeatureVectorDataApp)), "UTF-8"));
          lightWeightApi.nlpfeatures2FVs(wdResults, inNLPFeatures,
            outFeatureVectors, numDoc, isTraining, learningSettings);
          inNLPFeatures.close();
          outFeatureVectors.flush();
          outFeatureVectors.close();
          Benchmark.checkPoint(startTime, getBenchmarkId() + "."
            + Benchmark.NLP_FEATURES_TO_FVS, this, benchmarkingFeatures);
          // Apply the model
          String fvFileName = wdResults.toString() + File.separator
            + ConstantParameters.FILENAMEOFFeatureVectorDataApp;
          startTime = Benchmark.startPoint();
          lightWeightApi.applyModelInJava(corpus, startDocIdApp, endDocIdApp,
            classTypeOriginal, learningSettings, fvFileName);
          Benchmark.checkPoint(startTime, getBenchmarkId() + "."
            + Benchmark.MODEL_APPLICATION, this, benchmarkingFeatures);
          benchmarkingFeatures.remove("numDocs");
          startDocIdApp = endDocIdApp;
        } catch(IOException e) {
          e.printStackTrace();
        } catch(GateException e) {
          e.printStackTrace();
        }
      }
    }
    // we've reached the last document
    if(positionDoc == corpus.size() - 1) {
      // first select the training data and test data according to the
      // learning settings
      // set the inputASName here, because it is a run-time parameter
      int numDoc = corpus.size();
      try {
        LogService.init(logFile, true, learningSettings.verbosityLogService);
        LogService.logMessage("The learning started at "
          + new Date().toString(), 1);
        LogService.logMessage("The number of documents in dataset: "
          + numDoc, 1);
        // Open the NLP feature file for storing the NLP feature vectors
        BufferedWriter outNLPFeatures = null;
        BufferedReader inNLPFeatures = null;
        BufferedWriter outFeatureVectors = null;
        // if only the feature data is needed
        switch(learningMode) {
          case ProduceFeatureFilesOnly:
            // if only the feature data is wanted
            EvaluationBasedOnDocs.emptyDatafile(wdResults, true);
            if(LogService.minVerbosityLevel > 0)
              System.out.println("** Producing the feature files only!");
            LogService.logMessage("** Producing the feature files only!", 1);
            long startTime = Benchmark.startPoint();
            benchmarkingFeatures.put("numDocs", numDoc);
            isTraining = true;
            outNLPFeatures = new BufferedWriter(new OutputStreamWriter(
              new FileOutputStream(new File(wdResults,
                ConstantParameters.FILENAMEOFNLPFeaturesData)), "UTF-8"));
            for(int i = 0; i < numDoc; ++i) {
              Document toProcess = (Document)corpus.get(i);
              lightWeightApi.annotations2NLPFeatures(toProcess, i,
                outNLPFeatures, isTraining, learningSettings);
              if(toProcess.getDataStore() != null
                && corpus.getDataStore() != null)
                Factory.deleteResource(toProcess);
            }
            outNLPFeatures.flush();
            outNLPFeatures.close();
            lightWeightApi.finishFVs(wdResults, numDoc, isTraining,
              learningSettings);
            Benchmark.checkPoint(startTime, getBenchmarkId() + "."
              + Benchmark.ANNOTS_TO_NLP_FEATURES, this, benchmarkingFeatures);
            // Open the normal NLP feature file.
            inNLPFeatures = new BufferedReader(new InputStreamReader(
              new FileInputStream(new File(wdResults,
                ConstantParameters.FILENAMEOFNLPFeaturesData)), "UTF-8"));
            outFeatureVectors = new BufferedWriter(new OutputStreamWriter(
              new FileOutputStream(new File(wdResults,
                ConstantParameters.FILENAMEOFFeatureVectorData)), "UTF-8"));
            startTime = Benchmark.startPoint();
            lightWeightApi.nlpfeatures2FVs(wdResults, inNLPFeatures,
              outFeatureVectors, numDoc, isTraining, learningSettings);
            inNLPFeatures.close();
            outFeatureVectors.flush();
            outFeatureVectors.close();
            Benchmark.checkPoint(startTime, getBenchmarkId() + "."
              + Benchmark.NLP_FEATURES_TO_FVS, this, benchmarkingFeatures);
            // produce the n-gram language model from the feature list
            if(LogService.minVerbosityLevel > 0)
              System.out.println(
                "Write the language model in N-grams into the file "
                  + ConstantParameters.FILENAMEOFNgramLM + "!");
            LogService.logMessage(
              "Write the language model in N-grams into the file "
                + ConstantParameters.FILENAMEOFNgramLM + "!", 1);
            if(learningSettings.datasetDefinition.getNgrams().size() >= 1) {
              startTime = Benchmark.startPoint();
              lightWeightApi.featureList2LM(wdResults,
                ((Ngram)learningSettings.datasetDefinition.getNgrams().get(0))
                  .getNumber());
              Benchmark.checkPoint(startTime, getBenchmarkId() + "."
                + Benchmark.WRITING_NGRAM_MODEL, this, benchmarkingFeatures);
              // produce the term-frequency matrix
              if(LogService.minVerbosityLevel > 0)
                System.out.println(
                  "Write the term-document statistics into the file "
                    + ConstantParameters.FILENAMEOFTermFreqMatrix + "!");
              LogService.logMessage(
                "Write the term-document statistics into the file "
                  + ConstantParameters.FILENAMEOFTermFreqMatrix + "!", 1);
              startTime = Benchmark.startPoint();
              lightWeightApi.termfrequenceMatrix(wdResults, numDoc);
              Benchmark.checkPoint(startTime, getBenchmarkId() + "."
                + Benchmark.TERM_DOC_STATS, this, benchmarkingFeatures);
            } else {
              System.out.println(
                "!! Warning: cannot produce N-gram data because there is no "
                  + "Ngram definition in the configuration file!");
            }
            benchmarkingFeatures.remove("numDocs");
            // Write the names of the documents, and their total number, into
            // a file
            BufferedWriter outDocsName = new BufferedWriter(
              new OutputStreamWriter(new FileOutputStream(new File(wdResults,
                ConstantParameters.FILENAMEOFDocsName)), "UTF-8"));
            outDocsName.append("##totalDocs=" + numDoc);
            outDocsName.newLine();
            for(int i = 0; i < numDoc; ++i) {
              String docN = ((Document)corpus.get(i)).getName();
              if(docN.contains("_"))
                docN = docN.substring(0, docN.lastIndexOf("_"));
              outDocsName.append(docN);
              outDocsName.newLine();
            }
            outDocsName.flush();
            outDocsName.close();
            // Create the file for storing the names of documents selected
            // for active learning, if it doesn't exist.
            File selectedFile = new File(wdResults,
              ConstantParameters.FILENAMEOFSelectedDOCForAL);
            if(!selectedFile.exists()) selectedFile.createNewFile();
            if(LogService.minVerbosityLevel > 0)
              displayDataFilesInformation();
            break;
          case TRAINING:
            // empty the data files
            long tm1 = 0, tm2 = 0, tm3 = 0;
            if(LogService.DEBUG > 1) {
              tm1 = new Date().getTime();
            }
            EvaluationBasedOnDocs.emptyDatafile(wdResults, true);
            if(LogService.minVerbosityLevel > 0)
              System.out.println("** Training mode:");
            LogService.logMessage("** Training mode:", 1);
            startTime = Benchmark.startPoint();
            benchmarkingFeatures.put("numDocs", "" + numDoc);
            isTraining = true;
            outNLPFeatures = new BufferedWriter(new OutputStreamWriter(
              new FileOutputStream(new File(wdResults,
                ConstantParameters.FILENAMEOFNLPFeaturesData)), "UTF-8"));
            for(int i = 0; i < numDoc; ++i) {
              Document toProcess = (Document)corpus.get(i);
              lightWeightApi.annotations2NLPFeatures(toProcess, i,
                outNLPFeatures, isTraining, learningSettings);
              if(toProcess.getDataStore() != null
                && corpus.getDataStore() != null)
                Factory.deleteResource(toProcess);
            }
            outNLPFeatures.flush();
            outNLPFeatures.close();
            lightWeightApi.finishFVs(wdResults, numDoc, isTraining,
              learningSettings);
            Benchmark.checkPoint(startTime, getBenchmarkId() + "."
              + Benchmark.ANNOTS_TO_NLP_FEATURES, this, benchmarkingFeatures);
            if(LogService.DEBUG > 1) {
              tm2 = new Date().getTime();
              tm3 = tm2 - tm1;
              tm3 /= 1000;
              System.out.println("time for NLP features: " + tm3);
            }
            // Open the normal NLP feature file.
            inNLPFeatures = new BufferedReader(new InputStreamReader(
              new FileInputStream(new File(wdResults,
                ConstantParameters.FILENAMEOFNLPFeaturesData)), "UTF-8"));
            outFeatureVectors = new BufferedWriter(new OutputStreamWriter(
              new FileOutputStream(new File(wdResults,
                ConstantParameters.FILENAMEOFFeatureVectorData)), "UTF-8"));
            startTime = Benchmark.startPoint();
            lightWeightApi.nlpfeatures2FVs(wdResults, inNLPFeatures,
              outFeatureVectors, numDoc, isTraining, learningSettings);
            inNLPFeatures.close();
            outFeatureVectors.flush();
            outFeatureVectors.close();
            Benchmark.checkPoint(startTime, getBenchmarkId() + "."
              + Benchmark.NLP_FEATURES_TO_FVS, this, benchmarkingFeatures);
            if(LogService.DEBUG > 1) {
              tm1 = new Date().getTime();
              tm3 = tm1 - tm2;
              tm3 /= 1000;
              System.out.println("time for fv: " + tm3);
            }
            // if filtering the training data
            if(learningSettings.fiteringTrainingData
              && learningSettings.filteringRatio > 0.0) {
              startTime = Benchmark.startPoint();
              lightWeightApi.FilteringNegativeInstsInJava(corpus.size(),
                learningSettings);
              Benchmark.checkPoint(startTime, getBenchmarkId() + "."
                + Benchmark.FILTERING, this, benchmarkingFeatures);
            }
            if(LogService.DEBUG > 1) {
              tm2 = new Date().getTime();
              tm3 = tm2 - tm1;
              tm3 /= 1000;
              System.out.println("time for filtering: " + tm3);
            }
            startTime = Benchmark.startPoint();
            // train using the Java code
            lightWeightApi.trainingJava(corpus.size(), learningSettings);
            Benchmark.checkPoint(startTime, getBenchmarkId() + "."
              + Benchmark.MODEL_TRAINING, this, benchmarkingFeatures);
            benchmarkingFeatures.remove("numDocs");
            if(LogService.DEBUG > 1) {
              tm1 = new Date().getTime();
              tm3 = tm1 - tm2;
              tm3 /= 1000;
              System.out.println("time for NLP training: " + tm3);
            }
            break;
          case APPLICATION:
            // first check whether the model file is available
            String modelFileName = wdResults.toString() + File.separator
              + ConstantParameters.FILENAMEOFModels;
            if(!new File(modelFileName).exists()) {
              System.out.println(
                "Warning: the model is not available at the moment!");
              return;
            }
            if(endDocIdApp > startDocIdApp) {
              if(LogService.minVerbosityLevel > 0)
                System.out.println("** Application mode for documents from "
                  + startDocIdApp + " to " + endDocIdApp + " (exclusive):");
              LogService.logMessage("** Application mode for documents from "
                + startDocIdApp + " to " + endDocIdApp + " (exclusive):", 1);
              isTraining = false;
              String classTypeOriginal = learningSettings.datasetDefinition
                .getClassAttribute().getType();
              outNLPFeatures = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(new File(wdResults,
                  ConstantParameters.FILENAMEOFNLPFeaturesData)), "UTF-8"));
              numDoc = endDocIdApp - startDocIdApp;
              benchmarkingFeatures.put("numDocs", "" + numDoc);
              startTime = Benchmark.startPoint();
              for(int i = startDocIdApp; i < endDocIdApp; ++i) {
                Document toProcess = (Document)corpus.get(i);
                lightWeightApi.annotations2NLPFeatures(toProcess,
                  i - startDocIdApp, outNLPFeatures, isTraining,
                  learningSettings);
                if(toProcess.getDataStore() != null
                  && corpus.getDataStore() != null) { // (isDatastore)
                  // sync the corpus before unloading the document, as in the
                  // batch-application branch above
                  corpus.getDataStore().sync(corpus);
                  Factory.deleteResource(toProcess);
                }
              }
              outNLPFeatures.flush();
              outNLPFeatures.close();
              lightWeightApi.finishFVs(wdResults, numDoc, isTraining,
                learningSettings);
              Benchmark.checkPoint(startTime, getBenchmarkId() + "."
                + Benchmark.ANNOTS_TO_NLP_FEATURES, this,
                benchmarkingFeatures);
              // Open the normal NLP feature file.
              inNLPFeatures = new BufferedReader(new InputStreamReader(
                new FileInputStream(new File(wdResults,
                  ConstantParameters.FILENAMEOFNLPFeaturesData)), "UTF-8"));
              outFeatureVectors = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(new File(wdResults,
                  ConstantParameters.FILENAMEOFFeatureVectorDataApp)),
                "UTF-8"));
              startTime = Benchmark.startPoint();
              lightWeightApi.nlpfeatures2FVs(wdResults, inNLPFeatures,
                outFeatureVectors, numDoc, isTraining, learningSettings);
              inNLPFeatures.close();
              outFeatureVectors.flush();
              outFeatureVectors.close();
              Benchmark.checkPoint(startTime, getBenchmarkId() + "."
                + Benchmark.NLP_FEATURES_TO_FVS, this, benchmarkingFeatures);
              // Apply the model
              String fvFileName = wdResults.toString() + File.separator
                + ConstantParameters.FILENAMEOFFeatureVectorDataApp;
              startTime = Benchmark.startPoint();
              lightWeightApi.applyModelInJava(corpus, startDocIdApp,
                endDocIdApp, classTypeOriginal, learningSettings,
                fvFileName);
              Benchmark.checkPoint(startTime, getBenchmarkId() + "."
                + Benchmark.MODEL_APPLICATION, this, benchmarkingFeatures);
              benchmarkingFeatures.remove("numDocs");
              // Update the datastore for the added annotations
            }
            break;
          case EVALUATION:
            if(LogService.minVerbosityLevel > 0)
              System.out.println("** Evaluation mode:");
            LogService.logMessage("** Evaluation mode:", 1);
            evaluation =
              new EvaluationBasedOnDocs(corpus, wdResults, inputASName);
            benchmarkingFeatures.put("numDocs", corpus.size());
            startTime = Benchmark.startPoint();
            evaluation.evaluation(learningSettings, lightWeightApi);
            Benchmark.checkPoint(startTime, getBenchmarkId() + "."
              + Benchmark.EVALUATION, this, benchmarkingFeatures);
            benchmarkingFeatures.remove("numDocs");
            break;
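          /*
           * The MITRAINING case below implements incremental ("MI")
           * training: the feature vectors for each new batch of documents
           * are appended to the existing feature vector file, and the model
           * is only re-trained once the number of documents seen since the
           * last training (miNumDocsFromLast) reaches the configured
           * miDocInterval. For example (illustrative numbers only), with
           * miDocInterval = 5, running the PR over five one-document corpora
           * triggers a single re-training over all the documents accumulated
           * so far (miNumDocsTraining).
           */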
          case MITRAINING:
            if(LogService.minVerbosityLevel > 0)
              System.out.println("** MITRAINING mode:");
            LogService.logMessage("** MITRAINING mode:", 1);
            isTraining = true;
            benchmarkingFeatures.put("numDocs", "" + numDoc);
            startTime = Benchmark.startPoint();
            // Need to write the NLP features into a temporary file first,
            // then copy them into the normal NLP feature file.
            BufferedWriter outNLPFeaturesTemp = new BufferedWriter(
              new OutputStreamWriter(new FileOutputStream(new File(wdResults,
                ConstantParameters.FILENAMEOFNLPFeaturesDataTemp)),
                "UTF-8"));
            for(int i = 0; i < numDoc; ++i) {
              lightWeightApi.annotations2NLPFeatures((Document)corpus.get(i),
                i, outNLPFeaturesTemp, isTraining, learningSettings);
            }
            outNLPFeaturesTemp.flush();
            outNLPFeaturesTemp.close();
            lightWeightApi.finishFVs(wdResults, numDoc, isTraining,
              learningSettings);
            Benchmark.checkPoint(startTime, getBenchmarkId() + "."
              + Benchmark.ANNOTS_TO_NLP_FEATURES, this, benchmarkingFeatures);
            lightWeightApi.copyNLPFeat2NormalFile(wdResults,
              miLearningInfor.miNumDocsTraining);
            // Use the temporary NLP feature file instead of the normal one
            // for MI training; append the new feature vectors to the
            // existing feature vector file.
            inNLPFeatures = new BufferedReader(new InputStreamReader(
              new FileInputStream(new File(wdResults,
                ConstantParameters.FILENAMEOFNLPFeaturesDataTemp)),
              "UTF-8"));
            outFeatureVectors = new BufferedWriter(new OutputStreamWriter(
              new FileOutputStream(new File(wdResults,
                ConstantParameters.FILENAMEOFFeatureVectorData), true),
              "UTF-8"));
            startTime = Benchmark.startPoint();
            lightWeightApi.nlpfeatures2FVs(wdResults, inNLPFeatures,
              outFeatureVectors, numDoc, isTraining, learningSettings);
            inNLPFeatures.close();
            outFeatureVectors.flush();
            outFeatureVectors.close();
            Benchmark.checkPoint(startTime, getBenchmarkId() + "."
              + Benchmark.NLP_FEATURES_TO_FVS, this, benchmarkingFeatures);
            System.gc(); // encourage the JVM to release the file handles
            miLearningInfor.miNumDocsTraining += numDoc;
            miLearningInfor.miNumDocsFromLast += numDoc;
            if(miLearningInfor.miNumDocsFromLast >=
              learningSettings.miDocInterval) {
              // Start learning
              // if filtering the training data
              if(learningSettings.fiteringTrainingData
                && learningSettings.filteringRatio > 0.0) {
                benchmarkingFeatures.put("numDocs",
                  miLearningInfor.miNumDocsTraining + "");
                startTime = Benchmark.startPoint();
                lightWeightApi.FilteringNegativeInstsInJava(
                  miLearningInfor.miNumDocsTraining, learningSettings);
                Benchmark.checkPoint(startTime, getBenchmarkId() + "."
                  + Benchmark.FILTERING, this, benchmarkingFeatures);
              }
              startTime = Benchmark.startPoint();
              // train using the Java code
              lightWeightApi.trainingJava(miLearningInfor.miNumDocsTraining,
                learningSettings);
              Benchmark.checkPoint(startTime, getBenchmarkId() + "."
                + Benchmark.MODEL_TRAINING, this, benchmarkingFeatures);
              benchmarkingFeatures.remove("numDocs");
              // Reset the counter of documents since the last training to 0
              miLearningInfor.miNumDocsFromLast = 0;
            }
            File miLeFile = new File(wdResults,
              ConstantParameters.FILENAMEOFMILearningInfor);
            miLearningInfor.writeDataIntoFile(miLeFile);
            break;
          default:
            throw new GateException("The learning mode is not defined!");
        }
        LogService.logMessage("This learning session finished.", 1);
        // LogService.close();
      } catch(IOException e) {
        e.printStackTrace();
      } catch(GateException e) {
        e.printStackTrace();
      }
      // reset the parentBenchmarkID
      if(oldLightWeightApiParentId != null) {
        lightWeightApi.setParentBenchmarkId(oldLightWeightApiParentId);
      }
      if(LogService.minVerbosityLevel > 0)
        System.out.println("This learning session finished.");
    } // end of learning (positionDoc == corpus.size() - 1)
  }

  /** Print out the information for the featureData-only option. */
  private void displayDataFilesInformation() {
    StringBuffer logMessage = new StringBuffer();
    logMessage.append("NLP features for all the documents are in the file "
      + wdResults.getAbsolutePath() + File.separator
      + ConstantParameters.FILENAMEOFNLPFeaturesData + "\n");
    logMessage.append("Feature vectors in sparse format are in the file "
      + wdResults.getAbsolutePath() + File.separator
      + ConstantParameters.FILENAMEOFFeatureVectorData + "\n");
    logMessage.append("The label list is in the file "
      + wdResults.getAbsolutePath() + File.separator
      + ConstantParameters.FILENAMEOFLabelList + "\n");
    logMessage.append("The NLP feature list is in the file "
      + wdResults.getAbsolutePath() + File.separator
      + ConstantParameters.FILENAMEOFNLPFeatureList + "\n");
    logMessage.append(
      "The statistics of entity length for each class are in the file "
        + wdResults.getAbsolutePath() + File.separator
        + ConstantParameters.FILENAMEOFChunkLenStats + "\n");
    System.out.println(logMessage.toString());
    LogService.logMessage(logMessage.toString(), 1);
  }

  public void setConfigFileURL(URL configFileURL) {
    this.configFileURL = configFileURL;
  }

  public URL getConfigFileURL() {
    return this.configFileURL;
  }

  public void setInputASName(String iasn) {
    this.inputASName = iasn;
  }

  public String getInputASName() {
    return this.inputASName;
  }

  public void setOutputASName(String iasn) {
    this.outputASName = iasn;
  }

  public String getOutputASName() {
    return this.outputASName;
  }

  public RunMode getLearningMode() {
    return this.learningMode;
  }

  public void setLearningMode(RunMode learningM) {
    this.learningMode = learningM;
  }

  public EvaluationBasedOnDocs getEvaluation() {
    return evaluation;
  }

  public EvaluationBasedOnDocs setEvaluation(EvaluationBasedOnDocs eval) {
    return this.evaluation = eval;
  }

  // /////// Benchmarkable ////////////////
  private String parentBenchmarkID;
  private String benchmarkID;

  /**
   * Returns the benchmark ID of the parent of this resource.
   *
   * @return the parent benchmark ID
   */
  public String getParentBenchmarkId() {
    return this.parentBenchmarkID;
  }

  /**
   * Returns the benchmark ID of this resource.
   *
   * @return the benchmark ID
   */
  public String getBenchmarkId() {
    if(this.benchmarkID == null) {
      benchmarkID = getName().replaceAll("[ ]+", "_");
    }
    return this.benchmarkID;
  }

  /**
   * Given the ID of the parent resource, this method is responsible for
   * producing the benchmark ID, unique to this resource.
   *
   * @param parentID
   */
  public void createBenchmarkId(String parentID) {
    parentBenchmarkID = parentID;
    benchmarkID = Benchmark.createBenchmarkId(getName(), parentID);
  }

  /**
   * Sets the parent benchmark ID for this resource.
   *
   * @param benchmarkID the benchmark ID of the parent resource
   */
  public void setParentBenchmarkId(String benchmarkID) {
    parentBenchmarkID = benchmarkID;
  }

  /**
   * Returns the logger object being used by this resource.
   *
   * @return the logger
   */
  public Logger getLogger() {
    return Benchmark.logger;
  }

  public void setBenchmarkId(String arg0) {
    // stub: the benchmark ID is derived from the resource name instead
  }
}