/* * LearningEngineSettings.java * * Yaoyong Li 22/03/2007 * * $Id: LearningEngineSettings.java, v 1.0 2007-03-22 12:58:16 +0000 yaoyong $ */ package gate.learning; import java.io.File; import java.net.URISyntaxException; import java.util.Iterator; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.ThreadFactory; import org.jdom.Element; import org.jdom.input.SAXBuilder; import gate.creole.ResourceInstantiationException; import gate.util.GateException; /** * Reading and storing the learning settings from the configuration file. */ public class LearningEngineSettings { /** Storing date set definition. */ public DataSetDefinition datasetDefinition; /** Number of threads used **/; int numThreadUsed = 1; /** Threshold of the probability for the boundary token of chunk. */ float thrBoundaryProb = 0.4f; /** The threshold of the probability for the chunk. */ float thrEntityProb = 0.2f; /** The threshold of the probability for classifation. */ float thrClassificationProb = 0.2f; /** Name used in the configuration file for boundary token prob threshold. */ final static String thrBoundaryProbStr = "thresholdProbabilityBoundary"; /** Name used in the configuration file for entity prob threshold. */ final static String thrEntityProbStr = "thresholdProbabilityEntity"; /** Name used in the configuration file for classification prob. threshold. */ final static String thrClassificationProbStr = "thresholdProbabilityClassification"; /** * Two difference methods of converting multi-class problem into binary class * problems. */ short multi2BinaryMode = 1; /** One against others. */ public final static short OneVSOtherMode = 1; /** One against Another one. */ public final static short OneVSAnotehrMode = 2; /** * Name used in the configuration file for the the method of multi to binary * mode conversion. */ final static String multi2BinaryN = "multiClassification2BinaryMethod"; /** * Executor used to run the individual binary training and classification * tasks in multi-binary mode. If this is not set, {@link MultiClassLearning} * will simply run the tasks sequentially in its main thread, but as the * binary tasks are independent they could instead be run in a thread pool. */ public ExecutorService multiBinaryExecutor; /** The settings of learner specified. */ public LearnerSettings learnerSettings; /** The surround mode. */ boolean surround; /** The option if the label list is updatable. */ public boolean isLabelListUpdatable = true; /** The option if the NLP Feature List is updatable. */ public boolean isNLPFeatListUpdatable = true; /** The option if doing filtering training data or not. */ public boolean fiteringTrainingData = false; /** * Ratio of negative examples filiterd out to the total number of negative * exampels in training set. */ public float filteringRatio = 0.0f; /** * Filtering the negative examples which are nearest to classification * hyper-plane or furthest from. */ public boolean filteringNear = false; /** The setting for evaluation. */ public EvaluationConfiguration evaluationconfig = null; /** Number of document as interval between trainings in MI-learning mode. */ public int miDocInterval = 1; /** The document number interval for one applicataion in batch learning mode. */ public int docNumIntevalApp = 100; /** * Define the number of the NLP features with the biggest weights in linear * SVM model. */ public int numPosSVMModel; /** * Define the number of the NLP features with the smallest weight in linear * SVM model. */ public int numNegSVMModel; /** Active learning settings. */ ActiveLearningSetting alSetting; /** * The verbosity level for writing information into log file. 0: no real * output. 1: normal output including results and setting information. 2: * warning information. */ public int verbosityLogService = LogService.NORMAL; /** Loading the learning settings from the configuration file. */ public static LearningEngineSettings loadLearningSettingsFromFile( java.net.URL xmlengines) throws GateException { SAXBuilder saxBuilder = new SAXBuilder(false); org.jdom.Document jdomDoc = null; try { jdomDoc = saxBuilder.build(xmlengines); } catch(Exception e) { } Element rootElement = jdomDoc.getRootElement(); if(!rootElement.getName().equals("ML-CONFIG")) throw new ResourceInstantiationException( "Root element of dataset defintion file is \"" + rootElement.getName() + "\" instead of \"ML-CONFIG\"!"); // Create a learning setting object LearningEngineSettings learningSettings = new LearningEngineSettings(); learningSettings.surround = false; if(rootElement.getChild("SURROUND") != null) { String value = rootElement.getChild("SURROUND").getAttribute("value") .getValue(); learningSettings.surround = "true".equalsIgnoreCase(value); } /** Set the number of documents as training interval for mi-learning. */ learningSettings.miDocInterval = 1; if(rootElement.getChild("MI-TRAINING-INTERVAL") != null) { String value = rootElement.getChild("MI-TRAINING-INTERVAL").getAttribute( "num").getValue(); learningSettings.miDocInterval = Integer.parseInt(value); } /** Set the number of documents as interval for batch application. */ learningSettings.docNumIntevalApp = 100; if(rootElement.getChild("BATCH-APP-INTERVAL") != null) { String value = rootElement.getChild("BATCH-APP-INTERVAL").getAttribute( "num").getValue(); learningSettings.docNumIntevalApp = Integer.parseInt(value); } /** Get the setting for verbosity. */ learningSettings.verbosityLogService = LogService.NORMAL; if(rootElement.getChild("VERBOSITY") != null) { String value = rootElement.getChild("VERBOSITY").getAttribute("level") .getValue(); learningSettings.verbosityLogService = Integer.parseInt(value); } learningSettings.fiteringTrainingData = false; learningSettings.filteringRatio = 0.0f; learningSettings.filteringNear = false; if(rootElement.getChild("FILTERING") != null) { String value = rootElement.getChild("FILTERING").getAttribute("ratio") .getValue(); learningSettings.filteringRatio = Float.parseFloat(value); value = rootElement.getChild("FILTERING").getAttribute("dis").getValue(); learningSettings.filteringNear = "near".equalsIgnoreCase(value); learningSettings.fiteringTrainingData = true; } learningSettings.isLabelListUpdatable = true; if(rootElement.getChild("IS-LABEL-UPDATABLE") != null) { String value = rootElement.getChild("IS-LABEL-UPDATABLE").getAttribute( "value").getValue(); learningSettings.isLabelListUpdatable = "true".equalsIgnoreCase(value); } learningSettings.isNLPFeatListUpdatable = true; if(rootElement.getChild("IS-NLPFEATURELIST-UPDATABLE") != null) { String value = rootElement.getChild("IS-NLPFEATURELIST-UPDATABLE") .getAttribute("value").getValue(); learningSettings.isNLPFeatListUpdatable = "true".equalsIgnoreCase(value); } learningSettings.multi2BinaryMode = 1; if(rootElement.getChild("multiClassification2Binary") != null) { Element mc2b = rootElement.getChild("multiClassification2Binary"); String value = mc2b.getAttribute("method").getValue(); if(value.equalsIgnoreCase("one-vs-another")) learningSettings.multi2BinaryMode = 2; // thread-pool-size attribute causes multi-binary learning to use // a pool of threads to run the binary learning tasks, rather than // running them sequentially. This can give a big speedup on large // training sets with a multi-processor machine. String threadPoolSize = mc2b.getAttributeValue("thread-pool-size"); if(threadPoolSize != null) { try { int poolSize = Integer.parseInt(threadPoolSize); learningSettings.numThreadUsed = poolSize; // override the default thread factory with one that returns daemon // threads // so as not to stop the VM from exiting learningSettings.multiBinaryExecutor = Executors.newFixedThreadPool( poolSize, new ThreadFactory() { private ThreadFactory fac = Executors.defaultThreadFactory(); public Thread newThread(Runnable r) { Thread t = fac.newThread(r); t.setDaemon(true); return t; } }); } catch(NumberFormatException nfe) { throw new ResourceInstantiationException(threadPoolSize + " is not a valid thread-pool-size: integer expected"); } } else { learningSettings.numThreadUsed = 1; } } // Read the parameter for displaying the NLP features from linear SVM model learningSettings.numPosSVMModel = 10; learningSettings.numNegSVMModel = 0; if(rootElement.getChild("DISPLAY-NLPFEATURES-LINEARSVM") != null) { String value = rootElement.getChild("DISPLAY-NLPFEATURES-LINEARSVM") .getAttribute("numP").getValue(); if(value != null) learningSettings.numPosSVMModel = Integer.parseInt(value); value = rootElement.getChild("DISPLAY-NLPFEATURES-LINEARSVM") .getAttribute("numN").getValue(); if(value != null) learningSettings.numNegSVMModel = Integer.parseInt(value); } // for active learning setting learningSettings.alSetting = new ActiveLearningSetting(); if(rootElement.getChild("ACTIVELEARNING") != null) { String value = rootElement.getChild("ACTIVELEARNING").getAttributeValue( "numTokensPerDoc"); learningSettings.alSetting.numTokensSelect = Integer.parseInt(value); } // Read the evaluation method: k-fold CV or k-run hold-out try { Element evalelem = rootElement.getChild("EVALUATION"); if(evalelem != null) learningSettings.evaluationconfig = EvaluationConfiguration .fromXML(evalelem); else { System.out .println("! Warning no evaluation scheme is specified. So it will use the default scheme."); learningSettings.evaluationconfig = new EvaluationConfiguration(); } } catch(RuntimeException e) { } // Loading the dataset definition try { Element datasetElement = rootElement.getChild("DATASET"); learningSettings.datasetDefinition = new DataSetDefinition(datasetElement); } catch(Exception e) { throw new GateException( "The DSD element in the configureation file is missing or invalid"); } // Threshold settings Iterator parameters = rootElement.getChildren("PARAMETER").iterator(); while(parameters.hasNext()) { Element paramelem = (Element)parameters.next(); String name = paramelem.getAttribute("name").getValue(); String value = paramelem.getAttribute("value").getValue(); if(name.equals(LearningEngineSettings.thrBoundaryProbStr)) learningSettings.thrBoundaryProb = Float.parseFloat(value); if(name.equals(LearningEngineSettings.thrEntityProbStr)) learningSettings.thrEntityProb = Float.parseFloat(value); if(name.equals(LearningEngineSettings.thrClassificationProbStr)) learningSettings.thrClassificationProb = Float.parseFloat(value); } // read the setting for the engine by creating a learner subject learningSettings.learnerSettings = new LearnerSettings(); Element UEelement = rootElement.getChild("ENGINE"); if(UEelement == null) System.out .println("!! Warning: the Engine element in the configureation file is missing or invalid. " + "You CANNOT learn and apply model, but it's OK for producing the feature files."); else { if(UEelement.getAttribute("nickname") != null) learningSettings.learnerSettings.learnerNickName = UEelement .getAttribute("nickname").getValue(); else learningSettings.learnerSettings.learnerNickName = "A_Learner"; if(UEelement.getAttribute("implementationName") != null) learningSettings.learnerSettings.learnerName = UEelement.getAttribute( "implementationName").getValue(); else throw new GateException("The ENGINE element in the configuration " + "does not specify the leaner's name!"); if(UEelement.getAttribute("options") != null) learningSettings.learnerSettings.paramsOfLearning = UEelement .getAttribute("options").getValue(); if(UEelement.getAttribute("executableTraining") != null) learningSettings.learnerSettings.executableTraining = UEelement .getAttribute("executableTraining").getValue(); } return learningSettings; } }