/* * Copyright (c) 2004, The University of Sheffield. * * This file is part of GATE (see http://gate.ac.uk/), and is free * software, licenced under the GNU Library General Public License, * Version 2, June 1991 (in the distribution as file licence.html, * and also available at http://gate.ac.uk/gate/licence.html). * * Mike Dowman 08-04-2004 * * $Id: SVMLightWrapper.java 7452 2006-06-15 14:45:17 +0000 (Thu, 15 Jun 2006) ian_roberts $ * */ package gate.creole.ml.svmlight; import gate.creole.ml.*; import gate.util.GateException; import gate.creole.ExecutionException; import gate.gui.MainFrame; import java.util.List; import java.io.File; import java.io.IOException; /** * Wrapper class for the SVM Light support vector machine learning algorithm. * The executable files, SVM_Learn and SVM_Classify must be placed on your path * in order for this wrapper to work. * {@ see http://svmlight.joachims.org/} */ public class SVMLightWrapper implements AdvancedMLEngine, gate.gui.ActionsPublisher { static boolean DEBUG = false; /** * This constructor sets up action list so that these actions (loading and * saving models and data) will be available from a context menu in the * gui). */ public SVMLightWrapper() { actionsList = new java.util.ArrayList(); actionsList.add(new LoadDatasetAction()); actionsList.add(new SaveDatasetAction()); actionsList.add(new LoadModelAction()); actionsList.add(new SaveModelAction()); actionsList.add(null); } /** * Delete all the temporary files when the processing resource is closed. */ public void cleanUp() { try { trainingDataFile.delete(); testDataFile.delete(); modelFile.delete(); resultsFile.delete(); } catch (java.lang.SecurityException ex) { // If an exception is thrown just do nothing. } } /** * Take a representation of the part of the XML configuration file * which corresponds to <OPTIONS>, and store it. 
* * @throws GateException */ public void setOptions(org.jdom.Element optionsElem) { this.optionsElement = optionsElem; } /** * See if any <CLASSIFIER-OPTIONS> are specified in the congif file. If * such an element exists, then extract the string of options and store it. * Otherwise set classifierOptions to the empty string. * * This is the only configuration file option for SVM Light. */ private void extractAndCheckOptions() { if (optionsElement == null) { classifierOptions = ""; return; } classifierOptions = optionsElement.getChildTextTrim("CLASSIFIER-OPTIONS"); if (classifierOptions == null) classifierOptions = ""; } /** * This is called to add a new training instance to the data set collected * in this wrapper object. * * @param attributeValues A list of String objects, each of which corresponds * to an attribute value. For boolean attributes the values will be true or * false. */ public void addTrainingInstance(List attributeValues) { trainingData.add(attributeValues); datasetChanged = true; } /** * Set the data set defition for this classifier. * * @param definition A specification of the types and allowable values of * all the attributes, as specified in the <DATASET> part of the * configuration file. */ public void setDatasetDefinition(DatasetDefintion definition) { this.datasetDefinition = definition; } /** * Tests that the attributes specified in the DatasetDefinition are valid for * SVM Light. That is that the class attribute is boolean, numeric, or is a * two or three value nominal. (In the case of a three value nominal, the * first value will be taken to be positive (1), the second negative (-1) * and the third indicates that no value is known for this instance, and * that transduction should be used instead (0). A two value nominal has * only the first of these two possibilities. */ private void checkDatasetDefinition() throws gate.creole. ResourceInstantiationException { // Just find the class attribute, and check that it's the right kind. 
Attribute classAttribute = (Attribute) datasetDefinition.getClassAttribute(); if (classAttribute.semanticType() == Attribute.NOMINAL) { if (classAttribute.getValues().size() != 2 && classAttribute.getValues().size() != 3) { throw new gate.creole.ResourceInstantiationException( "Error in SVM Light configuration file. The <CLASS> attribute " + "must be boolean, numeric or a two or three valued nominal."); } } } /** * Decide on the outcome for the instance, based on the values of all the * features. * * N.B. Unless this function was previously called, and there has been no new * data added since, the model will be trained when it is called. This could * result in calls to this function taking a long time to execute. * * @param attributeValues A list of all the attributes, including the * <CLASS/> attribute. The value of the <CLASS/> attribute is, * however, arbitrary. * * @return A string value giving the nominal value of the class or, if the * outcome is boolean, a java String with value "true" or "false", or if the * 'class' is numeric, the estimated numeric value for class. * * @throws ExecutionException */ public Object classifyInstance(List attributeValues) throws ExecutionException { try { // If no training examples have been added yet. if (trainingData.size() == 0) throw new ExecutionException("An attempt has been made to use an SVM " + "Light model to classify data before the " + "model was been trained. At least one " + "training example must be provided."); // First we need to check whether we need to create a new model. // If either we've never made a model, or some new data has been added, then // we need to train a new model. if (!modelTrained || datasetChanged) { initialiseAndTrainClassifier(); } // The data now reflects the model, so keep a note of this so we don't // have to retrain the model if using the same data. datasetChanged = false; // Before we classify, we need to save the test data to the disk. 
saveTestInstanceToDisk(attributeValues); // Then try to classify stuff. There is no option to use a confidence // threshold, so we will just get a simple prediction for the class. // This function call returns a string or double. // First convert the attribute values to an SVM Light document (the value // given to the class attribute is arbitrary. java.lang.Process svmLightProcess; try { svmLightProcess = (Runtime.getRuntime()).exec(new String[] {"svm_classify", "-v", "0", testDataFile.getPath(), modelFile.getPath(), resultsFile.getPath()}); // We need to read the standard output and error streams, otherwise // the process won't run. java.io.BufferedReader stdOutput = new java.io.BufferedReader(new java.io.InputStreamReader(svmLightProcess.getInputStream())); java.io.BufferedReader stdError = new java.io.BufferedReader(new java.io.InputStreamReader(svmLightProcess.getErrorStream())); // Display any standard output or error. (The verbosity can be set to // zero which normally stops there being any standard output.) String string; while ( (string = stdOutput.readLine()) != null) System.out.println(string); while ( (string = stdError.readLine()) != null) System.out.println(string); // Then wait until the process has completely finished. svmLightProcess.waitFor(); } catch (Exception ex) { modelTrained = false; ex.printStackTrace(); throw new gate.util.GateRuntimeException("Exception occured when an " + "attempt was made to run " + "svm_classify.\n"); } // Check that the processes exit code is normal. if (svmLightProcess.exitValue() != 0) { modelTrained = false; throw new RuntimeException("svm_classify did not exit normally when " + "called as an external command."); } double classificationResult; // Extract the class value and assign it to classificatin result. 
try { classificationResult = extractResultFromResultsFile(); } catch (Exception ex) { throw new gate.util.GateRuntimeException( "Error when reading the result of " + "classification from the results file."); } return classificationResult2GateFormat(classificationResult); } catch (java.io.IOException ex) { throw new ExecutionException(ex); } } // classifyInstance /** * Decide on the outcomes for all the instances, based on the values of all * the features for each of the instances in a document. * * N.B . Unless this function was previously called, and there has been no new * data added since, the model will be trained when it is called. This could * result in calls to this function taking a long time to execute. * * @param attributeValues A list of lists of all the attributes, (one list * per instance) including the <CLASS/>attribute. The value of the * <CLASS/>attribute is, however, arbitrary. * * @return A list of string values giving the nominal value of the class or, * if the outcome is boolean, a java String with value "true" or "false", or * if the 'class' is numeric, the estimated numeric value for class. * * @throws ExecutionException */ public List batchClassifyInstances(List instances) throws ExecutionException { try { // If no training examples have been added yet. if (trainingData.size() == 0) throw new ExecutionException("An attempt has been made to use an SVM " + "Light model to classify data before the " + "model was been trained. At least one " + "training example must be provided."); // First we need to check whether we need to create a new model. // If either we've never made a model, or some new data has been added, then // we need to train a new model. if (!modelTrained || datasetChanged) { initialiseAndTrainClassifier(); } // The data now reflects the model, so keep a note of this so we don't // have to retrain the model if using the same data. datasetChanged = false; // Before we classify, we need to save all the test data to the disk. 
saveAllTestInstancesToDisk(instances); // Then try to classify stuff. There is no option to use a confidence // threshold, so we will just get a simple prediction for the class. // This function call returns a string or double. // First convert the attribute values to an SVM Light document (the value // given to the class attribute is arbitrary. java.lang.Process svmLightProcess; try { svmLightProcess = (Runtime.getRuntime()).exec(new String[] {"svm_classify", "-v", "0", testDataFile.getPath(), modelFile.getPath(), resultsFile.getPath()}); // We need to read the standard output and error streams, otherwise // the process won't run. java.io.BufferedReader stdOutput = new java.io.BufferedReader(new java.io.InputStreamReader(svmLightProcess.getInputStream())); java.io.BufferedReader stdError = new java.io.BufferedReader(new java.io.InputStreamReader(svmLightProcess.getErrorStream())); // Display any standard output or error. (The verbosity can be set to // zero which normally stops there being any standard output.) String string; while ( (string = stdOutput.readLine()) != null) System.out.println(string); while ( (string = stdError.readLine()) != null) System.out.println(string); // Then wait until the process has completely finished. svmLightProcess.waitFor(); } catch (Exception ex) { modelTrained = false; ex.printStackTrace(); throw new gate.util.GateRuntimeException("Exception occured when an " + "attempt was made to run " + "svm_classify.\n"); } // Check that the processes exit code is normal. if (svmLightProcess.exitValue() != 0) { modelTrained = false; throw new RuntimeException("svm_classify did not exit normally when " + "called as an external command."); } // Get all the results out of the results file, convert them to GATE // format, put them in a list, and return the list. 
java.util.List Results = extractAllResultsFromResultsFile(); return Results; } catch (java.io.IOException ex) { throw new ExecutionException(ex); } } // batchClassifyInstances private List extractAllResultsFromResultsFile() { try { // First open the file. java.io.FileReader reader = new java.io.FileReader(resultsFile.getCanonicalPath()); MyStringReader inputFile = new MyStringReader(reader); reader.close(); // Then extract all the results, converting them to gate format as we go. List classificationResults = new java.util.ArrayList(); while (inputFile.endOfFileReached() == false) { String result = inputFile.readItem(); inputFile.skipToStartOfNextLine(); classificationResults.add(classificationResult2GateFormat (Double.parseDouble(result))); } return classificationResults; } catch (Exception ex) { ex.printStackTrace(); throw new gate.util.GateRuntimeException( "Error when reading the results of a batch " + "classification from the results file."); } } // extractAllResultsFromResultsFile /** * Just take an instance, in the form of a list of values (as passed from * the machine learning PR), and save it to disk in SVM Light format, so that * it can be read by svm_classify * * @param attributeValues The training instance in the form received from * the machine learning PR. */ private void saveTestInstanceToDisk(java.util.List attributeValues) throws java.io.IOException, gate.creole.ExecutionException { java.io.FileWriter fileWriter = new java.io.FileWriter(testDataFile.getCanonicalPath(), false); fileWriter.write(new SVMLightDocument(datasetDefinition, nominalValue2IntegerHash, attributeValues ).toString()); fileWriter.close(); } /** * Take a list of instances, in the form of a list of lists of values, in * the form passed from the machine learning PR, and save it to disk in SVM * Light format, so that it can be read by svm_classify. * * @param instances The list of training instances. 
* @throws java.io.IOException * @throws gate.creole.ExecutionException */ private void saveAllTestInstancesToDisk(List instances) throws java.io.IOException, gate.creole.ExecutionException { java.io.FileWriter fileWriter = new java.io.FileWriter(testDataFile.getCanonicalPath(), false); saveDataset(fileWriter, instances); fileWriter.close(); } /** * Just open the file, and read the number it contains on its first line, * converting it to a double. * * @return The value on the first line of the results file. */ private double extractResultFromResultsFile() throws java.io.IOException { java.io.FileReader reader = new java.io.FileReader(resultsFile.getCanonicalPath()); MyStringReader inputFile=new MyStringReader(reader); String classValue=inputFile.readItem(); reader.close(); return Double.parseDouble(classValue); } /** * This function converts back from classification results to the format in * used for attributes by GATE. It is the converse of the conversion code * found in SVMLightDocument. */ private Object classificationResult2GateFormat( double classificationResult) { gate.creole.ml.Attribute classAttribute = datasetDefinition.getClassAttribute(); // Numeric attributes need no conversion, except that they need to be // objects to be returned. if (classAttribute.semanticType() == gate.creole.ml.Attribute.NUMERIC) { return new Double(classificationResult); // Boolean attributes need to be converted to strings, depending on the // sign of the returned double. } else if (classAttribute.semanticType() == gate.creole.ml.Attribute.BOOLEAN) { if (classificationResult < 0) { return new String("false"); } else { return new String("true"); } } else { // Otherwise it must be a Nominal attribute. // As it is a class attribute, it will be a two or three value nominal, // but only the first two values (true and false) can be returned as a // result of classification. if (classificationResult < 0) { // If result is negative, get the second nominal value, and return it. 
return (String) classAttribute.getValues().get(1); } else { // Otherwise must be a positive result, so return the first nominal // value. return (String) classAttribute.getValues().get(0); } } } /** * Use svm_learn to create a new svm model, based on all the data currently * stored in the wrapper. * * @throws gate.creole.ExecutionException */ public void initialiseAndTrainClassifier() throws gate.creole. ExecutionException, java.io.IOException { if (DEBUG) System.out.println("Entering initialise and train classifier"); // First save the dataset to disk, so svm_learn can access it. java.io.FileWriter fileWriter = new java.io.FileWriter(trainingDataFile.getCanonicalPath(), false); saveDataset(fileWriter, trainingData); fileWriter.close(); if (DEBUG) System.out.println("Training dataset saved to disk"); java.lang.Process svmLightProcess; try { svmLightProcess = (Runtime.getRuntime()).exec( optionsString2OptionsList( getSVMLightClassificationOrRegressionOption() + classifierOptions)); if (DEBUG) System.out.println("SVM_LEARN process started"); // We need to read the standard output and error streams, otherwise // the process won't run. java.io.BufferedReader stdOutput = new java.io.BufferedReader(new java.io.InputStreamReader(svmLightProcess.getInputStream())); java.io.BufferedReader stdError = new java.io.BufferedReader(new java.io.InputStreamReader(svmLightProcess.getErrorStream())); if (DEBUG) System.out.println("output streams created"); // Display any standard output or error. (The verbosity can be set to // zero which normally stops there being any standard output.) String string; while ( (string = stdOutput.readLine()) != null) { if (DEBUG) System.out.println("Printing output"); System.out.println(string); } while ( (string = stdError.readLine()) != null) { if (DEBUG) System.out.println("Printing error"); System.out.println(string); } // Wait until the process has completely finished. 
svmLightProcess.waitFor(); if(DEBUG) System.out.println("SVM LEARN PROCESS FINISHED"); } catch (Exception ex) { modelTrained = false; ex.printStackTrace(); throw new gate.util.GateRuntimeException("Exception occured when an " + "attempt was made to run " + "svm_learn.\n"); } // Check that the processes exit code is normal. if (svmLightProcess.exitValue() != 0) { modelTrained = false; throw new RuntimeException("svm_learn did not exit normally when " + "called as an external command."); } modelTrained = true; if (DEBUG) System.out.println("Leaving initialise and train classifier"); } /** * Takes options for SVM_Learn in string form, and converts them to the form * required by java for calling system functions, adding on the required * filenames for data and model to the end of the list of options. * * @param options The list of options in the form of a string, separated by * tabs or spaces. * @return A list with one options per entry, in the same order as they * appeared in the input string, followed by the name of the data file, and * finally the name of the model file. (N.B. options here means each item * separated by white space - in reality an svm light option is two of these * options.) */ java.lang.String[] optionsString2OptionsList(String optionsString) { String[] optionsArray1 = optionsString.split("\\s"); // Make an array with enough space for all the options plus the two // filenames, plus the name of the command. 
java.lang.String[] optionsArray= new java.lang.String[optionsArray1.length+3]; optionsArray[0] = "svm_learn"; for (int i=0; i < optionsArray1.length; ++i) { optionsArray[i+1] = optionsArray1[i]; } optionsArray[optionsArray.length-2] = trainingDataFile.getPath(); optionsArray[optionsArray.length-1] = modelFile.getPath(); if (DEBUG) { System.out.println("Options array contents are:"); for (int i=0; i<optionsArray.length; ++i) { System.out.println(optionsArray[i]); } } return optionsArray; } /** * Get the SVM Light command line option specifying whether we are doing * regression or classification. * @return The String "-z r " if the class attribute is numeric, otherwise * the String "-z c ". */ String getSVMLightClassificationOrRegressionOption() { if (datasetDefinition.getClassAttribute().semanticType() == Attribute.NUMERIC) { return "-z r "; } else { return "-z c "; } } /** * Convert the training data to a form which can be passed to the native * code function, and which is closer to that required by SVMLight itself. * N.B. SVM Light calls documents instances. * * @return An array of instances (a.k.a. documents), containing all the * training data. */ private SVMLightDocument[] convertTrainingDataToArrayOfDocuments() throws gate.creole.ExecutionException { // Create an array with one element for each training instance. SVMLightDocument[] documents = new SVMLightDocument[trainingData.size()]; int indexInArray = 0; java.util.Iterator trainingDataIterator = trainingData.iterator(); while (trainingDataIterator.hasNext()) { documents[indexInArray] = new SVMLightDocument(datasetDefinition, nominalValue2IntegerHash, (java.util.List) trainingDataIterator.next()); ++indexInArray; } return documents; } /** * Initialises the classifier and prepares for running. Before calling this * method, the datasetDefinition and optionsElement fields should have been * set using calls to the appropriate methods. 
* * It also creates temporary files needed for passing data to and from * SVMLight. * * @throws GateException If it is not possible to initialise the classifier * for any reason. */ public void init() throws GateException { if (DEBUG) { System.out.println("Entering SVMLightWrapper.init()"); } //see if we can shout about what we're doing sListener = null; java.util.Map listeners = gate.gui.MainFrame.getListeners(); if (listeners != null) { sListener = (gate.event.StatusListener) listeners.get("gate.event.StatusListener"); } if (sListener != null) { sListener.statusChanged("Setting classifier options..."); } extractAndCheckOptions(); if (sListener != null) { sListener.statusChanged("Checking dataset definition..."); } checkDatasetDefinition(); // N.B. We don't initialise the classifier here, because svmLight classifiers, // are both initialised and trained at the same time. Hence initialisation // takes place in the method classifyInstance. //initialise the dataset if (sListener != null) { sListener.statusChanged("Initialising dataset..."); } trainingData = new java.util.ArrayList(); createNominalValue2IntegerHash(); // Create the working directory in which temporary files can be stored. if (sListener != null) { sListener.statusChanged("Creating temporary files..."); } try { // Now create file objects for each of the four files we need. // Java will put these files in the default temporary files directory. trainingDataFile = java.io.File.createTempFile("train", null); testDataFile = java.io.File.createTempFile("test", null); modelFile = java.io.File.createTempFile("model", null); resultsFile = java.io.File.createTempFile("results", null); // We want all of these files to be deleted when the java virtual machine // exits. They will also be deleted if the processing resource is deleted // by factory. 
trainingDataFile.deleteOnExit(); testDataFile.deleteOnExit(); modelFile.deleteOnExit(); resultsFile.deleteOnExit(); } catch (java.io.IOException exception) { throw new gate.creole.ResourceInstantiationException( "Unable to create temporary files needed for SVMLightWrapper."); } if (sListener != null) { sListener.statusChanged(""); } } // init /** * Create a hash so that nominal values can quickly be mapped * to their feature numbers as used by SVM Light. */ private void createNominalValue2IntegerHash() { nominalValue2IntegerHash = new java.util.HashMap(); java.util.Iterator attributeIterator = datasetDefinition.getAttributes().iterator(); // number the attribtues from 1. int attributeNumber = 0; while (attributeIterator.hasNext()) { // Get the list of feature values for this attribute. Attribute currentAttribute = (Attribute) attributeIterator.next(); // Only do the indexing for nominal attributes. if (currentAttribute.semanticType() == Attribute.NOMINAL) { java.util.List features = currentAttribute.getValues(); java.util.Iterator featureIterator = features.iterator(); int featureValueNumber = 1; // Number the feature values from 1. while (featureIterator.hasNext()) { // Add keys of the form attributeNumber:attributeFeatureValue pointing // to the index of the particular feature value. nominalValue2IntegerHash.put("" + attributeNumber + ":" + featureIterator.next() , new Integer(featureValueNumber)); ++featureValueNumber; } } ++attributeNumber; } } // createNominalValue2IntegerHash /** * Gets the list of actions that can be performed on this resource. * @return a List of Action objects (or null values) */ public java.util.List getActions() { return actionsList; } /** * Registers the PR using the engine with the engine itself. * @param pr the processing resource that owns this engine. 
*/ public void setOwnerPR(gate.ProcessingResource pr) { this.owner = pr; } public DatasetDefintion getDatasetDefinition() { return datasetDefinition; } /** * This class adds the option to the context menu in the GUI that allows the * user to load a dataset which is in SVM Light's own format from a file. */ protected class LoadDatasetAction extends javax.swing.AbstractAction { public LoadDatasetAction() { super("Load data from SVM Light format file"); putValue(SHORT_DESCRIPTION, "Loads training data from a file in SVM Light format and " + "appends it to the current dataset."); } /** * This is the funtion called when the user selects the menu option * load dataset. * * @param evt */ public void actionPerformed(java.awt.event.ActionEvent evt) { Runnable runnable = new Runnable() { public void run() { javax.swing.JFileChooser fileChooser = gate.gui.MainFrame.getFileChooser(); fileChooser.setFileFilter(fileChooser.getAcceptAllFileFilter()); fileChooser.setFileSelectionMode(javax.swing.JFileChooser.FILES_ONLY); fileChooser.setMultiSelectionEnabled(false); if (fileChooser.showOpenDialog(null) == javax.swing.JFileChooser.APPROVE_OPTION) { java.io.File file = fileChooser.getSelectedFile(); try { gate.gui.MainFrame.lockGUI("Loading dataset..."); java.io.FileReader reader = new java.io.FileReader(file.getCanonicalPath()); loadDataset(reader); reader.close(); } catch (Exception e) { javax.swing.JOptionPane.showMessageDialog(MainFrame.getInstance(), "Error!\n" + e.toString(), "Gate", javax.swing.JOptionPane.ERROR_MESSAGE); e.printStackTrace(gate.util.Err.getPrintWriter()); } finally { gate.gui.MainFrame.unlockGUI(); } } } }; Thread thread = new Thread(runnable, "DatasetSaver(SVM Light format)"); thread.setPriority(Thread.MIN_PRIORITY); thread.start(); } } protected class SaveDatasetAction extends javax.swing.AbstractAction { public SaveDatasetAction() { super("Save dataset in SVM Light format"); putValue(SHORT_DESCRIPTION, "Saves the dataset to a file in SVM Light format"); } 
public void actionPerformed(java.awt.event.ActionEvent evt) { Runnable runnable = new Runnable() { public void run() { javax.swing.JFileChooser fileChooser = gate.gui.MainFrame.getFileChooser(); fileChooser.setFileFilter(fileChooser.getAcceptAllFileFilter()); fileChooser.setFileSelectionMode(javax.swing.JFileChooser.FILES_ONLY); fileChooser.setMultiSelectionEnabled(false); if (fileChooser.showSaveDialog(null) == javax.swing.JFileChooser.APPROVE_OPTION) { java.io.File file = fileChooser.getSelectedFile(); try { gate.gui.MainFrame.lockGUI("Saving dataset..."); java.io.FileWriter fw = new java.io.FileWriter(file.getCanonicalPath(), false); saveDataset(fw, trainingData); fw.close(); } catch (java.io.IOException ioe) { javax.swing.JOptionPane.showMessageDialog(MainFrame.getInstance(), "Error!\n" + ioe.toString(), "Gate", javax.swing.JOptionPane.ERROR_MESSAGE); ioe.printStackTrace(gate.util.Err.getPrintWriter()); } finally { gate.gui.MainFrame.unlockGUI(); } } } }; Thread thread = new Thread(runnable, "DatasetSaver(SVM Light format)"); thread.setPriority(Thread.MIN_PRIORITY); thread.start(); } } /** * Write the data set to a file in SVM Light format. * * @param writer An open file writer to which the data is to be written. * @param dataSet The data set to be saved, in the form of a list of * attributes in the form passed from the ML PR. */ public void saveDataset(java.io.FileWriter writer, java.util.List dataSet) { try { java.util.Iterator trainingDataIterator = dataSet.iterator(); while (trainingDataIterator.hasNext()) { // Create a new SVM Light document for each instance, and convert it to // a String. writer.write(new SVMLightDocument(datasetDefinition, nominalValue2IntegerHash, (java.util.List) trainingDataIterator.next()).toString()); } writer.flush(); } catch (java.io.IOException ioe) { throw new gate.util.GateRuntimeException(ioe.getMessage()); } // Just change the kind of exception to the correct one. 
catch (gate.creole.ExecutionException ex) { throw new gate.util.GateRuntimeException(ex); } } /** * Reads training data in SVM Light format from a file and adds it to the * collection of training examples. * * @param reader A file reader from which to read the data. */ public void loadDataset(java.io.FileReader reader) throws gate.util.GateRuntimeException, java.io.IOException { MyStringReader inputFile=new MyStringReader(reader); inputFile.skipLeadingComments(); while (inputFile.endOfFileReached()==false) { readAndAddInstance(inputFile); } } private void readAndAddInstance(MyStringReader inputFile) throws gate.util.GateRuntimeException { // First check if there's no more data to read, and if so just return. inputFile.skipBlankLinesAndWhiteSpace(); if (inputFile.endOfFileReached()) return; String classValue=inputFile.readItem(); List featureNumbers=new java.util.ArrayList(); List featureValues=new java.util.ArrayList(); while (true) { // Read in a feature value pair, but if there aren't any more exit the // loop. FeatureValuePair featureValuePair= inputFile.readFeatureValuePair(); if (featureValuePair==null) break; featureNumbers.add(new Integer(featureValuePair.featureNumber)); featureValues.add(new Double(featureValuePair.featureValue)); } // Convert the data we've read into the form required, and add it to the // training data. if (DEBUG) { System.out.println("Adding an instance with class value="+classValue +" and feature numbers:"+featureNumbers +" and feature values: "+featureValues); System.out.println("There are "+featureNumbers.size()+" feature numbers"+ " and "+featureValues.size()+" feature values."); } addInstance(classValue, featureNumbers, featureValues); } /** * Take data read in from an SVM Light format file, and convert it into data * in the format passed from GATE. Then add it as a new training instance * to the training data. * * @param classValue The value of the class attribute in SVM Light format. 
* @param featureNumbers A list of feature numbers, as in SVM Light format * files. * @param featureValues A list of feature values, corresponding to the feature * numbers list, as they appear in an SVM Light format file. */ private void addInstance(String classValue, List featureNumbers, List featureValues) { java.util.List attributeValues = new java.util.ArrayList(); // We might get to the end of the list, and we don't want to get an error, // when we try to access an item that doesn't exist, so add a fake entry to // the end of the list with a feature number so high that we will never // get past it. featureNumbers.add(new Integer(Integer.MAX_VALUE)); // The value that the double is set to is completely arbitrary, so long as // it's not zero. featureValues.add(new Double(1.0)); int indexInFeatureNumbersList=0; // Each GATE attribute might map to many SVM Light attributes, so we need to // keep track of what SVM Light attribute we're up to separately. SVM Light // attributes are numbered from 1. int firstIndexOfSVMLightAttributeForCurrentGATEAttribute=1; for (int attributeIndex=0; attributeIndex<datasetDefinition.getAttributes().size(); ++attributeIndex) { // First make sure that we're at the right place in the list of SVM Light // features by skipping over any features with value zero (that is the // default value, so we don't need to have it specified), and over any // features that have (or should have) already been used, that is ones // that specify values for GATE attributes that we've already processed. while (((Integer)featureNumbers.get(indexInFeatureNumbersList)).intValue() <firstIndexOfSVMLightAttributeForCurrentGATEAttribute || ((Double)featureValues .get(indexInFeatureNumbersList)).doubleValue()==0.0) ++indexInFeatureNumbersList; // If we've got to the class attribute, add it in. 
if (attributeIndex==datasetDefinition.getClassIndex()) attributeValues.add( getGATEClassAttributeValue(attributeIndex, classValue)); else { // If we've got to a regular attribute, add that in. // If the feature is in the list, move on the index in the list of // regular attributes. attributeValues.add(getGATEAttributeValue(attributeIndex, ((Double)featureValues.get(indexInFeatureNumbersList)).doubleValue(), firstIndexOfSVMLightAttributeForCurrentGATEAttribute, ((Integer)featureNumbers.get(indexInFeatureNumbersList)).intValue())); firstIndexOfSVMLightAttributeForCurrentGATEAttribute+= numberOfSVMLightAttributesUsedForGATEAttribute(attributeIndex); } } // Finally actually add the list of attribute values we've just made to the // training data. addTrainingInstance(attributeValues); } /** * Find the value for a GATE class attribute, based on the information read in * from an SVM Light data file. * * Note that we must unweight any attributes that have been weighted here. It * is possible to tell the unweighted value of boolean and nominal attributes * just by checking their signs, which is what is done here. * * @param gateAttributeIndex The index of the attribute in datasetDefinition * @param classValue The value read in for the class value from a file in SVM * Light format. * @return The value of the class attribute in the format used by GATE. */ private String getGATEClassAttributeValue( int gateAttributeIndex, String classValue) { Attribute classAttribute=datasetDefinition.getClassAttribute(); // Class attributes can be either boolean, or two or three valued nominals. 
if (classAttribute.semanticType()==Attribute.BOOLEAN) { if (Double.parseDouble(classValue) > 0.0) return "true"; else if (Double.parseDouble(classValue) < 0.0/classAttribute.getWeighting()) return "false"; else throw new gate.util.GateRuntimeException("Value of class attribute "+ "in a file in SVM Light format is zero, but this is "+ "not allowed because the gate attribute used \nto represent this value"+ " is boolean. The value of the attribute \nis \""+classValue+"\"."); } // Next deal with two value nominals. else if (classAttribute.semanticType()==Attribute.NOMINAL) { if (classAttribute.getValues().size()==2) { if (Double.parseDouble(classValue) > 0.0) return (String)classAttribute.getValues().get(0); else if (Double.parseDouble(classValue) < 0.0) return (String)classAttribute.getValues().get(1); else throw new gate.util.GateRuntimeException("Value of class attribute "+ "in a file in SVM Light format \nis zero, but this is "+ "not allowed because the gate attribute \nused to represent this value"+ " is a two value nominal attribute. \nThe value of the attribute is" +" \""+classValue+"\"."); } // Finally deal with three value nominals, which allow a third value, // signifying that the class value is unknown and that transduction should // be used. else if (classAttribute.getValues().size()==3) { if (Double.parseDouble(classValue) > 0.0/classAttribute.getWeighting()) return (String)classAttribute.getValues().get(0); else if (Double.parseDouble(classValue) < 0.0/classAttribute.getWeighting()) return (String)classAttribute.getValues().get(1); else if (Double.parseDouble(classValue)==0) return (String)classAttribute.getValues().get(2); } } // If we haven't returned by this point, it must be because the class value // is of the wrong type. 
throw new gate.util.GateRuntimeException("The class value specified in "+ " configuration file is not "+ " boolean or a \ntwo or three "+ "valued nominal, as it is "+ "required to be."); } /** * Find the value for a GATE attribute, based on the information read in from * an SVM Light data file. This method unweights any values that were weighted * when the file was created. For boolean and nominal attributes this means * that it just looks at the sign of the attribute, as that is sufficient * for determining the unweighted value of the attribute, which will always * be +1, -1 or 0. * * @param gateAttributeIndex The index of the attribute in datasetDefinition * @param svmFeatureValue The value of the feature in the SVM Light data file * @param firstIndexOfSVMLightAttributeForCurrentGATEAttribute The number of * the first SVM Light parameter that encodes values for the GATE attribute. * @param svmFeatureNumber The number of the SVM feature that might correspond * to the the current GATE attribtue. (The function will check to see if it * does correspond, if not it is ignored.) * @return The value of the attribute in the format used by GATE */ private String getGATEAttributeValue(int gateAttributeIndex, double svmFeatureValue, int firstIndexOfSVMLightAttributeForCurrentGATEAttribute, int svmFeatureNumber) { Attribute gateAttribute= (Attribute)datasetDefinition.getAttributes().get(gateAttributeIndex); // Numeric attributes are straightforward. Each one maps to a single SVM // Light attribute, and their values are unchanged. If they're missing it // just indicates that their value is zero. if (gateAttribute.semanticType()==Attribute.NUMERIC) { if (firstIndexOfSVMLightAttributeForCurrentGATEAttribute ==svmFeatureNumber) return ""+(svmFeatureValue/gateAttribute.getWeighting()); else return "0"; } // Boolean attributes also map to just one SVM Light attribute. We just need // to change their values from +1 or -1 to true or false. 
else if (gateAttribute.semanticType()==Attribute.BOOLEAN) { if (firstIndexOfSVMLightAttributeForCurrentGATEAttribute ==svmFeatureNumber) { if (svmFeatureValue > 0.0) return "true"; if (svmFeatureValue < 0.0) return "false"; // Those are the only allowable values for a boolean attribute, so if we // have got one there's an error in the input data set. throw new gate.util.GateRuntimeException( "Error when loading an SVM Light" + " format file. The feature-valu" + "e pair " + svmFeatureNumber + ":" + svmFeatureValue + " \nshould be boolean" + ", but it's value is zero."); } throw new gate.util.GateRuntimeException( "Error when loading an SVM Light format file. A boolean value is not " +"\nspecified, \nand this is not allowed because boolean values must take" +" \neither +1 or -1 as their \nvalues, and so the default value of 0 can" +" \nnot be assigned to them."); } // Otherwise we must have a nominal attribute. else { // If the current svm light attribute speficies a value for the current // gate attribute. if (svmFeatureNumber >= firstIndexOfSVMLightAttributeForCurrentGATEAttribute && svmFeatureNumber < firstIndexOfSVMLightAttributeForCurrentGATEAttribute+ numberOfSVMLightAttributesUsedForGATEAttribute(gateAttributeIndex)) { // The only valid values for a nominal svm Light feature are 0 or 1 // (or their weighted equivalents), // but as we've already skipped any attributes that have the value 0, // the feature value must be 1, or its weighted equivalent. // The value of the nominal attribute will correspond to which slot it // fills, i.e. which svm light feature is set to 1. return (String)gateAttribute.getValues().get( svmFeatureNumber-firstIndexOfSVMLightAttributeForCurrentGATEAttribute); } // If the attribute doesn't specify a value for the gate attribute (in // which case is should be a value for a later attribute) then the value // of the current attribute must be unspecified. 
else { return ""; } } } /** * Return the number of SVM light attributes that are used to represent a * GATE attribute. * @param gateAttributeIndex The index in the dataset definition of the GATE * attribute * @return The number of SVM light attributes used to represent the GATE * attribute. */ private int numberOfSVMLightAttributesUsedForGATEAttribute( int gateAttributeIndex) { Attribute gateAttribute= (Attribute)datasetDefinition.getAttributes().get(gateAttributeIndex); // Each boolean or numeric gateAttribute just maps to a single SVM Light // attribute. if (gateAttribute.semanticType()==Attribute.BOOLEAN || gateAttribute.semanticType()==Attribute.NUMERIC) return 1; // If we get here, we must have a nominal attribute, for which each possible // value is represented by a different SVM Light attribute. return gateAttribute.getValues().size(); } /** * This allows the model, including its parameters to be saved to a file. */ protected class SaveModelAction extends javax.swing.AbstractAction { public SaveModelAction() { super("Save model"); putValue(SHORT_DESCRIPTION, "Saves the ML model to a file"); } /** * This function will open a file chooser, and then call the save function * to actually save the model. (It is not normally called directly by the * user, but will be called as the result of the save model menu option * being selected.) 
*/ public void actionPerformed(java.awt.event.ActionEvent evt) { Runnable runnable = new Runnable() { public void run() { javax.swing.JFileChooser fileChooser = gate.gui.MainFrame.getFileChooser(); fileChooser.setFileFilter(fileChooser.getAcceptAllFileFilter()); fileChooser.setFileSelectionMode(javax.swing.JFileChooser.FILES_ONLY); fileChooser.setMultiSelectionEnabled(false); if (fileChooser.showSaveDialog(null) == javax.swing.JFileChooser.APPROVE_OPTION) { java.io.File file = fileChooser.getSelectedFile(); try { gate.gui.MainFrame.lockGUI("Saving ML model..."); saveModel(file); } catch (java.io.IOException ioe) { javax.swing.JOptionPane.showMessageDialog(MainFrame.getInstance(), "Error!\n" + ioe.toString(), "Gate", javax.swing.JOptionPane.ERROR_MESSAGE); ioe.printStackTrace(gate.util.Err.getPrintWriter()); } finally { gate.gui.MainFrame.unlockGUI(); } } } }; Thread thread = new Thread(runnable, "ModelSaver(serialisation)"); thread.setPriority(Thread.MIN_PRIORITY); thread.start(); } } /** * This reloads a file that was previously saved using the SaveModelAction * class. */ protected class LoadModelAction extends javax.swing.AbstractAction { public LoadModelAction() { super("Load model"); putValue(SHORT_DESCRIPTION, "Loads a ML model from a file"); } /** * This function will open a file chooser, and then call the load function * to actually load the model. (It is not normally called directly by the * user, but will be called as the result of the load model menu option * being selected.) 
*/ public void actionPerformed(java.awt.event.ActionEvent evt) { Runnable runnable = new Runnable() { public void run() { javax.swing.JFileChooser fileChooser = gate.gui.MainFrame.getFileChooser(); fileChooser.setFileFilter(fileChooser.getAcceptAllFileFilter()); fileChooser.setFileSelectionMode(javax.swing.JFileChooser.FILES_ONLY); fileChooser.setMultiSelectionEnabled(false); if (fileChooser.showOpenDialog(null) == javax.swing.JFileChooser.APPROVE_OPTION) { java.io.File file = fileChooser.getSelectedFile(); try { gate.gui.MainFrame.lockGUI("Loading model..."); loadModel(file); } catch (java.io.IOException ioe) { javax.swing.JOptionPane.showMessageDialog(MainFrame.getInstance(), "Error!\n" + ioe.toString(), "Gate", javax.swing.JOptionPane.ERROR_MESSAGE); ioe.printStackTrace(gate.util.Err.getPrintWriter()); } finally { gate.gui.MainFrame.unlockGUI(); } } } }; Thread thread = new Thread(runnable, "ModelLoader(serialisation)"); thread.setPriority(Thread.MIN_PRIORITY); thread.start(); } } /** * This copies a model file saved on disk to the temp file used by the * wrapper to store models. This effectively 'loads' the model. * * @param filename The name of the file to copy. */ private void copyModelToTempFile(java.lang.String filename) throws java.io.IOException { java.io.File source = new java.io.File(filename); copyFile(source, modelFile); } /** * This copies a model file from the temp file used by the wrapper to store * models. This effectively 'saves' the model. * * @param filename The name of the destination file for the copy. */ private void copyModelFromTempFile(java.lang.String filename) throws java.io.IOException { java.io.File destination = new java.io.File(filename); copyFile(modelFile, destination); } /** * Copy a file on disk from one location to another. N.B. This is implemented * by loading and then saving the file, as otherwise a system dependent * external command would have to be used. * * @param source The file to copy. 
* @param destination The location of the file to be copied. */ private void copyFile(java.io.File source, java.io.File destination) throws java.io.IOException { if (!destination.exists()) destination.createNewFile(); java.io.BufferedInputStream inputFromFile = new java.io.BufferedInputStream(new java.io.FileInputStream(source)); java.io.BufferedOutputStream outputToFile = new java.io.BufferedOutputStream(new java.io.FileOutputStream(destination)); int i = inputFromFile.read(); while (i != -1) { outputToFile.write(i); i = inputFromFile.read(); } outputToFile.close(); inputFromFile.close(); } /** * Loads the state of this engine from previously saved data. * @param An open InputStream from which the model will be loaded. */ public void load(java.io.InputStream is) throws java.io.IOException { if (sListener != null) { sListener.statusChanged("Loading java part of model..."); } java.io.ObjectInputStream ois = new java.io.ObjectInputStream(is); try { // svmLightModel = ois.readInt(); trainingData = (java.util.List) ois.readObject(); datasetDefinition = (DatasetDefintion) ois.readObject(); datasetChanged = ois.readBoolean(); modelTrained = ois.readBoolean(); classifierOptions = (String) ois.readObject(); } catch (ClassNotFoundException cnfe) { throw new gate.util.GateRuntimeException(cnfe.toString()); } ois.close(); if (sListener != null) { sListener.statusChanged(""); } } /** * Saves the state of the engine for reuse at a later time. optionsElement is * not saved so as to make this code consistent with wekaWrapper. * @param An open output stream to which the model will be saved. 
*/
  public void save(java.io.OutputStream os) throws java.io.IOException {
    if (sListener != null) {
      sListener.statusChanged("Saving java part of model...");
    }

    java.io.ObjectOutputStream oos = null;
    try {
      oos = new java.io.ObjectOutputStream(os);

      // The order of the items written here must match the order in which
      // load() reads them back.
      //oos.writeInt(svmLightModel);
      oos.writeObject(trainingData);
      oos.writeObject(datasetDefinition);
      oos.writeBoolean(datasetChanged);
      oos.writeBoolean(modelTrained);
      oos.writeObject(classifierOptions);
      oos.flush();
    }
    finally {
      try {
        // Always release the stream. If the object stream could not even be
        // created, close the underlying stream directly instead.
        if (oos != null)
          oos.close();
        else
          os.close();
      }
      finally {
        // Clear the status message on failure as well as on success.
        if (sListener != null) {
          sListener.statusChanged("");
        }
      }
    }
  }

  /**
   * Load a previously saved state of the engine. If the saved state includes
   * an up-to-date trained model, this is also reloaded.
   *
   * @param file the file from which the state is to be loaded. If the state
   * indicates that a trained model should be loaded, this should be in
   * <i>file</i>.NativePart.
   * @throws IOException if either file can not be read.
   */
  public void loadModel(File file) throws IOException {
    java.io.FileInputStream fileIn = new java.io.FileInputStream(file);
    try {
      load(new java.util.zip.GZIPInputStream(fileIn));
    }
    finally {
      // Make sure the file handle is released even if the GZIP header could
      // not be read or load() failed. Closing an already closed stream is
      // harmless.
      fileIn.close();
    }

    // If an up to date svm_model was saved when the model was saved,
    // load that back in too, from a separate file,
    // with the same name but with .NativePart appended. As we want
    // to end up with the model in a file, we don't need to actually
    // load it, instead we just copy the file to the modelFile.
    if (!datasetChanged && modelTrained) {
      copyModelToTempFile(file.getAbsolutePath() + ".NativePart");
    }
  }

  /**
   * Saves the state of the engine for reuse at a later time. optionsElement is
   * not saved so as to make this code consistent with wekaWrapper. If an
   * up-to-date trained model exists, it will be saved in
   * <i>file</i>.NativePart.
   *
   * @param file the file to which the state is to be saved. Any existing
   * contents are overwritten.
   * @throws IOException if either file can not be written.
   */
  public void saveModel(File file) throws IOException {
    java.io.FileOutputStream fileOut =
        new java.io.FileOutputStream(file.getCanonicalPath(), false);
    try {
      save(new java.util.zip.GZIPOutputStream(fileOut));
    }
    finally {
      // Make sure the file handle is released even if the GZIP stream could
      // not be created or save() failed. Closing an already closed stream is
      // harmless.
      fileOut.close();
    }

    // If we've got an up to date model trained, then save that too,
    // in a file with the same name but with .NativePart appended.
    // N.B. As models are always stored on the disk anyway, this
    // really just involves copying a file to the location given
    // by the user.
    if (!datasetChanged && modelTrained)
      copyModelFromTempFile(file.getCanonicalPath()+".NativePart");
  }

  /**
   * Has the dataset changed since the model was last trained?
   */
  public boolean isDatasetChanged() {
    return datasetChanged;
  }

  /**
   * Is there a trained model available (whether or not it is up to date)?
   */
  public boolean isModelTrained() {
    return modelTrained;
  }

  public boolean supportsBatchMode(){
    return true;
  }

  protected java.util.HashMap nominalValue2IntegerHash;

  protected gate.creole.ml.DatasetDefintion datasetDefinition;

  /**
   * This List stores all the data that has been collected. Each item is a
   * List of objects, each of which is an attribute (and one of which is the
   * class attribute).
   */
  protected List trainingData;

  /**
   * The JDom element containing the options for this wrapper.
   */
  protected org.jdom.Element optionsElement;

  /**
   * Marks whether the dataset was changed since the last time the classifier
   * was built.
   */
  protected boolean datasetChanged = false;

  /**
   * Marks whether in the present state a trained model exists (whether or not
   * it is up to date)
   */
  protected boolean modelTrained = false;

  /**
   * These file objects store the path names to the files that will be used
   * to store the model, data and results while they are passed to and from
   * svm light.
   */
  protected java.io.File trainingDataFile;
  protected java.io.File testDataFile;
  protected java.io.File modelFile;
  protected java.io.File resultsFile;

  /**
   * This list stores the actions that will be available on the context menu
   * in the GUI.
   */
  protected List actionsList;

  protected gate.ProcessingResource owner;

  protected gate.event.StatusListener sListener;

  /**
   * The following parameter is set by the <CLASSIFIER-OPTIONS> element
   * of the config file, within the <OPTIONS> element. It specifies the
   * options in the same format as is required when they are specified on the
   * command line.
   */
  java.lang.String classifierOptions;

} // SVMLightWrapper