/* * WekaLearning.java * * Yaoyong Li 22/03/2007 * * $Id: WekaLearning.java, v 1.0 2007-03-22 12:58:16 +0000 yaoyong $ */ package gate.learning.learners.weka; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import weka.core.Attribute; import weka.core.FastVector; import weka.core.Instance; import weka.core.Instances; import weka.core.SparseInstance; import gate.learning.ConstantParameters; import gate.learning.LabelsOfFV; import gate.learning.LabelsOfFeatureVectorDoc; import gate.learning.NLPFeaturesList; import gate.learning.SparseFeatureVector; import gate.learning.learners.MultiClassLearning; /** * The interface between the Weka learner and the data defined in the ML Api, * which convert the data into the format a Weka learner can use. */ public class WekaLearning { /** The data in the Weka object for training or application. */ public Instances instancesData; /** The labels in the form of instances of every doc. */ public LabelsOfFeatureVectorDoc[] labelsFVDoc = null; /** For using the feature vector data. */ public final static short SPARSEFVDATA = 2; /** For using the NLP feature data. */ public final static short NLPFEATUREFVDATA = 1; /** Learn a model and save it into the model file. */ public void train(WekaLearner wekaCl, File modelFile) { // Training. wekaCl.training(instancesData); // Write the learner class into the modelfile by class serialisation try { if(modelFile.exists()) { deleteRecursively(modelFile); } FileOutputStream modelOutFile = new FileOutputStream(modelFile); ObjectOutputStream modelOutputObjectFile = new ObjectOutputStream( modelOutFile); modelOutputObjectFile.writeObject(wekaCl); modelOutputObjectFile.flush(); modelOutputObjectFile.close(); } catch(FileNotFoundException e) { e.printStackTrace(); } catch(IOException e) { e.printStackTrace(); } } /** * Recursively delete a file or directory (think "rm -rf file"). */ private void deleteRecursively(File file) throws IOException { if(!file.exists()) { return; } if(file.isDirectory()) { for(File f : file.listFiles()) { deleteRecursively(f); } } if(!file.delete()) { throw new IOException("Couldn't delete file " + file); } } /** Read the model from the file and apply it to the data. */ public void apply(WekaLearner wekaCl, File modelFile, boolean distributionOutput) { // Read the learner class from the modelfile by class serialisation try { FileInputStream modelInFile = new FileInputStream(modelFile); ObjectInputStream modelInputObjectFile = new ObjectInputStream( modelInFile); wekaCl = (WekaLearner)modelInputObjectFile.readObject(); modelInputObjectFile.close(); } catch(FileNotFoundException e) { e.printStackTrace(); } catch(IOException e) { e.printStackTrace(); } catch(ClassNotFoundException e) { e.printStackTrace(); } // Apply the model to the data. wekaCl.applying(instancesData, labelsFVDoc, distributionOutput); } /** * Read the sparse feature vector data from the data file and convert it into * the Weka's instance format. */ public void readSparseFVsFromFile(File dataFile, int numDocs, boolean trainingMode, int numLabels, boolean surroundMode) { int numFeats = 0; int numClasses = 0; labelsFVDoc = new LabelsOfFeatureVectorDoc[numDocs]; // Read the sparse FVs by using the method in MultiClassLearning class MultiClassLearning multiClassL = new MultiClassLearning(); boolean isUsingDataFile = false; File tempFVDataFile = null; multiClassL.getDataFromFile(numDocs, dataFile, isUsingDataFile, tempFVDataFile); // Create the attributes. numFeats = multiClassL.dataFVinDoc.getTotalNumFeatures(); FastVector attributes = new FastVector(numFeats + 1); for(int i = 0; i < numFeats; ++i) attributes.addElement(new Attribute(new Integer(i + 1).toString())); // Add class attribute. if(surroundMode) numClasses = 2 * numLabels + 1; // count the null too, as value -1. else numClasses = numLabels + 1; FastVector classValues = new FastVector(numClasses); classValues.addElement("-1"); // The first class for null class for(int i = 1; i < numClasses; ++i) classValues.addElement(new Integer(i).toString()); attributes.addElement(new Attribute("Class", classValues)); // Create the dataset with capacity of all FVs (but actuall number of FVs // mabe be larger than the pre-specified, because possible multi-label) and // set index of class instancesData = new Instances("SparseFVsData", attributes, multiClassL.dataFVinDoc.getNumTraining()); instancesData.setClassIndex(instancesData.numAttributes() - 1); // Copy the data into the instance; for(int iDoc = 0; iDoc < multiClassL.dataFVinDoc.getNumTrainingDocs(); ++iDoc) { SparseFeatureVector[] fvs = multiClassL.dataFVinDoc.trainingFVinDoc[iDoc] .getFvs(); labelsFVDoc[iDoc] = new LabelsOfFeatureVectorDoc(); labelsFVDoc[iDoc].multiLabels = multiClassL.dataFVinDoc.labelsFVDoc[iDoc].multiLabels; for(int i = 0; i < fvs.length; ++i) { // Object valueO = fvs[i].getValues(); double[] values = new double[fvs[i].getLen()]; int[] indexes = new int[fvs[i].getLen()]; for(int j = 0; j < fvs[i].getLen(); ++j) { //values[j] = (double)fvs[i].values[j]; values[j] = fvs[i].nodes[j].value; indexes[j] = fvs[i].nodes[j].index; } SparseInstance inst = new SparseInstance(1.0, values, indexes, 50000); inst.setDataset(instancesData); if(trainingMode && labelsFVDoc[iDoc].multiLabels[i].num > 0) for(int j1 = 0; j1 < labelsFVDoc[iDoc].multiLabels[i].num; ++j1) { inst.setClassValue((labelsFVDoc[iDoc].multiLabels[i].labels[j1])); // label // >0 instancesData.add(inst); } else { inst.setClassValue("-1"); // set label as -1 for null instancesData.add(inst); } } } return; } /** * Read the NLP feature data from the data file and convert it into the Weka's * instance format. */ public void readNLPFeaturesFromFile(File dataFile, int numDocs, NLPFeaturesList nlpFeatList, boolean trainingMode, int numLabels, boolean surroundMode) { labelsFVDoc = new LabelsOfFeatureVectorDoc[numDocs]; try { BufferedReader inData; inData = new BufferedReader(new InputStreamReader(new FileInputStream( dataFile), "UTF-8")); // Get the number of attributes in the data String[] items = inData.readLine() .split(ConstantParameters.ITEMSEPARATOR); HashMap metaFeats = new HashMap(); int numFeats = 0; // Create an attribute for each meta feature HashMap entityToPosition = new HashMap(); String entityTerm = ""; int numEntity = 0; // Not include the class attribute for(int i = 1; i < items.length; ++i) { // Assume the name of NGRAM should end with "gram"!! if(!items[i].endsWith("gram")) { if(!metaFeats.containsKey(items[i])) { metaFeats.put(items[i], new HashSet()); ++numFeats; // counted as a new attribute } String feat = items[i].substring(0, items[i].lastIndexOf("(")); String featNum = items[i].substring(items[i].lastIndexOf("(")); if(!feat.equals(entityTerm)) { numEntity = 0; entityTerm = feat; } else ++numEntity; entityToPosition.put(feat + "_" + numEntity, featNum); if(!metaFeats.containsKey(feat)) { metaFeats.put(feat, new HashSet()); // just for collect the terms } } } List allTerms = new ArrayList(nlpFeatList.featuresList.keySet()); Collections.sort(allTerms); for(int i = 0; i < allTerms.size(); ++i) { String feat = allTerms.get(i).toString(); if(isNgramFeat(feat)) { ++numFeats; } else { feat = feat.substring(feat.indexOf("_") + 1); // Name of the entity String feat1 = feat.substring(0, feat.indexOf("_")); // Term itself String feat2 = feat.substring(feat.indexOf("_") + 1); ((HashSet)metaFeats.get(feat1)).add(feat2); } } numFeats += 1; // include the class feature // Create the attributes. HashMap featToAttr = new HashMap(); // feat to attribute index FastVector attributes = new FastVector(numFeats); // First for the meta feature attribute. List metaFeatTerms = new ArrayList(metaFeats.keySet()); int numMetaFeats = 0; for(int i = 0; i < metaFeatTerms.size(); ++i) { String featName = metaFeatTerms.get(i).toString(); if(featName.endsWith(")")) { String featName0 = featName.substring(0, featName.lastIndexOf("(")); HashSet metaF = (HashSet)metaFeats.get(featName0); FastVector featFV = new FastVector(metaF.size()); for(Object obj : metaF) featFV.addElement(obj.toString()); attributes.addElement(new Attribute(featName, featFV)); featToAttr.put(featName, new Integer(numMetaFeats)); ++numMetaFeats; } } // Then the terms from ngram features for(int i = 0; i < allTerms.size(); ++i) { String feat = allTerms.get(i).toString(); if(isNgramFeat(feat)) { FastVector featFV = new FastVector(1); featFV.addElement(feat); attributes.addElement(new Attribute(feat, featFV));// Nominal form featToAttr.put(feat, new Integer(i + numMetaFeats)); } } // Add class attribute. int numClasses; if(surroundMode) numClasses = 2 * numLabels + 1; // count the null too, as value -1. else numClasses = numLabels + 1; FastVector classValues = new FastVector(numClasses); classValues.addElement("-1"); // The first class for null class for(int i = 1; i < numClasses; ++i) classValues.addElement(new Integer(i).toString()); attributes.addElement(new Attribute("Class", classValues)); // Create the dataset with capacity of all FVs, and set index of class instancesData = new Instances("NLPFeatureData", attributes, numDocs * 10); // The first attribute is for class. instancesData.setClassIndex(attributes.size() - 1); // Read data from file and copy the data into the instance; for(int iDoc = 0; iDoc < numDocs; ++iDoc) { // For each document items = inData.readLine().split(ConstantParameters.ITEMSEPARATOR); // The third item is for number of instances in the doc. int num = Integer.parseInt(items[2]); labelsFVDoc[iDoc] = new LabelsOfFeatureVectorDoc(); labelsFVDoc[iDoc].multiLabels = new LabelsOfFV[num]; for(int i = 0; i < num; ++i) { // For each instance items = inData.readLine().split(ConstantParameters.ITEMSEPARATOR); Instance inst = new Instance(numFeats); inst.setDataset(instancesData); int numLabel = Integer.parseInt(items[0]); // number of labels for // the instance entityTerm = ""; numEntity = 0; // For each NLP feature term for(int j = numLabel + 1; j < items.length; ++j) { // Skip the feature if it is not in the list if(!allTerms.contains(items[j])) continue; if(isNgramFeat(items[j])) {// if it's a ngram items[j] = items[j].substring(0, items[j] .lastIndexOf(NLPFeaturesList.SYMBOLNGARM)); inst.setValue(Integer.parseInt(featToAttr.get(items[j]) .toString()), items[j]); } else {// if not a ngram // For real features, not "_NA" if(!items[j].equals(ConstantParameters.NAMENONFEATURE)) { // Get the feature term items[j] = items[j].substring(items[j].indexOf("_") + 1); // Entity name String feat1 = items[j].substring(0, items[j].indexOf("_")); // Feature name String feat2 = items[j].substring(items[j].indexOf("_") + 1); if(!feat1.equals(entityTerm)) { numEntity = 0; entityTerm = feat1; } else ++numEntity; feat1 = feat1 + entityToPosition.get(feat1 + "_" + numEntity).toString(); inst.setValue(Integer .parseInt(featToAttr.get(feat1).toString()), feat2); } } } if(trainingMode && numLabel > 0) { labelsFVDoc[iDoc].multiLabels[i] = new LabelsOfFV(numLabel); for(int j = 1; j <= numLabel; ++j) { inst.setClassValue(items[j]); instancesData.add(inst); } } else { labelsFVDoc[iDoc].multiLabels[i] = new LabelsOfFV(0); inst.setClassValue("-1"); // set as null class instancesData.add(inst); } }// end of the loop i } inData.close(); } catch(FileNotFoundException e) { e.printStackTrace(); } catch(IOException e) { e.printStackTrace(); } return; } /** Check if the item is a n-gram or not. */ private boolean isNgramFeat(String item) { if(item.contains(NLPFeaturesList.SYMBOLNGARM)) return true; else return false; } /** * Determining a learner from Weka using the NLP. feature data or the feature * vector data. */ public static short obtainWekaLeanerDataType(String learnerName) { if(learnerName.contains("C4.5") || learnerName.contains("NaiveBayes")) { return NLPFEATUREFVDATA; } else { return SPARSEFVDATA; } } /** Obtaining the Weka learners. */ public static WekaLearner obtainWekaLearner(String learnerName, String learningOpts) { WekaLearner wekaL = null; if(learnerName.contains("KNN")) { if(learningOpts != null) { wekaL = new KNNIBK(learningOpts); } else wekaL = new KNNIBK(); } else if(learnerName.contains("NaiveBayes")) { wekaL = new NaiveBayesC(); } else if(learnerName.contains("C4.5")) { wekaL = new C45(); } if(learningOpts != null) wekaL.getParametersFromOptionsLine(learningOpts); return wekaL; } /** Determing the output type of a Weka learner. */ public static boolean obtainWekaLearnerOutputType(String learnerName) { /* * if(learnerName.contains("KNN")) { return true; } else * if(learnerName.contains("NaiveBayes")) { return true; } else * if(learnerName.contains("C45")) { return true; } */ return true; // return false; } }