GATE.ac.uk - releases/gate-5.1-beta2-build3402-ALL/plugins/Learning/src/gate/learning/learners/weka/WekaLearning.java

/*
 *  WekaLearning.java
 * 
 *  Yaoyong Li 22/03/2007
 *
 *  $Id: WekaLearning.java, v 1.0 2007-03-22 12:58:16 +0000 yaoyong $
 */
package gate.learning.learners.weka;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.SparseInstance;
import gate.learning.ConstantParameters;
import gate.learning.LabelsOfFV;
import gate.learning.LabelsOfFeatureVectorDoc;
import gate.learning.NLPFeaturesList;
import gate.learning.SparseFeatureVector;
import gate.learning.learners.MultiClassLearning;

/**
 * The interface between the Weka learner and the data defined in the ML Api,
 * which convert the data into the format a Weka learner can use.
 */
public class WekaLearning {
  /** The data in the Weka object for training or application. */
  public Instances instancesData;
  /** The labels in the form of instances of every doc. */
  public LabelsOfFeatureVectorDoc[] labelsFVDoc = null;
  /** For using the feature vector data. */
  public final static short SPARSEFVDATA = 2;
  /** For using the NLP feature data. */
  public final static short NLPFEATUREFVDATA = 1;

  /** Learn a model and save it into the model file. */
  public void train(WekaLearner wekaCl, File modelFile) {
    // Training.
    wekaCl.training(instancesData);
    // Write the learner class into the modelfile by class serialisation
    try {
      if(modelFile.exists()) {
        deleteRecursively(modelFile);
      }
      FileOutputStream modelOutFile = new FileOutputStream(modelFile);
      ObjectOutputStream modelOutputObjectFile = new ObjectOutputStream(
        modelOutFile);
      modelOutputObjectFile.writeObject(wekaCl);
      modelOutputObjectFile.flush();
      modelOutputObjectFile.close();
    } catch(FileNotFoundException e) {
      e.printStackTrace();
    } catch(IOException e) {
      e.printStackTrace();
    }
  }

  /**
   * Recursively delete a file or directory (think "rm -rf file").
   */
  private void deleteRecursively(File file) throws IOException {
    if(!file.exists()) { return; }
    if(file.isDirectory()) {
      for(File f : file.listFiles()) {
        deleteRecursively(f);
      }
    }
    if(!file.delete()) { throw new IOException("Couldn't delete file " + file); }
  }

  /** Read the model from the file and apply it to the data. */
  public void apply(WekaLearner wekaCl, File modelFile,
    boolean distributionOutput) {
    // Read the learner class from the modelfile by class serialisation
    try {
      FileInputStream modelInFile = new FileInputStream(modelFile);
      ObjectInputStream modelInputObjectFile = new ObjectInputStream(
        modelInFile);
      wekaCl = (WekaLearner)modelInputObjectFile.readObject();
      modelInputObjectFile.close();
    } catch(FileNotFoundException e) {
      e.printStackTrace();
    } catch(IOException e) {
      e.printStackTrace();
    } catch(ClassNotFoundException e) {
      e.printStackTrace();
    }
    // Apply the model to the data.
    wekaCl.applying(instancesData, labelsFVDoc, distributionOutput);
  }

  /**
   * Read the sparse feature vector data from the data file and convert it into
   * the Weka's instance format.
   */
  public void readSparseFVsFromFile(File dataFile, int numDocs,
    boolean trainingMode, int numLabels, boolean surroundMode) {
    int numFeats = 0;
    int numClasses = 0;
    labelsFVDoc = new LabelsOfFeatureVectorDoc[numDocs];
    // Read the sparse FVs by using the method in MultiClassLearning class
    MultiClassLearning multiClassL = new MultiClassLearning();
    boolean isUsingDataFile = false;
    File tempFVDataFile = null;
    multiClassL.getDataFromFile(numDocs, dataFile, isUsingDataFile, tempFVDataFile);
    // Create the attributes.
    numFeats = multiClassL.dataFVinDoc.getTotalNumFeatures();
    FastVector attributes = new FastVector(numFeats + 1);
    for(int i = 0; i < numFeats; ++i)
      attributes.addElement(new Attribute(new Integer(i + 1).toString()));
    // Add class attribute.
    if(surroundMode)
      numClasses = 2 * numLabels + 1; // count the null too, as value -1.
    else numClasses = numLabels + 1;
    FastVector classValues = new FastVector(numClasses);
    classValues.addElement("-1"); // The first class for null class
    for(int i = 1; i < numClasses; ++i)
      classValues.addElement(new Integer(i).toString());
    attributes.addElement(new Attribute("Class", classValues));
    // Create the dataset with capacity of all FVs (but actuall number of FVs
    // mabe be larger than the pre-specified, because possible multi-label) and
    // set index of class
    instancesData = new Instances("SparseFVsData", attributes,
      multiClassL.dataFVinDoc.getNumTraining());
    instancesData.setClassIndex(instancesData.numAttributes() - 1);
    // Copy the data into the instance;
    for(int iDoc = 0; iDoc < multiClassL.dataFVinDoc.getNumTrainingDocs(); ++iDoc) {
      SparseFeatureVector[] fvs = multiClassL.dataFVinDoc.trainingFVinDoc[iDoc]
        .getFvs();
      labelsFVDoc[iDoc] = new LabelsOfFeatureVectorDoc();
      labelsFVDoc[iDoc].multiLabels = multiClassL.dataFVinDoc.labelsFVDoc[iDoc].multiLabels;
      for(int i = 0; i < fvs.length; ++i) {
        // Object valueO = fvs[i].getValues();
        double[] values = new double[fvs[i].getLen()];
        int[] indexes = new int[fvs[i].getLen()];
        for(int j = 0; j < fvs[i].getLen(); ++j) {
          //values[j] = (double)fvs[i].values[j];
          values[j] = fvs[i].nodes[j].value;
          indexes[j] = fvs[i].nodes[j].index;
        }
        SparseInstance inst = new SparseInstance(1.0, values, indexes, 50000);
        inst.setDataset(instancesData);
        if(trainingMode && labelsFVDoc[iDoc].multiLabels[i].num > 0)
          for(int j1 = 0; j1 < labelsFVDoc[iDoc].multiLabels[i].num; ++j1) {
            inst.setClassValue((labelsFVDoc[iDoc].multiLabels[i].labels[j1])); // label
            // >0
            instancesData.add(inst);
          }
        else {
          inst.setClassValue("-1"); // set label as -1 for null
          instancesData.add(inst);
        }
      }
    }
    return;
  }

  /**
   * Read the NLP feature data from the data file and convert it into the Weka's
   * instance format.
   */
  public void readNLPFeaturesFromFile(File dataFile, int numDocs,
    NLPFeaturesList nlpFeatList, boolean trainingMode, int numLabels,
    boolean surroundMode) {
    labelsFVDoc = new LabelsOfFeatureVectorDoc[numDocs];
    try {
      BufferedReader inData;
      inData = new BufferedReader(new InputStreamReader(new FileInputStream(
        dataFile), "UTF-8"));
      // Get the number of attributes in the data
      String[] items = inData.readLine()
        .split(ConstantParameters.ITEMSEPARATOR);
      HashMap metaFeats = new HashMap();
      int numFeats = 0;
      // Create an attribute for each meta feature
      HashMap entityToPosition = new HashMap();
      String entityTerm = "";
      int numEntity = 0;
      // Not include the class attribute
      for(int i = 1; i < items.length; ++i) {
        // Assume the name of NGRAM should end with "gram"!!
        if(!items[i].endsWith("gram")) {
          if(!metaFeats.containsKey(items[i])) {
            metaFeats.put(items[i], new HashSet());
            ++numFeats; // counted as a new attribute
          }
          String feat = items[i].substring(0, items[i].lastIndexOf("("));
          String featNum = items[i].substring(items[i].lastIndexOf("("));
          if(!feat.equals(entityTerm)) {
            numEntity = 0;
            entityTerm = feat;
          } else ++numEntity;
          entityToPosition.put(feat + "_" + numEntity, featNum);
          if(!metaFeats.containsKey(feat)) {
            metaFeats.put(feat, new HashSet());
            // just for collect the terms
          }
        }
      }
      List allTerms = new ArrayList(nlpFeatList.featuresList.keySet());
      Collections.sort(allTerms);
      for(int i = 0; i < allTerms.size(); ++i) {
        String feat = allTerms.get(i).toString();
        if(isNgramFeat(feat)) {
          ++numFeats;
        } else {
          feat = feat.substring(feat.indexOf("_") + 1);
          // Name of the entity
          String feat1 = feat.substring(0, feat.indexOf("_"));
          // Term itself
          String feat2 = feat.substring(feat.indexOf("_") + 1);
          ((HashSet)metaFeats.get(feat1)).add(feat2);
        }
      }
      numFeats += 1; // include the class feature
      // Create the attributes.
      HashMap featToAttr = new HashMap(); // feat to attribute index
      FastVector attributes = new FastVector(numFeats);
      // First for the meta feature attribute.
      List metaFeatTerms = new ArrayList(metaFeats.keySet());
      int numMetaFeats = 0;
      for(int i = 0; i < metaFeatTerms.size(); ++i) {
        String featName = metaFeatTerms.get(i).toString();
        if(featName.endsWith(")")) {
          String featName0 = featName.substring(0, featName.lastIndexOf("("));
          HashSet metaF = (HashSet)metaFeats.get(featName0);
          FastVector featFV = new FastVector(metaF.size());
          for(Object obj : metaF)
            featFV.addElement(obj.toString());
          attributes.addElement(new Attribute(featName, featFV));
          featToAttr.put(featName, new Integer(numMetaFeats));
          ++numMetaFeats;
        }
      }
      // Then the terms from ngram features
      for(int i = 0; i < allTerms.size(); ++i) {
        String feat = allTerms.get(i).toString();
        if(isNgramFeat(feat)) {
          FastVector featFV = new FastVector(1);
          featFV.addElement(feat);
          attributes.addElement(new Attribute(feat, featFV));// Nominal form
          featToAttr.put(feat, new Integer(i + numMetaFeats));
        }
      }
      // Add class attribute.
      int numClasses;
      if(surroundMode)
        numClasses = 2 * numLabels + 1; // count the null too, as value -1.
      else numClasses = numLabels + 1;
      FastVector classValues = new FastVector(numClasses);
      classValues.addElement("-1"); // The first class for null class
      for(int i = 1; i < numClasses; ++i)
        classValues.addElement(new Integer(i).toString());
      attributes.addElement(new Attribute("Class", classValues));
      // Create the dataset with capacity of all FVs, and set index of class
      instancesData = new Instances("NLPFeatureData", attributes, numDocs * 10);
      // The first attribute is for class.
      instancesData.setClassIndex(attributes.size() - 1);
      // Read data from file and copy the data into the instance;
      for(int iDoc = 0; iDoc < numDocs; ++iDoc) { // For each document
        items = inData.readLine().split(ConstantParameters.ITEMSEPARATOR);
        // The third item is for number of instances in the doc.
        int num = Integer.parseInt(items[2]);
        labelsFVDoc[iDoc] = new LabelsOfFeatureVectorDoc();
        labelsFVDoc[iDoc].multiLabels = new LabelsOfFV[num];
        for(int i = 0; i < num; ++i) { // For each instance
          items = inData.readLine().split(ConstantParameters.ITEMSEPARATOR);
          Instance inst = new Instance(numFeats);
          inst.setDataset(instancesData);
          int numLabel = Integer.parseInt(items[0]); // number of labels for
          // the instance
          entityTerm = "";
          numEntity = 0;
          // For each NLP feature term
          for(int j = numLabel + 1; j < items.length; ++j) {
            // Skip the feature if it is not in the list
            if(!allTerms.contains(items[j])) continue;
            if(isNgramFeat(items[j])) {// if it's a ngram
              items[j] = items[j].substring(0, items[j]
                .lastIndexOf(NLPFeaturesList.SYMBOLNGARM));
              inst.setValue(Integer.parseInt(featToAttr.get(items[j])
                .toString()), items[j]);
            } else {// if not a ngram
              // For real features, not "_NA"
              if(!items[j].equals(ConstantParameters.NAMENONFEATURE)) {
                // Get the feature term
                items[j] = items[j].substring(items[j].indexOf("_") + 1);
                // Entity name
                String feat1 = items[j].substring(0, items[j].indexOf("_"));
                // Feature name
                String feat2 = items[j].substring(items[j].indexOf("_") + 1);
                if(!feat1.equals(entityTerm)) {
                  numEntity = 0;
                  entityTerm = feat1;
                } else ++numEntity;
                feat1 = feat1
                  + entityToPosition.get(feat1 + "_" + numEntity).toString();
                inst.setValue(Integer
                  .parseInt(featToAttr.get(feat1).toString()), feat2);
              }
            }
          }
          if(trainingMode && numLabel > 0) {
            labelsFVDoc[iDoc].multiLabels[i] = new LabelsOfFV(numLabel);
            for(int j = 1; j <= numLabel; ++j) {
              inst.setClassValue(items[j]);
              instancesData.add(inst);
            }
          } else {
            labelsFVDoc[iDoc].multiLabels[i] = new LabelsOfFV(0);
            inst.setClassValue("-1"); // set as null class
            instancesData.add(inst);
          }
        }// end of the loop i
      }
      inData.close();
    } catch(FileNotFoundException e) {
      e.printStackTrace();
    } catch(IOException e) {
      e.printStackTrace();
    }
    return;
  }

  /** Check if the item is a n-gram or not. */
  private boolean isNgramFeat(String item) {
    if(item.contains(NLPFeaturesList.SYMBOLNGARM))
      return true;
    else return false;
  }

  /**
   * Determining a learner from Weka using the NLP. feature data or the feature
   * vector data.
   */
  public static short obtainWekaLeanerDataType(String learnerName) {
    if(learnerName.contains("C4.5") || learnerName.contains("NaiveBayes")) {
      return NLPFEATUREFVDATA;
    } else {
      return SPARSEFVDATA;
    }
  }

  /** Obtaining the Weka learners. */
  public static WekaLearner obtainWekaLearner(String learnerName,
    String learningOpts) {
    WekaLearner wekaL = null;
    if(learnerName.contains("KNN")) {
      if(learningOpts != null) {
        wekaL = new KNNIBK(learningOpts);
      } else wekaL = new KNNIBK();
    } else if(learnerName.contains("NaiveBayes")) {
      wekaL = new NaiveBayesC();
    } else if(learnerName.contains("C4.5")) {
      wekaL = new C45();
    }
    if(learningOpts != null) wekaL.getParametersFromOptionsLine(learningOpts);
    return wekaL;
  }

  /** Determing the output type of a Weka learner. */
  public static boolean obtainWekaLearnerOutputType(String learnerName) {
    /*
     * if(learnerName.contains("KNN")) { return true; } else
     * if(learnerName.contains("NaiveBayes")) { return true; } else
     * if(learnerName.contains("C45")) { return true; }
     */
    return true;
    // return false;
  }
}