// Log in | Help
// Print
// Home / releases / gate-5.1-beta2-build3402-ALL / plugins / Machine_Learning / src / gate / creole / ml / svmlight 〉 SVMLightDocument.java
 
/*
 *  Copyright (c) 2004, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Mike Dowman 08-04-2004
 *
 *  $Id: SVMLightDocument.java 7452 2006-06-15 14:45:17 +0000 (Thu, 15 Jun 2006) ian_roberts $
 *
 */

package gate.creole.ml.svmlight;

import gate.creole.ml.Attribute;
import java.lang.String;

/**
 * An array of these objects is what will be passed to the native methods from
 * GATE. It contains a single instance (a.k.a. document in SVM Light
 * terminology), and the instance is organised in a form that makes it easily
 * readable by the native methods.
 *
 * Native methods are no longer used, but the toString method of this class
 * will convert to SVM_light format so that the document can be saved to disk
 * in a form readable by svm_light.
 *
 * N.B. Weighting factors are now used, so that a new parameter
 * {@code <WEIGHTING>} can be specified in the configuration file for each
 * attribute. The attribute's value will be multiplied by this number before
 * being placed in the svm_light format document.
 */
class SVMLightDocument {

  /** Set to true to print progress messages while a document is built. */
  static final boolean DEBUG = false;

  /**
   * The value of the {@code <CLASS/>} attribute, also known as the target.
   */
  double classValue;

  /**
   * The number of each feature that has a non-zero value. Features are
   * numbered from 1 and stored in increasing order.
   */
  int[] featureNumbers;

  /**
   * The value of each feature. These values correspond to those in
   * featureNumbers, so the nth value in this array corresponds to the nth in
   * the other.
   */
  double[] featureValues;

  /**
   * Creates a SVMLightDocument from a list of attributes.
   *
   * @param datasetDefinition An object specifying the dataset, which is derived
   * from the configuration file.
   * @param nominalValue2IntegerHash Maps keys of the form
   * "attribute index:nominal value" to the Integer number of that value.
   * @param attributes The attribute values (Strings, possibly null) of this
   * instance, indexed in the same way as the attributes of datasetDefinition.
   * @throws gate.creole.ExecutionException declared by the conversion helpers.
   */
  SVMLightDocument(gate.creole.ml.DatasetDefintion datasetDefinition,
                   java.util.Map nominalValue2IntegerHash,
                   java.util.List attributes)
      throws gate.creole.ExecutionException {
    if (DEBUG) {
      System.out.println("Starting to make document");
    }

    // First set the class attribute.
    setClassAttribute(datasetDefinition, nominalValue2IntegerHash, attributes);

    // Now create the arrays containing all the other attributes.
    createFeatureValuePairs(datasetDefinition, nominalValue2IntegerHash,
                            attributes);

    if (DEBUG) {
      System.out.println("Document made");
    }
  }

  /**
   * Return a string representing the document in SVM Light format (i.e. as a
   * single line of an SVM Light data file).
   *
   * The line consists of the class value, followed by all the feature-value
   * pairs in the format "feature number:feature value", each followed by a
   * space, with a new line character on the end.
   */
  public String toString() {
    StringBuffer svmFormatString = new StringBuffer();
    svmFormatString.append(classValue).append(' ');

    for (int i = 0; i < featureNumbers.length; ++i) {
      svmFormatString.append(featureNumbers[i])
                     .append(':')
                     .append(featureValues[i])
                     .append(' ');
    }

    return svmFormatString.append('\n').toString();
  }

  /**
   * Convert the attributes, other than the class attribute, into arrays, one
   * storing feature numbers, and another feature values. Store these arrays
   * within the object's members.
   *
   * Only features with a non-zero value are stored. A nominal attribute
   * occupies one feature number per possible value; at most one of them (the
   * one matching the actual value) is stored. Boolean and numeric attributes
   * occupy a single feature number each.
   *
   * @param datasetDefinition An object specifying the dataset, which is derived
   * from the configuration file.
   * @param nominalValue2IntegerHash Maps "attribute index:nominal value" keys
   * to the Integer number of that value.
   * @param attributes All the attributes in list form, as received from the
   * gate Machine learning processing resource.
   */
  private void createFeatureValuePairs(
      gate.creole.ml.DatasetDefintion datasetDefinition,
      java.util.Map nominalValue2IntegerHash,
      java.util.List attributes) throws gate.creole.ExecutionException {
    final int attributeCount = attributes.size();
    final int classIndex = datasetDefinition.getClassIndex();

    // Every attribute contributes at most one feature-value pair, so arrays of
    // attributeCount elements are always large enough. They are trimmed to the
    // number of pairs actually stored once the loop has finished.
    int[] numbers = new int[attributeCount];
    double[] values = new double[attributeCount];
    int pairCount = 0;

    // SVM wants the attributes (a.k.a. features) numbering from 1, whereas in
    // datasetDefinition the attributes are numbered from 0.
    int svmLightFeatureNumber = 1;

    for (int attributesIndex = 0; attributesIndex < attributeCount;
         ++attributesIndex) {
      // Skip the class attribute.
      if (attributesIndex == classIndex) {
        continue;
      }

      Attribute currentAttribute =
          (Attribute) datasetDefinition.getAttributes().get(attributesIndex);
      String rawValue = (String) attributes.get(attributesIndex);

      if (currentAttribute.semanticType() == Attribute.NOMINAL) {
        // Nominal attributes are treated differently to boolean and numeric
        // ones, because each nominal attribute can map to many svm features:
        // there is one feature per possible value, which is one if the
        // attribute has that value, else zero.
        int valueCount = currentAttribute.getValues().size();

        // The number of the value this attribute has actually taken; it is
        // valueCount + 1 for unrecognised or missing values.
        int nominalNumber =
            nominalValue2Integer(datasetDefinition, nominalValue2IntegerHash,
                                 attributesIndex, rawValue);

        // So long as the value has been recognised, add it. Don't do anything
        // for unrecognised or missing values.
        if (nominalNumber <= valueCount) {
          // Features of this nominal start at svmLightFeatureNumber, so the
          // one for value number k is (k - 1) places further on.
          numbers[pairCount] = svmLightFeatureNumber + nominalNumber - 1;
          // The feature value will always be 1, to indicate that the nominal
          // has the value corresponding to this feature, except that this
          // value will be weighted if a weighting is specified in the
          // configuration file.
          values[pairCount] = 1.0 * currentAttribute.getWeighting();
          ++pairCount;
        }

        // Move on so that svmLightFeatureNumber corresponds to the first
        // feature after the end of those representing this nominal.
        svmLightFeatureNumber += valueCount;
      }
      else { // The following code is for boolean and numeric attributes.
        double attributeValue = string2AttributeValue(datasetDefinition,
            nominalValue2IntegerHash, rawValue, attributesIndex);
        // Only add attributes if their value is not zero.
        if (attributeValue != 0.0) {
          numbers[pairCount] = svmLightFeatureNumber;
          values[pairCount] = attributeValue;
          ++pairCount;
        }
        ++svmLightFeatureNumber;
      }
    }

    featureNumbers = new int[pairCount];
    System.arraycopy(numbers, 0, featureNumbers, 0, pairCount);
    featureValues = new double[pairCount];
    System.arraycopy(values, 0, featureValues, 0, pairCount);
  }

  /**
   * Extract the class attribute from the attributes list, and set the member
   * variable appropriately.
   *
   * @param datasetDefinition An object describing the dataset as specified in
   * the configuration file.
   * @param nominalValue2IntegerHash Maps "attribute index:nominal value" keys
   * to the Integer number of that value.
   * @param attributes The list of all the attribute values, in the order in
   * which they appear in the configuration file.
   */
  private void setClassAttribute(
      gate.creole.ml.DatasetDefintion datasetDefinition,
      java.util.Map nominalValue2IntegerHash,
      java.util.List attributes) throws gate.creole.ExecutionException {
    int classIndex = datasetDefinition.getClassIndex();
    String stringClassValue = (String) attributes.get(classIndex);

    // It's OK if there is no class value, so long as we are doing
    // classification not training.
    if (stringClassValue == null) {
      classValue = 0;
    }
    else {
      classValue = string2AttributeValue(datasetDefinition,
                                         nominalValue2IntegerHash,
                                         stringClassValue,
                                         classIndex);
    }
  }

  /**
   * Take an attribute value string, in the form that it's received from gate,
   * and convert it into the form required by svm light. Note that nominal
   * attributes are treated differently if they are class attributes, as opposed
   * to normal ones.
   *
   * Also each attribute value will always be multiplied by its weighting, as
   * specified in the configuration file.
   *
   * @param datasetDefinition An object describing the dataset as specified in
   * the configuration file.
   * @param nominalValue2IntegerHash Maps "attribute index:nominal value" keys
   * to the Integer number of that value.
   * @param stringValue The value to convert; may be null if it is missing.
   * @param attributeIndex The index of the attribute within datasetDefinition.
   * @return For boolean attributes, plus or minus the weighting ("true" is
   * positive), or zero if the value is missing. For a nominal class attribute,
   * plus the weighting for the first value, minus the weighting for the
   * second, and zero otherwise. For everything else, the parsed number times
   * the weighting, or zero if the value is missing or not a valid number.
   */
  private double string2AttributeValue(
      gate.creole.ml.DatasetDefintion datasetDefinition,
      java.util.Map nominalValue2IntegerHash,
      String stringValue, int attributeIndex)
      throws gate.creole.ExecutionException {
    Attribute attributeObject =
        (Attribute) datasetDefinition.getAttributes().get(attributeIndex);

    if (attributeObject.semanticType() == Attribute.BOOLEAN) {
      // A missing boolean is given a zero value, exactly as a missing numeric
      // value is, rather than failing with a NullPointerException.
      if (stringValue == null) {
        return 0.0;
      }
      if ("true".equals(stringValue)) {
        return 1.0 * attributeObject.getWeighting();
      }
      return -1.0 * attributeObject.getWeighting();
    }

    if (attributeObject.semanticType() == Attribute.NOMINAL
        && attributeObject.isClass()) {
      // N.B. If a nominal attribute is the class attribute, it is treated
      // differently to other nominal attributes. It must be set to -1, +1 or
      // 0. (0 indicates that transduction is to be used.)
      // The value is looked up under the index of the attribute being
      // converted, not unconditionally under the class index.
      int attributeValueNumber = nominalValue2Integer(datasetDefinition,
          nominalValue2IntegerHash, attributeIndex, stringValue);
      if (attributeValueNumber == 1) {
        return 1.0 * attributeObject.getWeighting(); // Indicate positive example.
      }
      if (attributeValueNumber == 2) {
        return -1.0 * attributeObject.getWeighting(); // Indicate negative example.
      }
      return 0.0; // Indicates an unclassified example - or possibly an example
                  // for which the class attribute is missing.
    }

    // We no longer use this code to map nominal attributes other than the
    // class attribute, so if such an attribute gets here it is an error in the
    // calling code. As before, it is simply handled like a numeric attribute.

    // In this case the attributes must be NUMERIC. If a numeric value is
    // missing, or is not a valid number, just give it a zero value.
    if (stringValue == null) {
      return 0.0;
    }
    try {
      return Double.parseDouble(stringValue) * attributeObject.getWeighting();
    } catch (NumberFormatException ex) {
      return 0.0;
    }
  }

  /**
   * Change a nominal feature value into an integer. Features are numbered from
   * one in the order in which they are declared in the configuration file.
   *
   * This code uses the nominalValue2IntegerHash map, that is created when the
   * wrapper is initialised, to quickly map from an attribute number and feature
   * value in the format passed from gate into an integer.
   *
   * @param datasetDefinition An object describing the dataset as specified in
   * the configuration file.
   * @param nominalValue2IntegerHash Maps keys of the form
   * "attribute index:nominal value" to the Integer number of that value.
   * @param indexOfAttribute The index of the nominal attribute within
   * datasetDefinition.
   * @param value The nominal value to look up; may be null if it is missing.
   * @return The number stored in the map for this attribute and value, or, if
   * there is none, one more than the number of values declared for the
   * attribute.
   */
  private int nominalValue2Integer(
      gate.creole.ml.DatasetDefintion datasetDefinition,
      java.util.Map nominalValue2IntegerHash,
      int indexOfAttribute, String value)
      throws gate.creole.ExecutionException {
    Object valueNumber =
        nominalValue2IntegerHash.get(indexOfAttribute + ":" + value);
    if (valueNumber != null) {
      return ((Integer) valueNumber).intValue();
    }

    // If we get here, then we will have an unrecognised value, or there will
    // be no value (possibly because we are looking before the beginning or
    // after the end of the document). We mark such cases with a distinct
    // value, with a value one greater than that of the last real value.
    return ((Attribute) datasetDefinition.getAttributes()
            .get(indexOfAttribute)).getValues().size() + 1;
  }

}