/*
* Copyright (c) 2004, The University of Sheffield.
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June 1991 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
*
* Mike Dowman 08-04-2004
*
* $Id: SVMLightDocument.java 7452 2006-06-15 14:45:17 +0000 (Thu, 15 Jun 2006) ian_roberts $
*
*/
package gate.creole.ml.svmlight;
import gate.creole.ml.Attribute;
import java.lang.String;
/**
* An array of these objects is what will be passed to the native methods from
* GATE. It contains a single instance (a.k.a. document in SVM Light
* terminology), and the instance is organised in a form that makes it easily
* readable by the native methods.
*
* Native methods are no longer used, but the toString method of this class
* will convert to SVM_light format so that the document can be saved to disk
* in a form readable by svm_light.
*
* N.B. Weighting factors are now used, so that a new parameter
* <WEIGHTING> can be specified in the configuration file for each
* attribute. The attribute's value will be multiplied by this number before
* being placed in the svm_light format document.
*/
class SVMLightDocument {

  /** When true, progress messages are printed to standard output. */
  static final boolean DEBUG=false;

  /**
   * The value of the &lt;CLASS/&gt; attribute, also known as the target.
   */
  double classValue;

  /**
   * The SVM Light feature number of each non-zero feature, in increasing
   * order.
   */
  int[] featureNumbers;

  /**
   * The value of each feature. These values correspond to those in
   * featureNumbers, so the nth value in this array corresponds to the nth in
   * the other.
   */
  double[] featureValues;

  /**
   * Creates a SVMLightDocument from a list of attributes.
   *
   * @param datasetDefinition An object specifying the dataset, which is
   * derived from the configuration file.
   * @param nominalValue2IntegerHash Maps keys of the form
   * "attributeIndex:value" to the Integer number of that nominal value.
   * @param attributes The attribute values (Strings, possibly null when a
   * value is missing), in the order used by the dataset definition.
   * @throws gate.creole.ExecutionException declared by the conversion
   * methods this constructor calls.
   */
  SVMLightDocument(gate.creole.ml.DatasetDefintion datasetDefinition,
                   java.util.Map nominalValue2IntegerHash,
                   java.util.List attributes)
      throws gate.creole.ExecutionException {
    if (DEBUG) {
      System.out.println("Starting to make document");
    }

    // First set the class attribute.
    setClassAttribute(datasetDefinition, nominalValue2IntegerHash, attributes);

    // Now create the arrays containing all the other attributes.
    createFeatureValuePairs(datasetDefinition, nominalValue2IntegerHash,
                            attributes);

    if (DEBUG) {
      System.out.println("Document made");
    }
  }

  /**
   * Return a string representing the document in SVM Light format (i.e. as a
   * single line of an SVM Light data file).
   *
   * The string consists of the class value, followed by all the feature-value
   * pairs in the format feature number:feature value, each followed by a
   * space, with a new line character on the end.
   */
  public String toString() {
    StringBuffer svmFormatString = new StringBuffer();
    svmFormatString.append(classValue).append(' ');
    for (int i = 0; i < featureNumbers.length; ++i) {
      svmFormatString.append(featureNumbers[i]).append(':')
                     .append(featureValues[i]).append(' ');
    }
    return svmFormatString.append('\n').toString();
  }

  /**
   * Convert the attributes, other than the class attribute, into arrays, one
   * storing feature numbers, and another feature values. Store these arrays
   * within the object's members. Only non-zero features are stored.
   *
   * @param datasetDefinition An object specifying the dataset, which is derived
   * from the configuration file.
   * @param nominalValue2IntegerHash Maps keys of the form
   * "attributeIndex:value" to the Integer number of that nominal value.
   * @param attributes All the attributes in list form, as received from the
   * gate Machine learning processing resource.
   */
  private void createFeatureValuePairs(
      gate.creole.ml.DatasetDefintion datasetDefinition,
      java.util.Map nominalValue2IntegerHash,
      java.util.List attributes) throws gate.creole.ExecutionException {
    int attributeCount = attributes.size();
    int classIndex = datasetDefinition.getClassIndex();
    java.util.List attributeDefinitions = datasetDefinition.getAttributes();

    // Every attribute contributes at most one non-zero feature, so the number
    // of attributes is an upper bound on the number of features. The arrays
    // are trimmed to the real feature count at the end.
    int[] numbers = new int[attributeCount];
    double[] values = new double[attributeCount];
    int featureCount = 0;

    // SVM wants the attributes (a.k.a. features) numbering from 1, whereas in
    // datasetDefinition the attributes are numbered from 0.
    int svmLightFeatureNumber = 1;

    for (int attributesIndex = 0; attributesIndex < attributeCount;
         ++attributesIndex) {
      // Skip the class attribute.
      if (attributesIndex == classIndex) {
        continue;
      }

      Attribute currentAttribute =
          (Attribute) attributeDefinitions.get(attributesIndex);
      String attributeString = (String) attributes.get(attributesIndex);

      // Nominal attributes are treated differently to boolean and numeric
      // ones, because each nominal attribute can map to many svm features.
      // This is because we have one feature per possible value, and this is
      // one if the feature is present, else zero.
      if (currentAttribute.semanticType() == Attribute.NOMINAL) {
        int possibleValueCount = currentAttribute.getValues().size();

        // Find out the index of the attribute, in terms of which of the
        // possible values it has taken.
        int nominalNumber =
            nominalValue2Integer(datasetDefinition, nominalValue2IntegerHash,
                                 attributesIndex, attributeString);

        // So long as the attribute has been recognised, add it. Don't do
        // anything for unrecognised or missing attributes.
        if (nominalNumber <= possibleValueCount) {
          // The SVM light features are numbered with one for each of the
          // possible values of the nominal. As only non-zero features are
          // stored, we skip straight to the one matching the actual value.
          numbers[featureCount] = svmLightFeatureNumber + nominalNumber - 1;
          // The feature value will always be 1, to indicate that the nominal
          // has the value corresponding to this feature, except that this
          // value will be weighted if a weighting is specified in the
          // configuration file.
          values[featureCount] = 1.0 * currentAttribute.getWeighting();
          ++featureCount;
        }

        // Now move on svmLightFeatureNumber so that it corresponds to the
        // first feature after the end of those representing this nominal.
        svmLightFeatureNumber += possibleValueCount;
      }
      else { // The following code is for boolean and numeric attributes.
        double attributeValue = string2AttributeValue(datasetDefinition,
            nominalValue2IntegerHash, attributeString, attributesIndex);
        // Only add attributes if their value is not zero.
        if (attributeValue != 0.0) {
          numbers[featureCount] = svmLightFeatureNumber;
          values[featureCount] = attributeValue;
          ++featureCount;
        }
        ++svmLightFeatureNumber;
      }
    }

    // Trim the working arrays down to the features actually recorded.
    featureNumbers = new int[featureCount];
    System.arraycopy(numbers, 0, featureNumbers, 0, featureCount);
    featureValues = new double[featureCount];
    System.arraycopy(values, 0, featureValues, 0, featureCount);
  }

  /**
   * Extract the class attribute from the attributes list, and set the member
   * variable appropriately.
   *
   * @param datasetDefinition An object describing the dataset as specified in
   * the configuration file.
   * @param nominalValue2IntegerHash Maps keys of the form
   * "attributeIndex:value" to the Integer number of that nominal value.
   * @param attributes The list of all the attribute values, in the order in
   * which they appear in the configuration file.
   */
  private void setClassAttribute(
      gate.creole.ml.DatasetDefintion datasetDefinition,
      java.util.Map nominalValue2IntegerHash,
      java.util.List attributes) throws gate.creole.ExecutionException {
    int classIndex = datasetDefinition.getClassIndex();
    String stringClassValue = (String) attributes.get(classIndex);

    // It's OK if there is no class value, so long as we are doing
    // classification not training.
    if (stringClassValue == null) {
      classValue = 0;
    }
    else {
      classValue = string2AttributeValue(datasetDefinition,
                                         nominalValue2IntegerHash,
                                         stringClassValue,
                                         classIndex);
    }
  }

  /**
   * Take an attribute value string, in the form that it's received from gate,
   * and convert it into the form required by svm light. Note that nominal
   * attributes are treated differently if they are class attributes, as opposed
   * to normal ones.
   *
   * Also each attribute value will always be multiplied by its weighting, as
   * specified in the configuration file.
   *
   * @param datasetDefinition An object describing the dataset as specified in
   * the configuration file.
   * @param nominalValue2IntegerHash Maps keys of the form
   * "attributeIndex:value" to the Integer number of that nominal value.
   * @param stringClassValue The attribute value to convert; may be null when
   * the value is missing.
   * @param attributeIndex The index of the attribute within the dataset
   * definition.
   * @return The weighted numeric value. Booleans give +weighting for "true"
   * and -weighting for anything else (including a missing value); a nominal
   * class attribute gives +weighting for its first value, -weighting for its
   * second and 0 otherwise; numeric values that are missing or unparsable
   * give 0.
   */
  private double string2AttributeValue(
      gate.creole.ml.DatasetDefintion datasetDefinition,
      java.util.Map nominalValue2IntegerHash,
      String stringClassValue, int attributeIndex)
      throws gate.creole.ExecutionException {
    Attribute attributeObject =
        (Attribute) datasetDefinition.getAttributes().get(attributeIndex);

    if (attributeObject.semanticType() == Attribute.BOOLEAN) {
      // Comparing from the literal side means a missing (null) value is
      // treated like any other non-"true" value rather than throwing.
      if ("true".equals(stringClassValue)) {
        return 1.0 * attributeObject.getWeighting();
      }
      return -1.0 * attributeObject.getWeighting();
    }

    if (attributeObject.semanticType() == Attribute.NOMINAL
        && attributeObject.isClass()) {
      // N.B. If a nominal attribute is the class attribute, it is treated
      // differently to other nominal attributes. It must be set to -1, +1 or
      // 0. (0 indicates that transduction is to be used.)
      int attributeValueNumber = nominalValue2Integer(datasetDefinition,
          nominalValue2IntegerHash, attributeIndex, stringClassValue);
      if (attributeValueNumber == 1) {
        return 1.0 * attributeObject.getWeighting(); // Positive example.
      }
      if (attributeValueNumber == 2) {
        return -1.0 * attributeObject.getWeighting(); // Negative example.
      }
      // Unclassified example - or possibly an example for which the class
      // attribute is missing.
      return 0.0;
    }

    // We no longer use this code to map nominal attributes other than the
    // class attribute, so reaching this point with one is an error in the
    // calling code; such a value is simply parsed as a number below.

    // In this case the attributes must be NUMERIC. If a numeric value is
    // missing, or is not a valid number, just give it a zero value.
    if (stringClassValue == null) {
      return 0;
    }
    try {
      return Double.parseDouble(stringClassValue)
          * attributeObject.getWeighting();
    } catch (NumberFormatException ex) {
      return 0;
    }
  }

  /**
   * Change a nominal feature value into an integer. Features are numbered from
   * one in the order in which they are declared in the configuration file.
   *
   * This code uses the nominalValue2IntegerHash map, that is created when the
   * wrapper is initialised, to quickly map from an attribute number and feature
   * value in the format passed from gate into an integer.
   *
   * @param datasetDefinition An object describing the dataset as specified in
   * the configuration file.
   * @param nominalValue2IntegerHash Maps keys of the form
   * "attributeIndex:value" to the Integer number of that nominal value.
   * @param indexOfAttribute The index of the attribute within the dataset
   * definition.
   * @param value The nominal value as received from gate; may be null.
   * @return The number mapped to the value, or, when the value is not in the
   * map, one more than the number of values declared for the attribute.
   */
  private int nominalValue2Integer(
      gate.creole.ml.DatasetDefintion datasetDefinition,
      java.util.Map nominalValue2IntegerHash,
      int indexOfAttribute, String value)
      throws gate.creole.ExecutionException {
    Object mappedNumber =
        nominalValue2IntegerHash.get(indexOfAttribute + ":" + value);
    if (mappedNumber != null) {
      return ((Integer) mappedNumber).intValue();
    }

    // If we get here, then we will have an unrecognised value, or there will
    // be no value (possibly because we are looking before the beginning or
    // after the end of the document). We mark such cases with a distinct
    // value, one greater than that of the last real value.
    return ((Attribute) datasetDefinition.getAttributes()
        .get(indexOfAttribute)).getValues().size() + 1;
  }
}