/*
* Copyright (c) 1998-2005, The University of Sheffield.
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June 1991 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
*
* Valentin Tablan 19/11/2002
*
* $Id: MachineLearningPR.java 7307 2006-03-09 13:33:20 +0000 (Thu, 09 Mar 2006) ian_roberts $
*
*/
package gate.creole.ml;
import java.util.*;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import gate.*;
import gate.creole.*;
import gate.gui.ActionsPublisher;
import gate.util.*;
/**
* This processing resource is used to train a machine learning algorithm with
* data extracted from a corpus.
*/
public class MachineLearningPR extends AbstractLanguageAnalyser
implements gate.gui.ActionsPublisher{
public MachineLearningPR(){
actionList = new ArrayList();
actionList.add(null);
}
/**
* This will make sure that any resources allocated by an ml wrapper get
* released. This is needed in the case of those wrappers that call
* native code, as in such cases there is a need to realese dynamically
* allocated memory.
*/
public void cleanup() {
// First call cleanup in the parent, in case any clean up needs to be done
// there.
super.cleanup();
// So long as an ML Engine (wrapper) is associated with the processing
// resource, call its cleanup method.
if (engine!=null) {
engine.cleanUp();
}
}
/** Initialise this resource, and return it. */
public Resource init() throws ResourceInstantiationException {
if(configFileURL == null){
throw new ResourceInstantiationException(
"No configuration file provided!");
}
org.jdom.Document jdomDoc;
SAXBuilder saxBuilder = new SAXBuilder(false);
try {
try{
jdomDoc = saxBuilder.build(configFileURL);
}catch(JDOMException jde){
throw new ResourceInstantiationException(jde);
}
} catch (java.io.IOException ex) {
throw new ResourceInstantiationException(ex);
}
//go through the jdom document to extract the data we need
Element rootElement = jdomDoc.getRootElement();
if(!rootElement.getName().equals("ML-CONFIG"))
throw new ResourceInstantiationException(
"Root element of dataset defintion file is \"" + rootElement.getName() +
"\" instead of \"ML-CONFIG\"!");
//create the dataset defintion
Element datasetElement = rootElement.getChild("DATASET");
if(datasetElement == null) throw new ResourceInstantiationException(
"No dataset definition provided in the configuration file!");
try{
datasetDefinition = new DatasetDefintion(datasetElement);
}catch(GateException ge){
throw new ResourceInstantiationException(ge);
}
//create the engine
Element engineElement = rootElement.getChild("ENGINE");
if(engineElement == null) throw new ResourceInstantiationException(
"No engine option provided in the configuration file!");
Element engineClassElement = engineElement.getChild("WRAPPER");
if(engineClassElement == null) throw new ResourceInstantiationException(
"No ML engine class provided!");
String engineClassName = engineClassElement.getTextTrim();
try{
// load MLEngine class from GATE Classloader
Class engineClass =
Class.forName(engineClassName, true, Gate.getClassLoader());
engine = (MLEngine)engineClass.newInstance();
}catch(ClassNotFoundException cnfe){
throw new ResourceInstantiationException(
"ML engine class:" + engineClassName + "not found!");
}catch(IllegalAccessException iae){
throw new ResourceInstantiationException(iae);
}catch(InstantiationException ie){
throw new ResourceInstantiationException(ie);
}
// See if batch classification mode had been set.
if (engineElement.getChild("BATCH-MODE-CLASSIFICATION") == null) {
batchModeClassification = false;
} else {
// checks wether the engine supports batch mode
// engines must implement AdvancedMLEngine (extending MLengine)
// to be asked about this functionality
if (engine instanceof AdvancedMLEngine){
batchModeClassification = ((AdvancedMLEngine)engine).supportsBatchMode();
}
else batchModeClassification = false;
}
engine.setDatasetDefinition(datasetDefinition);
engine.setOptions(engineElement.getChild("OPTIONS"));
engine.setOwnerPR(this);
try{
engine.init();
}catch(GateException ge){
throw new ResourceInstantiationException(ge);
}
return this;
} // init()
/**
* Run the resource.
*/
public void execute() throws ExecutionException {
interrupted = false;
//check the input
if (document == null) {
throw new ExecutionException(
"No document provided!"
);
}
if (inputASName == null ||
inputASName.equals(""))
annotationSet = document.getAnnotations();
else
annotationSet = document.getAnnotations(inputASName);
if (training.booleanValue()) {
fireStatusChanged(
"Collecting training data from " + document.getName() + "...");
}
else {
fireStatusChanged(
"Applying ML model to " + document.getName() + "...");
}
fireProgressChanged(0);
AnnotationSet anns = annotationSet.
get(datasetDefinition.getInstanceType());
annotations = (anns == null || anns.isEmpty()) ?
new ArrayList() : new ArrayList(anns);
Collections.sort(annotations, new OffsetComparator());
Iterator annotationIter = annotations.iterator();
int index = 0;
int size = annotations.size();
//create the cache structure
cache = new Cache();
if (!batchModeClassification || training.booleanValue()) {
// This code covers the case when instances are going to be passed to
// the wrapper one at a time, which is always the case with training,
// and the case with classification when we are not using batch mode.
while (annotationIter.hasNext()) {
Annotation instanceAnn = (Annotation) annotationIter.next();
List attributeValues = new ArrayList(datasetDefinition.
getAttributes().size());
//find the values for all attributes
Iterator attrIter = datasetDefinition.getAttributes().iterator();
while (attrIter.hasNext()) {
Attribute attr = (Attribute) attrIter.next();
if (attr.isClass && !training.booleanValue()) {
//we're not training so the class will be undefined
attributeValues.add(null);
}
else {
attributeValues.add(cache.getAttributeValue(index, attr));
}
}
if (training.booleanValue()) {
engine.addTrainingInstance(attributeValues);
}
else {
Object result = engine.classifyInstance(attributeValues);
if (result instanceof Collection) {
Iterator resIter = ( (Collection) result).iterator();
while (resIter.hasNext())
updateDocument(resIter.next(), index);
}
else {
updateDocument(result, index);
}
}
cache.shift();
//every 10 instances fire an event
if (index % 10 == 0) {
fireProgressChanged(index * 100 / size);
if (isInterrupted())
throw new ExecutionInterruptedException();
}
index++;
}
}
else {
// This code covers the case when all the instances in a document will be
// passed to the
// wrapper as a batch. This is necessary to achieve efficient performance
// with some wrappers.
// This list is needed to collect all the test instances.
List instancesToBeClassified = new ArrayList();
while (annotationIter.hasNext()) {
Annotation instanceAnn = (Annotation) annotationIter.next();
List attributeValues = new ArrayList(datasetDefinition.
getAttributes().size());
//find the values for all attributes
Iterator attrIter = datasetDefinition.getAttributes().iterator();
while (attrIter.hasNext()) {
Attribute attr = (Attribute) attrIter.next();
if (attr.isClass) {
//we're not training so the class will be undefined
attributeValues.add(null);
}
else {
attributeValues.add(cache.getAttributeValue(index, attr));
}
}
// Instead of classifying the instance, just add it to the list of
// instances that need classifying.
instancesToBeClassified.add(attributeValues);
cache.shift();
index++;
}
// Now all the data is collected in instances to be classified, we can
// actually get the wrapper to classify them.
List classificationResults = engine.batchClassifyInstances(
instancesToBeClassified);
// Now go through the document and add all the annotations appropriately,
// given the output of the wrapper.
// Start with the first instance again.
index = 0;
Iterator resultsIterator = classificationResults.iterator();
while (resultsIterator.hasNext()) {
Object result = resultsIterator.next();
if (result instanceof Collection) {
Iterator resIter = ( (Collection) result).iterator();
while (resIter.hasNext())
updateDocument(resIter.next(), index);
}
else {
updateDocument(result, index);
}
// Move index on so that it points at the next instance.
index++;
}
}
annotations = null;
} // execute()
protected void updateDocument(Object classificationResult, int instanceIndex){
//interpret the result according to the attribute semantics
Attribute classAttr = datasetDefinition.getClassAttribute();
String type = classAttr.getType();
String feature = classAttr.getFeature();
List classValues = classAttr.getValues();
FeatureMap features = Factory.newFeatureMap();
boolean shouldCreateAnnotation = true;
if(classValues != null && !classValues.isEmpty()){
//nominal attribute -> AnnotationType.feature
//the result is the value for the feature
String featureValue = (String)classificationResult;
features.put(feature, featureValue);
}else{
if(feature == null){
//boolean attribute
shouldCreateAnnotation = classificationResult.equals("true");
}else{
//numeric attribute
String featureValue = classificationResult.toString();
features.put(feature, featureValue);
}
}
if(shouldCreateAnnotation){
//generate the new annotation
int coveredInstanceIndex = instanceIndex + classAttr.getPosition();
if(coveredInstanceIndex >= 0 &&
coveredInstanceIndex < annotations.size()){
Annotation coveredInstance = (Annotation)annotations.
get(coveredInstanceIndex);
annotationSet.add(coveredInstance.getStartNode(),
coveredInstance.getEndNode(),
type, features);
}
}
}
/**
* Gets the list of actions that can be performed on this resource.
* @return a List of Action objects (or null values)
*/
public List getActions(){
List result = new ArrayList();
result.addAll(actionList);
if(engine instanceof ActionsPublisher){
result.addAll(((ActionsPublisher)engine).getActions());
}
return result;
}
protected class Cache{
public Cache(){
//find the sizes for the two caches
int forwardCacheSize = 0;
int backwardCacheSize = 0;
Iterator attrIter = datasetDefinition.getAttributes().iterator();
while(attrIter.hasNext()){
Attribute anAttribute = (Attribute)attrIter.next();
if(anAttribute.getPosition() > 0){
//forward looking
if(anAttribute.getPosition() > forwardCacheSize){
forwardCacheSize = anAttribute.getPosition();
}
}else if(anAttribute.getPosition() < 0){
//backward looking
if(-anAttribute.getPosition() > backwardCacheSize){
backwardCacheSize = -anAttribute.getPosition();
}
}
}
//create the caches filled with null values
forwardCache = new ArrayList(forwardCacheSize);
for(int i =0; i < forwardCacheSize; i++) forwardCache.add(null);
backwardCache = new ArrayList(backwardCacheSize);
for(int i =0; i < backwardCacheSize; i++) backwardCache.add(null);
}
/**
* Finds the value of a specified attribute for a particular instance.
* @param instanceIndex the index of the current instance in the annotations
* List.
* @param attribute the attribute whose value needs to be found
* @return a String representing the value for the attribute.
*/
public String getAttributeValue(int instanceIndex, Attribute attribute){
//sanity check
int actualPosition = instanceIndex + attribute.getPosition();
if(actualPosition < 0 || actualPosition >= annotations.size()) return null;
//check caches first
if(attribute.getPosition() == 0){
//current instance
if(currentAttributes == null) currentAttributes = new HashMap();
return getValue(attribute, instanceIndex, currentAttributes);
}else if(attribute.getPosition() > 0){
//check forward cache
Map attributesMap = (Map)forwardCache.get(attribute.getPosition() - 1);
if(attributesMap == null){
attributesMap = new HashMap();
forwardCache.set(attribute.getPosition() - 1, attributesMap);
}
return getValue(attribute, actualPosition, attributesMap);
}else if(attribute.getPosition() < 0){
//check bacward cache
Map attributesMap = (Map)backwardCache.get(-attribute.getPosition() - 1);
if(attributesMap == null){
attributesMap = new HashMap();
backwardCache.set(-attribute.getPosition() - 1, attributesMap);
}
return getValue(attribute, actualPosition, attributesMap);
}
//we should never get here
throw new LuckyException(
"Attribute position is neither 0, nor negative nor positive!");
}
/**
* Notifies the cache that it should advance its internal structures one
* step forward.
*/
public void shift(){
if(backwardCache.isEmpty()){
//no backward caching, all attributes have position "0" or more
//nothing to do
}else{
backwardCache.remove(backwardCache.size() - 1);
backwardCache.add(0, currentAttributes);
}
if(forwardCache.isEmpty()){
//no forward caching, all attributes have position "0" or less
if(currentAttributes != null) currentAttributes.clear();
}else{
currentAttributes = (Map) forwardCache.remove(0);
forwardCache.add(null);
}
}
/**
* Finds the value for a particular attribute and returns it.
* If the value is not present in the cache it will be retrieved from the
* document and the cache will be updated.
* @param attribute the attribute whose value is requested.
* @param cache the Map containing the cache for the appropriate position
* for the attribute
* @param instanceIndex the index of the instance annotation which is
* covered by the sought attribute
* @return a String value.
*/
protected String getValue(Attribute attribute,
int instanceIndex,
Map cache){
String value = null;
String annType = attribute.getType();
String featureName = attribute.getFeature();
Map typeData = (Map)cache.get(annType);
if(typeData != null){
if(featureName == null){
//we're only interested in the presence of the annotation
value = (String)typeData.get(null);
}else{
value = (String)typeData.get(featureName);
}
}else{
//type data was null -> nothing known about this type of annotations
//get the insformation; update the cache and return the right value
Annotation instanceAnnot = (Annotation)annotations.get(instanceIndex);
typeData = new HashMap();
cache.put(annType, typeData);
// The annotation retrieved by its index is in a default type
// (default : Token). We need to search for overlapping types
// only if the Type needed is not the one we already have
// (which seems quite reasonable given that most Attributes are
// likely to be based on Token informations)
if (instanceAnnot.getType().equals(annType)){
typeData.putAll(instanceAnnot.getFeatures());
typeData.put(null, "true");
String stringvalue = (String)typeData.get(featureName);
if(featureName == null) return "true";
return stringvalue;
}
// here we search for annotations of another type
// first restrict to the needed type
// then limit to those covering the current token
AnnotationSet typeSubset = annotationSet.get(annType);
AnnotationSet coverSubset = null;
if (typeSubset!=null) coverSubset = typeSubset.get(
annType,
instanceAnnot.getStartNode().getOffset(),
instanceAnnot.getEndNode().getOffset());
if(coverSubset == null || coverSubset.isEmpty()){
//no such annotations at given location
typeData.put(null, "false");
if(featureName == null) value = "false";
else value = null;
}else{
typeData.putAll(((Annotation)coverSubset.iterator().next()).
getFeatures());
typeData.put(null, "true");
if(featureName == null) value = "true";
else value = (String)typeData.get(featureName);
}
}
return value;
}
/**
* Stores cached data with attribute values for instances placed
* <b>after</b> the current instance.
* For each instance (i.e. for each position in the list) the data is a Map
* with annotationTypes as keys. For each annotation type the data stored is
* another Map with feature names as keys and feature values as values.
* The <tt>null</tt> key is used for a boolean value (stored as one of the
* "true" or "false" strings) signifying the presence
* (or lack of presence) of the required type of annotation at the location.
* forwardCache[2].get("Lookup").get(null) == "false" means that no lookup
* annotation covers the second instance to the right from the current
* instance.
*/
protected List forwardCache;
/**
* Stores cached data with attribute values for instances placed
* <b>before</b> the current instance.
* For each instance (i.e. for each position in the list) the data is a Map
* with annotationTypes as keys. For each annotation type the data stored is
* another Map with feature names as keys and feature values as values.
* The <tt>null</tt> key is used for a boolean value (stored as one of the
* "true" or "false" strings) signifying the presence
* (or lack of presence) of the required type of annotation at the location.
* backwardCache[2].get("Lookup").get(null) == "false" means that no lookup
* annotation covers the second instance to the left from the current
* instance.
*/
protected List backwardCache;
/**
* A Map
* with annotationTypes as keys. For each annotation type the data stored is
* another Map with feature names as keys and feature values as values.
* The <tt>null</tt> key is used for a boolean value (stored as one of the
* "true" or "false" strings) signifying the presence
* (or lack of presence) of the required type of annotation at the location.
* currentAttributes.get(Lookup).get(null) == "false" means that the current
* instance is not covered by a Lookup annotation.
* currentAttributes.get(Lookup) == null menas nothing is known about Lookup
* annotations caovering the current instance.
*/
protected Map currentAttributes;
}
public void setInputASName(String inputASName) {
this.inputASName = inputASName;
}
public String getInputASName() {
return inputASName;
}
public java.net.URL getConfigFileURL() {
return configFileURL;
}
public void setConfigFileURL(java.net.URL configFileURL) {
this.configFileURL = configFileURL;
}
public void setTraining(Boolean training) {
this.training = training;
}
public Boolean getTraining() {
return training;
}
public MLEngine getEngine() {
return engine;
}
public void setEngine(MLEngine engine) {
this.engine = engine;
}
private java.net.URL configFileURL;
protected DatasetDefintion datasetDefinition;
protected MLEngine engine;
protected String inputASName;
protected AnnotationSet annotationSet;
protected List annotations;
protected List actionList;
protected Cache cache;
private Boolean training;
/**
* This member will be set to true if instances are to be passed to the
* wrapper in batches, rather than one instance at a time and if the engine
* supports this functionality.
*/
protected boolean batchModeClassification;
}