GATE.ac.uk - releases/gate-5.1-beta2-build3402-ALL/plugins/LingPipe/src/gate/lingpipe/POSTaggerPR.java

package gate.lingpipe;

import gate.Annotation;
import gate.AnnotationSet;
import gate.FeatureMap;
import gate.ProcessingResource;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.util.InvalidOffsetException;
import gate.util.OffsetComparator;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.aliasi.hmm.HiddenMarkovModel;
import com.aliasi.hmm.HmmDecoder;
import com.aliasi.hmm.TagWordLattice;
import com.aliasi.util.ScoredObject;
import com.aliasi.util.Streams;

/**
 * POS tagger based on the LingPipe library.
 * @author gate
 *
 */
public class POSTaggerPR extends AbstractLanguageAnalyser implements
                                                       ProcessingResource {

  /** File which cotains model for NE */
  protected URL modelFileUrl;

  /** Model file extracted from the URL */
  protected File modelFile;

  /** Model decoder object */
  protected HmmDecoder decoder;

  /** The name of the annotation set used for input */
  protected String inputASName;

  /**
   * Number of best results to obtain from the model
   */
  protected Integer nBest = 5;

  /**
   * The application mode
   */
  POSApplicationMode applicationMode;

  /**
   * Initializes this resource
   * 
   * @return Resource
   * @throws ResourceInstantiationException
   */
  public Resource init() throws ResourceInstantiationException {
    if(modelFileUrl == null)
      throw new ResourceInstantiationException("No model file provided!");

    try {
      modelFile = new File(modelFileUrl.toURI());
    }
    catch(URISyntaxException e) {
      throw new ResourceInstantiationException(e);
    }

    if(modelFile == null || !modelFile.exists()) {
      throw new ResourceInstantiationException("modelFile:"
              + modelFileUrl.toString() + " does not exists");
    }

    try {
      FileInputStream fileIn = new FileInputStream(modelFile);
      ObjectInputStream objIn = new ObjectInputStream(fileIn);
      HiddenMarkovModel hmm = (HiddenMarkovModel)objIn.readObject();
      Streams.closeInputStream(objIn);
      decoder = new HmmDecoder(hmm);
    }
    catch(IOException ioe) {
      throw new ResourceInstantiationException(ioe);
    }
    catch(ClassNotFoundException e) {
      throw new ResourceInstantiationException(e);
    }
    return this;
  }

  /**
   * Method is executed after the init() method has finished its
   * execution. <BR>
   * 
   * @throws ExecutionException
   */
  public void execute() throws ExecutionException {
    // lets start the progress and initialize the progress counter
    fireProgressChanged(0);

    // If no document provided to process throw an exception
    if(document == null) {
      fireProcessFinished();
      throw new ExecutionException("No document to process!");
    }

    // get the annotationSet name provided by the user, or otherwise use
    // the default method
    AnnotationSet inputAs = (inputASName == null || inputASName.trim().length() == 0)
            ? document.getAnnotations()
            : document.getAnnotations(inputASName);

    if(inputAs.get("Token").isEmpty()) {
      throw new ExecutionException("no Token annotations found");
    }

    List<Annotation> tokenList = new ArrayList<Annotation>(inputAs.get("Token"));
    Collections.sort(tokenList, new OffsetComparator());
    String[] tokens = new String[tokenList.size()];
    for(int i = 0; i < tokenList.size(); i++) {
      Annotation ann = tokenList.get(i);
      try {
        tokens[i] = document.getContent().getContent(
                ann.getStartNode().getOffset(), ann.getEndNode().getOffset())
                .toString();
      }
      catch(InvalidOffsetException e) {
        throw new ExecutionException(e);
      }
    }

    if(applicationMode == POSApplicationMode.FIRSTBEST) {
      String[] tags = firstBest(tokens, decoder);
      for(int m = 0; m < tags.length; m++) {
        tokenList.get(m).getFeatures().put("category", tags[m]);
      }
    }
    else if(applicationMode == POSApplicationMode.CONFIDENCE) {
      List<Map<String, Double>> tags = confidence(tokens, decoder);
      for(int m = 0; m < tags.size(); m++) {
        tokenList.get(m).getFeatures().put("category", tags.get(m));
      }
    }
    else {
      // key is the overall score for the tagset
      // value is the tagset for the entire document
      Map<Double, String[]> tags = nBest(tokens, decoder);
      for(Double score : tags.keySet()) {
        String[] theTags = tags.get(score);
        for(int m = 0; m < theTags.length; m++) {
          FeatureMap f = tokenList.get(m).getFeatures();
          Map<String, Set<Double>> scores = (Map<String, Set<Double>>)f.get("category");
          if(scores == null) {
            scores = new HashMap<String, Set<Double>>();
            f.put("category", scores);
          }
          
          Set<Double> vals = scores.get(theTags[m]);
          if(vals == null) {
            vals = new HashSet<Double>();
            f.put(theTags[m], vals);
          }
          vals.add(score);
        }
      }
    }

    // process finished, acknowledge user about this.
    fireProcessFinished();
  }

  /**
   * Obtains only the first best result.
   * @param tokens
   * @param decoder
   * @return an array of pos tags.
   */
  private String[] firstBest(String[] tokens, HmmDecoder decoder) {
    return decoder.firstBest(tokens);
  }

  /**
   * Obtains first five best outputs.
   * @param tokens
   * @param decoder
   * @return
   */
  private Map<Double, String[]> nBest(String[] tokens, HmmDecoder decoder) {
    Map<Double, String[]> toReturn = new HashMap<Double, String[]>();
    Iterator<ScoredObject<String[]>> nBestIt = decoder.nBest(tokens);
    for(int n = 0; n < nBest.intValue() && nBestIt.hasNext(); ++n) {
      ScoredObject<String[]> tagScores = (ScoredObject<String[]>)nBestIt.next();
      double score = tagScores.score();
      String[] tags = (String[])tagScores.getObject();
      toReturn.put(new Double(score), tags);
    }
    return toReturn;
  }

  /**
   * For every word, it obtains five pos tags and their confidence
   * @param tokens
   * @param decoder
   * @return
   */
  private List<Map<String, Double>> confidence(String[] tokens,
          HmmDecoder decoder) {
    List<Map<String, Double>> toReturn = new ArrayList<Map<String, Double>>();
    TagWordLattice lattice = decoder.lattice(tokens);
    for(int tokenIndex = 0; tokenIndex < tokens.length; ++tokenIndex) {
      List<ScoredObject<String>> tagScores = lattice
              .log2ConditionalTagList(tokenIndex);

      Map<String, Double> map = new HashMap<String, Double>();
      for(int i = 0; i < 5; ++i) {
        double logProb = tagScores.get(i).score();
        double conditionalProb = java.lang.Math.pow(2.0, logProb);
        String tag = tagScores.get(i).getObject();
        map.put(tag, new Double(conditionalProb));
      }
      toReturn.add(map);
    }
    return toReturn;
  }

  /**
   * Gets the url of the model used for pos tagging
   * @return
   */
  public URL getModelFileUrl() {
    return modelFileUrl;
  }

  /**
   * Sets the url of the model used for pos tagging
   * @param modelFileUrl
   */
  public void setModelFileUrl(URL modelFileUrl) {
    this.modelFileUrl = modelFileUrl;
  }

  /**
   * gets the name of the input annotation set with tokens in it
   * @return
   */
  public String getInputASName() {
    return inputASName;
  }

  /**
   * Sets the name of the input annotation set with tokens in it
   * @param inputAS
   */
  public void setInputASName(String inputASName) {
    this.inputASName = inputASName;
  }

  /**
   * Number of best results to obtain
   * @return
   */
  public int getNBest() {
    return nBest;
  }

  /**
   * Number of best results to obtain
   * @param best
   */
  public void setNBest(int best) {
    nBest = best;
  }

  /**
   * Gets the application mode in which the POS tagger should be run
   * @return
   */
  public POSApplicationMode getApplicationMode() {
    return applicationMode;
  }

  /**
   * Sets the application mode in which the POS tagger should be run
   * @param applicationMode
   */
  public void setApplicationMode(POSApplicationMode applicationMode) {
    this.applicationMode = applicationMode;
  }
}