package gate.lingpipe;
import gate.Annotation;
import gate.AnnotationSet;
import gate.FeatureMap;
import gate.ProcessingResource;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.util.InvalidOffsetException;
import gate.util.OffsetComparator;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.aliasi.hmm.HiddenMarkovModel;
import com.aliasi.hmm.HmmDecoder;
import com.aliasi.hmm.TagWordLattice;
import com.aliasi.util.ScoredObject;
import com.aliasi.util.Streams;
/**
* POS tagger based on the LingPipe library.
* @author gate
*
*/
public class POSTaggerPR extends AbstractLanguageAnalyser implements
ProcessingResource {
/** File which cotains model for NE */
protected URL modelFileUrl;
/** Model file extracted from the URL */
protected File modelFile;
/** Model decoder object */
protected HmmDecoder decoder;
/** The name of the annotation set used for input */
protected String inputASName;
/**
* Number of best results to obtain from the model
*/
protected Integer nBest = 5;
/**
* The application mode
*/
POSApplicationMode applicationMode;
/**
* Initializes this resource
*
* @return Resource
* @throws ResourceInstantiationException
*/
public Resource init() throws ResourceInstantiationException {
if(modelFileUrl == null)
throw new ResourceInstantiationException("No model file provided!");
try {
modelFile = new File(modelFileUrl.toURI());
}
catch(URISyntaxException e) {
throw new ResourceInstantiationException(e);
}
if(modelFile == null || !modelFile.exists()) {
throw new ResourceInstantiationException("modelFile:"
+ modelFileUrl.toString() + " does not exists");
}
try {
FileInputStream fileIn = new FileInputStream(modelFile);
ObjectInputStream objIn = new ObjectInputStream(fileIn);
HiddenMarkovModel hmm = (HiddenMarkovModel)objIn.readObject();
Streams.closeInputStream(objIn);
decoder = new HmmDecoder(hmm);
}
catch(IOException ioe) {
throw new ResourceInstantiationException(ioe);
}
catch(ClassNotFoundException e) {
throw new ResourceInstantiationException(e);
}
return this;
}
/**
* Method is executed after the init() method has finished its
* execution. <BR>
*
* @throws ExecutionException
*/
public void execute() throws ExecutionException {
// lets start the progress and initialize the progress counter
fireProgressChanged(0);
// If no document provided to process throw an exception
if(document == null) {
fireProcessFinished();
throw new ExecutionException("No document to process!");
}
// get the annotationSet name provided by the user, or otherwise use
// the default method
AnnotationSet inputAs = (inputASName == null || inputASName.trim().length() == 0)
? document.getAnnotations()
: document.getAnnotations(inputASName);
if(inputAs.get("Token").isEmpty()) {
throw new ExecutionException("no Token annotations found");
}
List<Annotation> tokenList = new ArrayList<Annotation>(inputAs.get("Token"));
Collections.sort(tokenList, new OffsetComparator());
String[] tokens = new String[tokenList.size()];
for(int i = 0; i < tokenList.size(); i++) {
Annotation ann = tokenList.get(i);
try {
tokens[i] = document.getContent().getContent(
ann.getStartNode().getOffset(), ann.getEndNode().getOffset())
.toString();
}
catch(InvalidOffsetException e) {
throw new ExecutionException(e);
}
}
if(applicationMode == POSApplicationMode.FIRSTBEST) {
String[] tags = firstBest(tokens, decoder);
for(int m = 0; m < tags.length; m++) {
tokenList.get(m).getFeatures().put("category", tags[m]);
}
}
else if(applicationMode == POSApplicationMode.CONFIDENCE) {
List<Map<String, Double>> tags = confidence(tokens, decoder);
for(int m = 0; m < tags.size(); m++) {
tokenList.get(m).getFeatures().put("category", tags.get(m));
}
}
else {
// key is the overall score for the tagset
// value is the tagset for the entire document
Map<Double, String[]> tags = nBest(tokens, decoder);
for(Double score : tags.keySet()) {
String[] theTags = tags.get(score);
for(int m = 0; m < theTags.length; m++) {
FeatureMap f = tokenList.get(m).getFeatures();
Map<String, Set<Double>> scores = (Map<String, Set<Double>>)f.get("category");
if(scores == null) {
scores = new HashMap<String, Set<Double>>();
f.put("category", scores);
}
Set<Double> vals = scores.get(theTags[m]);
if(vals == null) {
vals = new HashSet<Double>();
f.put(theTags[m], vals);
}
vals.add(score);
}
}
}
// process finished, acknowledge user about this.
fireProcessFinished();
}
/**
* Obtains only the first best result.
* @param tokens
* @param decoder
* @return an array of pos tags.
*/
private String[] firstBest(String[] tokens, HmmDecoder decoder) {
return decoder.firstBest(tokens);
}
/**
* Obtains first five best outputs.
* @param tokens
* @param decoder
* @return
*/
private Map<Double, String[]> nBest(String[] tokens, HmmDecoder decoder) {
Map<Double, String[]> toReturn = new HashMap<Double, String[]>();
Iterator<ScoredObject<String[]>> nBestIt = decoder.nBest(tokens);
for(int n = 0; n < nBest.intValue() && nBestIt.hasNext(); ++n) {
ScoredObject<String[]> tagScores = (ScoredObject<String[]>)nBestIt.next();
double score = tagScores.score();
String[] tags = (String[])tagScores.getObject();
toReturn.put(new Double(score), tags);
}
return toReturn;
}
/**
* For every word, it obtains five pos tags and their confidence
* @param tokens
* @param decoder
* @return
*/
private List<Map<String, Double>> confidence(String[] tokens,
HmmDecoder decoder) {
List<Map<String, Double>> toReturn = new ArrayList<Map<String, Double>>();
TagWordLattice lattice = decoder.lattice(tokens);
for(int tokenIndex = 0; tokenIndex < tokens.length; ++tokenIndex) {
List<ScoredObject<String>> tagScores = lattice
.log2ConditionalTagList(tokenIndex);
Map<String, Double> map = new HashMap<String, Double>();
for(int i = 0; i < 5; ++i) {
double logProb = tagScores.get(i).score();
double conditionalProb = java.lang.Math.pow(2.0, logProb);
String tag = tagScores.get(i).getObject();
map.put(tag, new Double(conditionalProb));
}
toReturn.add(map);
}
return toReturn;
}
/**
* Gets the url of the model used for pos tagging
* @return
*/
public URL getModelFileUrl() {
return modelFileUrl;
}
/**
* Sets the url of the model used for pos tagging
* @param modelFileUrl
*/
public void setModelFileUrl(URL modelFileUrl) {
this.modelFileUrl = modelFileUrl;
}
/**
* gets the name of the input annotation set with tokens in it
* @return
*/
public String getInputASName() {
return inputASName;
}
/**
* Sets the name of the input annotation set with tokens in it
* @param inputAS
*/
public void setInputASName(String inputASName) {
this.inputASName = inputASName;
}
/**
* Number of best results to obtain
* @return
*/
public int getNBest() {
return nBest;
}
/**
* Number of best results to obtain
* @param best
*/
public void setNBest(int best) {
nBest = best;
}
/**
* Gets the application mode in which the POS tagger should be run
* @return
*/
public POSApplicationMode getApplicationMode() {
return applicationMode;
}
/**
* Sets the application mode in which the POS tagger should be run
* @param applicationMode
*/
public void setApplicationMode(POSApplicationMode applicationMode) {
this.applicationMode = applicationMode;
}
}