package gate.lingpipe;
import gate.AnnotationSet;
import gate.FeatureMap;
import gate.ProcessingResource;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.util.GateRuntimeException;
import gate.util.InvalidOffsetException;
import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;
import com.aliasi.chunk.Chunk;
import com.aliasi.chunk.Chunker;
import com.aliasi.chunk.Chunking;
import com.aliasi.util.AbstractExternalizable;
/**
* This PR is used for recognizing named entities such as location,
* organization etc. It uses the LingPipe models to achieve that.
*
* @author niraj
*
*/
public class NamedEntityRecognizerPR extends AbstractLanguageAnalyser implements
        ProcessingResource {

  /** URL of the file which contains the model for NE recognition. */
  protected URL modelFileUrl;

  /** Model file resolved from {@link #modelFileUrl} by {@link #init()}. */
  protected File modelFile;

  /**
   * The name of the annotation set used for output. A null or blank
   * value selects the document's default annotation set.
   */
  protected String outputASName;

  /** Chunker object, read from the model file by {@link #init()}. */
  protected Chunker chunker;

  /**
   * Initializes this resource: resolves the model URL to a local file
   * and deserializes the chunker stored in it.
   *
   * @return this resource
   * @throws ResourceInstantiationException if no model URL is set, the
   *           URL cannot be converted to a local file, the file does
   *           not exist, it cannot be read, or it does not contain a
   *           {@link Chunker}
   */
  public Resource init() throws ResourceInstantiationException {
    if(modelFileUrl == null)
      throw new ResourceInstantiationException("No model file provided!");

    try {
      modelFile = new File(modelFileUrl.toURI());
    }
    catch(URISyntaxException e) {
      throw new ResourceInstantiationException(e);
    }
    catch(IllegalArgumentException e) {
      // File(URI) rejects URIs that are not hierarchical "file:" URIs
      // (e.g. http: or jar: URLs); report that as an instantiation
      // failure rather than letting the unchecked exception escape.
      throw new ResourceInstantiationException(e);
    }

    if(!modelFile.exists()) {
      throw new ResourceInstantiationException("modelFile:"
              + modelFileUrl.toString() + " does not exists");
    }

    Object model;
    try {
      model = AbstractExternalizable.readObject(modelFile);
    }
    catch(IOException e) {
      throw new ResourceInstantiationException(e);
    }
    catch(ClassNotFoundException e) {
      throw new ResourceInstantiationException(e);
    }

    // the file may hold any serialized object; verify before casting so
    // a wrong model surfaces as an instantiation error, not a
    // ClassCastException
    if(!(model instanceof Chunker)) {
      throw new ResourceInstantiationException("modelFile:"
              + modelFileUrl.toString() + " does not contain a Chunker");
    }
    chunker = (Chunker)model;
    return this;
  }

  /**
   * Runs the chunker over the text of the current document and adds one
   * annotation per chunk (typed with the chunk's type, with an empty
   * feature map) to the output annotation set. <BR>
   * Listeners are notified that processing has finished whether or not
   * the run succeeds.
   *
   * @throws ExecutionException if a chunk's offsets are rejected by the
   *           annotation set
   * @throws GateRuntimeException if no document has been provided
   */
  public void execute() throws ExecutionException {
    // lets start the progress and initialize the progress counter
    fireProgressChanged(0);
    try {
      // If no document provided to process throw an exception
      if(document == null) {
        throw new GateRuntimeException("No document to process!");
      }

      // use the annotation set named by the user, or the default set
      // when no (or only a blank) name was given
      AnnotationSet outputAs = (outputASName == null || outputASName.trim()
              .length() == 0) ? document.getAnnotations() : document
              .getAnnotations(outputASName);

      String docText = document.getContent().toString();
      Chunking chunking = chunker.chunk(docText);
      for(Chunk c : chunking.chunkSet()) {
        FeatureMap fm = gate.Factory.newFeatureMap();
        outputAs.add(Long.valueOf(c.start()), Long.valueOf(c.end()),
                c.type(), fm);
      }
    }
    catch(InvalidOffsetException e) {
      throw new ExecutionException(e);
    }
    finally {
      // process finished (successfully or not), acknowledge user about
      // this.
      fireProcessFinished();
    }
  }

  /**
   * Returns the name of the annotation set that new annotations are
   * added to.
   *
   * @return the output annotation set name, possibly null
   */
  public String getOutputASName() {
    return outputASName;
  }

  /**
   * Sets the name of the annotation set that new annotations are added
   * to.
   *
   * @param outputAS the output annotation set name; null or blank means
   *          the document's default annotation set
   */
  public void setOutputASName(String outputAS) {
    this.outputASName = outputAS;
  }

  /**
   * Gets the url of the model used for recognizing named entities in
   * the document.
   *
   * @return the model file URL, or null if none has been set
   */
  public URL getModelFileUrl() {
    return modelFileUrl;
  }

  /**
   * Sets the url of the model used for recognizing named entities in
   * the document.
   *
   * @param modelFileUrl URL of the model file; read by {@link #init()}
   */
  public void setModelFileUrl(URL modelFileUrl) {
    this.modelFileUrl = modelFileUrl;
  }
}