Log in Help
Print
Homereleasesgate-5.1-beta2-build3402-ALLpluginsLingPipesrcgatelingpipe 〉 NamedEntityRecognizerPR.java
 
package gate.lingpipe;

import gate.AnnotationSet;
import gate.FeatureMap;
import gate.ProcessingResource;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.util.GateRuntimeException;
import gate.util.InvalidOffsetException;

import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;

import com.aliasi.chunk.Chunk;
import com.aliasi.chunk.Chunker;
import com.aliasi.chunk.Chunking;
import com.aliasi.util.AbstractExternalizable;

/**
 * This PR is used for recognizing named entities such as location,
 * organizaiton etc. It uses the LingPipe models to achieve that.
 * 
 * @author niraj
 * 
 */
public class NamedEntityRecognizerPR extends AbstractLanguageAnalyser implements
                                                                     ProcessingResource {

  /** File which cotains model for NE */
  protected URL modelFileUrl;

  /** Model file extracted from the URL */
  protected File modelFile;

  /** The name of the annotation set used for input */
  protected String outputASName;

  /** Chunker object */
  protected Chunker chunker;

  /**
   * Initializes this resource
   * 
   * @return Resource
   * @throws ResourceInstantiationException
   */
  public Resource init() throws ResourceInstantiationException {
    if(modelFileUrl == null)
      throw new ResourceInstantiationException("No model file provided!");

    try {
      modelFile = new File(modelFileUrl.toURI());
    }
    catch(URISyntaxException e) {
      throw new ResourceInstantiationException(e);
    }

    if(modelFile == null || !modelFile.exists()) {
      throw new ResourceInstantiationException("modelFile:"
              + modelFileUrl.toString() + " does not exists");
    }

    try {
      chunker = (Chunker)AbstractExternalizable.readObject(modelFile);
    }
    catch(IOException e) {
      throw new ResourceInstantiationException(e);
    }
    catch(ClassNotFoundException e) {
      throw new ResourceInstantiationException(e);
    }

    return this;
  }

  /**
   * Method is executed after the init() method has finished its
   * execution. <BR>
   * 
   * @throws ExecutionException
   */
  public void execute() throws ExecutionException {
    // lets start the progress and initialize the progress counter
    fireProgressChanged(0);

    // If no document provided to process throw an exception
    if(document == null) {
      fireProcessFinished();
      throw new GateRuntimeException("No document to process!");
    }

    // get the annotationSet name provided by the user, or otherwise use
    // the
    // default method
    AnnotationSet outputAs = (outputASName == null || outputASName.trim()
            .length() == 0) ? document.getAnnotations() : document
            .getAnnotations(outputASName);

    try {
      String docText = document.getContent().toString();
      Chunking chunking = chunker.chunk(docText);
      for(Chunk c : chunking.chunkSet()) {
        FeatureMap fm = gate.Factory.newFeatureMap();
        outputAs.add(new Long(c.start()), new Long(c.end()), c.type(), fm);
      }

    }
    catch(InvalidOffsetException e) {
      throw new ExecutionException(e);
    }

    // process finished, acknowledge user about this.
    fireProcessFinished();
  }

  /**
   * Returns the name of the AnnotationSet that has been provided to
   * create the AnnotationSet
   */
  public String getOutputASName() {
    return outputASName;
  }

  /**
   * Sets the AnnonationSet name, that is used to create the
   * AnnotationSet
   * 
   * @param annotationSetName
   */
  public void setOutputASName(String outputAS) {
    this.outputASName = outputAS;
  }

  /**
   * gets the url of the model used for recognizing named entiries in
   * the document.
   * 
   * @return
   */
  public URL getModelFileUrl() {
    return modelFileUrl;
  }

  /**
   * sets the url of the model used for recognizing named entiries in
   * the document.
   * 
   * @param modelFileUrl
   */
  public void setModelFileUrl(URL modelFileUrl) {
    this.modelFileUrl = modelFileUrl;
  }

}