GATE.ac.uk - releases/gate-5.1-beta2-build3402-ALL/plugins/LingPipe/src/gate/lingpipe/LanguageIdentifierPR.java

package gate.lingpipe;

import gate.ProcessingResource;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.util.GateRuntimeException;

import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;

import com.aliasi.classify.Classification;
import com.aliasi.classify.Classifier;
import com.aliasi.util.AbstractExternalizable;

/**
 * A Processing resource to identify language of the document based on
 * LingPipe language identifier classifier. Please download appropriate
 * models from the LingPipe website. see
 * http://alias-i.com/lingpipe/web/models.html
 * 
 * The default model supplied with GATE distribution is based on the
 * Leipzig corpora collection that consists of the data in the following
 * languages: Catalan (cat), Danish (dk), English (en), Estonian (ee),
 * Finnish (fi), French (fr), German (de), Italian (it), Japanese (jp),
 * Korean (kr), Norwegian (no), Sorbian (sorb), Swedish (se), and
 * Turkish (tr).
 * 
 * Should you want to train models on other languages or different
 * dataset, please refer to the URL: *
 * http://alias-i.com/lingpipe/demos/tutorial/langid/read-me.html.
 * 
 * @author niraj
 * 
 */
public class LanguageIdentifierPR extends AbstractLanguageAnalyser implements
                                                                  ProcessingResource {

  /** File which cotains model for NE */
  protected URL modelFileUrl;

  /** Model file extracted from the URL */
  protected File modelFile;

  /** classifier object */
  protected Classifier<CharSequence, Classification> classifier;

  /** document feature name */
  protected String languageIdFeatureName;

  /**
   * Initializes this resource
   * 
   * @return Resource
   * @throws ResourceInstantiationException
   */
  public Resource init() throws ResourceInstantiationException {
    if(modelFileUrl == null)
      throw new ResourceInstantiationException("No model file provided!");

    try {
      modelFile = new File(modelFileUrl.toURI());
    }
    catch(URISyntaxException e) {
      throw new ResourceInstantiationException(e);
    }

    if(modelFile == null || !modelFile.exists()) {
      throw new ResourceInstantiationException("modelFile:"
              + modelFileUrl.toString() + " does not exists");
    }

    try {
      classifier = (Classifier<CharSequence, Classification>)AbstractExternalizable
              .readObject(modelFile);
    }
    catch(IOException e) {
      throw new ResourceInstantiationException(e);
    }
    catch(ClassNotFoundException e) {
      throw new ResourceInstantiationException(e);
    }

    return this;
  }

  /**
   * Method is executed after the init() method has finished its
   * execution. <BR>
   * 
   * @throws ExecutionException
   */
  public void execute() throws ExecutionException {
    // lets start the progress and initialize the progress counter
    fireProgressChanged(0);

    // If no document provided to process throw an exception
    if(document == null) {
      fireProcessFinished();
      throw new GateRuntimeException("No document to process!");
    }

    // langugage ID feature Name
    if(languageIdFeatureName == null
            || languageIdFeatureName.trim().length() == 0)
      languageIdFeatureName = "lang";

    String docText = document.getContent().toString();
    Classification classification = classifier.classify(docText);

    document.getFeatures().put(languageIdFeatureName,
            classification.bestCategory());

    // process finished, acknowledge user about this.
    fireProcessFinished();
  }

  /**
   * gets the model to be used for identifying language of the document
   * 
   * @return
   */
  public URL getModelFileUrl() {
    return modelFileUrl;
  }

  /**
   * sets the model to be used for identifying language of the document
   * 
   * @param modelFileUrl
   */
  public void setModelFileUrl(URL modelFileUrl) {
    this.modelFileUrl = modelFileUrl;
  }

  /**
   * gets name of the feature which is used for storing the identified
   * language
   * 
   * @return
   */
  public String getLanguageIdFeatureName() {
    return languageIdFeatureName;
  }

  /**
   * sets name of the feature that should be used for storing the
   * identified language of the document
   * 
   * @param languageIdFeatureName
   */
  public void setLanguageIdFeatureName(String languageIdFeatureName) {
    this.languageIdFeatureName = languageIdFeatureName;
  }

}