// Source: releases/gate-8.4-build5748-ALL/plugins/OpenNLP/src/gate/opennlp/OpenNlpTokenizer.java
 
/*
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  $Id: OpenNlpTokenizer.java 17967 2014-05-11 16:35:51Z ian_roberts $
 */
package gate.opennlp;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.text.NumberFormat;
import java.util.*;
import org.apache.log4j.Logger;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;
import gate.*;
import gate.creole.*;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.RunTime;
import gate.creole.metadata.Sharable;
import gate.util.InvalidOffsetException;

/**
 * Wrapper PR for the OpenNLP tokenizer.
 *
 * <p>On execution the document text is passed to an OpenNLP
 * {@link TokenizerME}; every span it returns becomes a Token annotation and
 * every stretch of text not covered by a span (before the first token,
 * between tokens, and after the last token) becomes a SpaceToken annotation.
 * All annotations carry a "source" feature with the value "OpenNLP" plus the
 * covered string and its length.
 */
@CreoleResource(name = "OpenNLP Tokenizer", 
    comment = "Tokenizer using an OpenNLP maxent model",
    helpURL = "http://gate.ac.uk/userguide/sec:misc-creole:opennlp")
public class OpenNlpTokenizer extends AbstractLanguageAnalyser {

  private static final long serialVersionUID = 6965074842061250720L;
  private static final Logger logger = Logger.getLogger(OpenNlpTokenizer.class);

  /* CREOLE PARAMETERS & WRAPPED COMPONENTS */

  /** Name of the annotation set that receives the Token/SpaceToken output. */
  private String annotationSetName = null;

  /** Location the tokenizer model is loaded from when none has been injected. */
  private URL modelUrl;

  /** The wrapped OpenNLP tokenizer; (re)created by {@link #init()}. */
  private TokenizerME tokenizer = null;

  /** The loaded model; may be injected via {@link #setTokenizerModel}. */
  private TokenizerModel model = null;  


  /**
   * Tokenizes the current document and writes Token and SpaceToken
   * annotations to the configured annotation set.
   *
   * @throws ExecutionException if there is no document, if the tokenizer has
   *         not been initialised, or if an annotation cannot be added at the
   *         computed offsets
   * @throws ExecutionInterruptedException if the PR is interrupted while
   *         running
   */
  public void execute() throws ExecutionException {
    interrupted = false;
    long startTime = System.currentTimeMillis();
    if(document == null) { throw new ExecutionException(
        "No document to process!"); }
    if(tokenizer == null) { throw new ExecutionException(
        "Tokenizer has not been initialised!"); }
    fireStatusChanged("Running " + this.getName() + " on " + document.getName());
    fireProgressChanged(0);

    AnnotationSet annotations = document.getAnnotations(annotationSetName);
    String text = document.getContent().toString();
    checkInterruption();
    Span[] spans = tokenizer.tokenizePos(text);

    /*
     * The spans ought to be ordered, but the OpenNLP API is unclear. We need to
     * be sure they are in order so we can spot the gaps and put Space Token
     * annotations on them.
     */
    Arrays.sort(spans);
    int previousEnd = 0;

    for(Span span : spans) {
      checkInterruption();
      int tokenStart = span.getStart();
      int tokenEnd = span.getEnd();

      // Text between the end of the previous token and this one is a gap.
      if(tokenStart > previousEnd) {
        addAnnotation(annotations, text, previousEnd, tokenStart,
            SPACE_TOKEN_ANNOTATION_TYPE);
      }

      previousEnd = tokenEnd;
      addAnnotation(annotations, text, tokenStart, tokenEnd,
          TOKEN_ANNOTATION_TYPE);
    }

    // The loop above only sees gaps that precede a token, so anything left
    // after the last token (or the whole text when there are no tokens at
    // all) has to be annotated here.
    if(text.length() > previousEnd) {
      addAnnotation(annotations, text, previousEnd, text.length(),
          SPACE_TOKEN_ANNOTATION_TYPE);
    }

    fireProcessFinished();
    fireStatusChanged("Finished "
        + this.getName()
        + " on "
        + document.getName()
        + " in "
        + NumberFormat.getInstance().format(
            (double)(System.currentTimeMillis() - startTime) / 1000)
        + " seconds!");
  }


  /**
   * Adds one annotation of the given type over {@code [start, end)} with the
   * standard feature set: source, covered string and length.
   *
   * @param annotations the set to add the annotation to
   * @param text the full document text the offsets refer to
   * @param start start offset (inclusive)
   * @param end end offset (exclusive)
   * @param type the annotation type to create
   * @throws ExecutionException wrapping the InvalidOffsetException if the
   *         annotation set rejects the offsets
   */
  private void addAnnotation(AnnotationSet annotations, String text,
      int start, int end, String type) throws ExecutionException {
    FeatureMap fm = Factory.newFeatureMap();
    fm.put("source", "OpenNLP");
    fm.put(TOKEN_STRING_FEATURE_NAME, text.substring(start, end));
    fm.put(TOKEN_LENGTH_FEATURE_NAME, end - start);
    try {
      annotations.add((long)start, (long)end, type, fm);
    } catch(InvalidOffsetException e) {
      throw new ExecutionException(e);
    }
  }


  /**
   * Aborts the current run if an interrupt has been requested.
   *
   * @throws ExecutionInterruptedException if {@code isInterrupted()} is true
   */
  private void checkInterruption() throws ExecutionInterruptedException {
    if(isInterrupted()) { throw new ExecutionInterruptedException(
        "Execution of " + this.getName() + " has been abruptly interrupted!"); }
  }


  /**
   * Loads the tokenizer model from the model URL, unless a model has already
   * been supplied, and creates a fresh tokenizer from it.
   *
   * @return this resource
   * @throws ResourceInstantiationException if no model is available and no
   *         model URL has been set, or if the model cannot be read
   */
  public Resource init() throws ResourceInstantiationException {
    if(model == null) {
      if(modelUrl == null) {
        throw new ResourceInstantiationException(
            "No model URL specified for the OpenNLP tokenizer!");
      }
      InputStream modelInput = null;
      try {
        modelInput = modelUrl.openStream();
        this.model = new TokenizerModel(modelInput);
        logger.info("OpenNLP Tokenizer: " + modelUrl.toString());
      } catch(IOException e) {
        throw new ResourceInstantiationException(e);
      } finally {
        if(modelInput != null) {
          try {
            modelInput.close();
          } catch(IOException e) {
            // Throwing from here would replace any exception raised while
            // loading, and would fail init even when the model was read
            // successfully, so a close failure is only logged.
            logger.warn("Could not close the tokenizer model stream for "
                + modelUrl, e);
          }
        }
      }
    }
    this.tokenizer = new TokenizerME(model);

    super.init();
    return this;
  }
  

  /**
   * Discards the current model and re-runs {@link #init()}, so the model is
   * loaded again from the model URL.
   *
   * @throws ResourceInstantiationException if re-initialisation fails
   */
  public void reInit() throws ResourceInstantiationException {
    model = null;
    init();
  }

  
  /* CREOLE PARAMETERS */

  /**
   * Sets the name of the annotation set that receives the output.
   *
   * @param a the annotation set name
   */
  @RunTime
  @CreoleParameter(defaultValue = "",
      comment = "Output AS for Tokens")
  public void setAnnotationSetName(String a) {
    annotationSetName = a;
  }

  /**
   * @return the name of the annotation set that receives the output
   */
  public String getAnnotationSetName() {
    return annotationSetName;
  }
  
  
  /**
   * Sets the location of the tokenizer model. The URL is read by
   * {@link #init()} when no model is already present.
   *
   * @param model URL of the model file
   */
  @CreoleParameter(defaultValue = "models/english/en-token.bin",
      comment = "location of the tokenizer model")
  public void setModel(URL model) {
    this.modelUrl = model;
  }
  
  /**
   * @return the URL of the tokenizer model
   */
  public URL getModel() {
    return modelUrl;
  }

  /**
   * For internal use by the duplication mechanism.
   */
  @Sharable
  public void setTokenizerModel(TokenizerModel model) {
    this.model = model;
  }
  
  /**
   * For internal use by the duplication mechanism.
   */
  public TokenizerModel getTokenizerModel() {
    return model;
  }
}