Log in Help
Print
Home 〉 releases 〉 gate-7.1-build4485-ALL 〉 plugins 〉 OpenNLP 〉 src 〉 gate 〉 opennlp 〉 OpenNlpTokenizer.java
 
/*
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  $Id: OpenNlpTokenizer.java 16275 2012-11-14 12:52:58Z adamfunk $
 */
package gate.opennlp;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.text.NumberFormat;
import java.util.*;
import org.apache.log4j.Logger;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;
import gate.*;
import gate.creole.*;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.RunTime;
import gate.util.InvalidOffsetException;

/**
 * Wrapper PR for the OpenNLP tokenizer.
 * <p>
 * {@link #init()} loads a {@link TokenizerModel} from the configured model
 * URL and builds a {@link TokenizerME} from it. {@link #execute()} runs that
 * tokenizer over the whole document text and adds one
 * {@code TOKEN_ANNOTATION_TYPE} annotation per span it returns, plus one
 * {@code SPACE_TOKEN_ANNOTATION_TYPE} annotation for every stretch of text
 * that is not covered by a span (before, between and after the tokens).
 */
@CreoleResource(name = "OpenNLP Tokenizer", 
    comment = "Tokenizer using an OpenNLP maxent model",
    helpURL = "http://gate.ac.uk/sale/tao/splitch21.html#sec:misc-creole:opennlp")
public class OpenNlpTokenizer extends AbstractLanguageAnalyser {

  private static final long serialVersionUID = 6965074842061250720L;
  private static final Logger logger = Logger.getLogger(OpenNlpTokenizer.class);

  /* CREOLE PARAMETERS & WRAPPED COMPONENTS */

  /** Name of the annotation set that receives the output annotations. */
  private String annotationSetName = null;

  /** Location the tokenizer model is read from in {@link #init()}. */
  private URL modelUrl;

  /** Tokenizer built from {@link #model}; assigned in {@link #init()}. */
  private TokenizerME tokenizer = null;

  /** Model loaded from {@link #modelUrl}; assigned in {@link #init()}. */
  private TokenizerModel model = null;


  /**
   * Tokenizes the current document and writes the resulting annotations to
   * the annotation set named by {@link #getAnnotationSetName()}.
   *
   * @throws ExecutionException if there is no document, if an annotation
   *           cannot be added because of invalid offsets, or (via
   *           {@link #checkInterruption()}) if the run was interrupted
   */
  public void execute() throws ExecutionException {
    interrupted = false;
    long startTime = System.currentTimeMillis();
    if(document == null) { throw new ExecutionException(
        "No document to process!"); }
    fireStatusChanged("Running " + this.getName() + " on " + document.getName());
    fireProgressChanged(0);

    AnnotationSet annotations = document.getAnnotations(annotationSetName);
    String text = document.getContent().toString();
    checkInterruption();
    Span[] spans = tokenizer.tokenizePos(text);

    /*
     * The spans ought to be ordered, but the OpenNLP API is unclear. We need to
     * be sure they are in order so we can spot the gaps and put Space Token
     * annotations on them.
     */
    Arrays.sort(spans);
    int previousEnd = 0;

    for(Span span : spans) {
      checkInterruption();
      int tokenStart = span.getStart();
      int tokenEnd = span.getEnd();

      // Text between the previous token and this one becomes a space token.
      if(tokenStart > previousEnd) {
        addTokenAnnotation(annotations, text, previousEnd, tokenStart,
            SPACE_TOKEN_ANNOTATION_TYPE);
      }

      addTokenAnnotation(annotations, text, tokenStart, tokenEnd,
          TOKEN_ANNOTATION_TYPE);
      previousEnd = tokenEnd;
    }

    // Text after the last token (or the whole text when no spans were
    // returned) is a gap too; without this it would get no space token.
    if(previousEnd < text.length()) {
      addTokenAnnotation(annotations, text, previousEnd, text.length(),
          SPACE_TOKEN_ANNOTATION_TYPE);
    }

    fireProcessFinished();
    fireStatusChanged("Finished "
        + this.getName()
        + " on "
        + document.getName()
        + " in "
        + NumberFormat.getInstance().format(
            (double)(System.currentTimeMillis() - startTime) / 1000)
        + " seconds!");
  }


  /**
   * Adds one annotation of the given type over {@code [start, end)}, with
   * "source", string and length features describing the covered text.
   *
   * @param annotations the set to add the annotation to
   * @param text the full document text the offsets refer to
   * @param start start offset (inclusive)
   * @param end end offset (exclusive)
   * @param type the annotation type to create
   * @throws ExecutionException wrapping the {@link InvalidOffsetException}
   *           if the annotation set rejects the offsets
   */
  private void addTokenAnnotation(AnnotationSet annotations, String text,
      int start, int end, String type) throws ExecutionException {
    FeatureMap fm = Factory.newFeatureMap();
    fm.put("source", "OpenNLP");
    fm.put(TOKEN_STRING_FEATURE_NAME, text.substring(start, end));
    fm.put(TOKEN_LENGTH_FEATURE_NAME, end - start);
    try {
      annotations.add((long)start, (long)end, type, fm);
    } catch(InvalidOffsetException e) {
      throw new ExecutionException(e);
    }
  }


  /**
   * Aborts processing if an interruption has been requested.
   *
   * @throws ExecutionInterruptedException if {@code isInterrupted()} is true
   */
  private void checkInterruption() throws ExecutionInterruptedException {
    if(isInterrupted()) { throw new ExecutionInterruptedException(
        "Execution of " + this.getName() + " has been abruptly interrupted!"); }
  }


  /**
   * Loads the tokenizer model from the model URL and creates the tokenizer.
   *
   * @return this resource
   * @throws ResourceInstantiationException if no model URL has been set or
   *           the model cannot be read
   */
  public Resource init() throws ResourceInstantiationException {
    if(modelUrl == null) {
      throw new ResourceInstantiationException(
          "No model URL provided for the OpenNLP Tokenizer!");
    }

    InputStream modelInput = null;
    try {
      modelInput = modelUrl.openStream();
      this.model = new TokenizerModel(modelInput);
      this.tokenizer = new TokenizerME(model);
      logger.info("OpenNLP Tokenizer: " + modelUrl.toString());
    } catch(IOException e) {
      throw new ResourceInstantiationException(e);
    } finally {
      if(modelInput != null) {
        try {
          modelInput.close();
        } catch(IOException e) {
          // Throwing from here would replace any exception already
          // propagating from the try block, so only report the failure.
          logger.warn("Could not close the model stream for " + modelUrl, e);
        }
      }
    }

    super.init();
    return this;
  }


  /**
   * Re-initialises this resource by running {@link #init()} again, which
   * reloads the model from the current model URL.
   *
   * @throws ResourceInstantiationException if {@link #init()} fails
   */
  public void reInit() throws ResourceInstantiationException {
    init();
  }


  /* CREOLE PARAMETERS */

  /**
   * Sets the name of the annotation set that receives the output.
   *
   * @param a the annotation set name
   */
  @RunTime
  @CreoleParameter(defaultValue = "",
      comment = "Output AS for Tokens")
  public void setAnnotationSetName(String a) {
    annotationSetName = a;
  }

  /**
   * @return the name of the annotation set that receives the output
   */
  public String getAnnotationSetName() {
    return annotationSetName;
  }


  /**
   * Sets the location of the tokenizer model; it is read by {@link #init()}.
   *
   * @param model URL of the tokenizer model
   */
  @CreoleParameter(defaultValue = "models/english/en-token.bin",
      comment = "location of the tokenizer model")
  public void setModel(URL model) {
    this.modelUrl = model;
  }

  /**
   * @return the URL of the tokenizer model
   */
  public URL getModel() {
    return modelUrl;
  }

}