GATE.ac.uk - releases/gate-8.4-build5748-ALL/plugins/OpenNLP/src/gate/opennlp/OpenNlpSentenceSplit.java

/*
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  $Id: OpenNlpSentenceSplit.java 17967 2014-05-11 16:35:51Z ian_roberts $
 */
package gate.opennlp;

import gate.*;
import gate.creole.*;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.RunTime;
import gate.creole.metadata.Sharable;
import gate.util.InvalidOffsetException;
import java.io.*;
import java.net.URL;
import java.text.NumberFormat;
import java.util.Arrays;
import java.util.regex.*;
import opennlp.tools.sentdetect.*;
import opennlp.tools.util.Span;
import org.apache.log4j.Logger;


/**
 * Wrapper PR for the OpenNLP sentence splitter.
 */
@CreoleResource(name = "OpenNLP Sentence Splitter", 
    comment = "Sentence splitter using an OpenNLP maxent model",
    helpURL = "http://gate.ac.uk/userguide/sec:misc-creole:opennlp")
public class OpenNlpSentenceSplit extends AbstractLanguageAnalyser {

  
  private static final long serialVersionUID = 3833973991517701119L;
  private static final Logger logger = Logger.getLogger(OpenNlpSentenceSplit.class);

  
  /* CREOLE PARAMETERS & WRAPPED COMPONENTS */
  private String annotationSetName = null;
  private URL modelUrl;
  private SentenceDetectorME splitter = null;
  private SentenceModel model = null;
  
  private Pattern punctEnding;
  

  @Override
  public void execute() throws ExecutionException {
    interrupted = false;
    long startTime = System.currentTimeMillis();
    if(document == null) { throw new ExecutionException(
        "No document to process!"); }
    fireStatusChanged("Running " + this.getName() + " on " + document.getName());
    fireProgressChanged(0);

    AnnotationSet annotations = document.getAnnotations(annotationSetName);
    String text = document.getContent().toString();
    checkInterruption();
    Span[] spans = splitter.sentPosDetect(text);
    Arrays.sort(spans);

    for(int i = 0; i < spans.length; i++) {
      Span span = spans[i];
      checkInterruption();
      FeatureMap fm = Factory.newFeatureMap();
      fm.put("source", "OpenNLP");
      FeatureMap sfm = Factory.newFeatureMap();
      sfm.put("source", "OpenNLP");
      long start = (long)span.getStart();
      long end = (long)span.getEnd();
      long splitStart, splitEnd;
      String splitKind;

      /*
       * If the Sentence ends with 1+ punctuation marks, cover them with an
       * internal Split. Otherwise, put an external Split between this Sentence
       * and the next one (or the document end, if this is the last Sentence ---
       * this can produce a 0-width Split, sorry).
       */
      String sentenceContent = Utils.stringFor(document, start, end);
      Matcher matcher = punctEnding.matcher(sentenceContent);
      if(matcher.find()) {
        splitStart = start + (long)matcher.start();
        splitEnd = end;
        splitKind = "internal";
      } else {
        splitStart = end;
        splitEnd = nextStart(spans, i);
        splitKind = "external";
      }

      try {
        annotations.add(start, end, SENTENCE_ANNOTATION_TYPE, fm);
        sfm.put("kind", splitKind);
        annotations.add(splitStart, splitEnd, "Split", sfm);
      } catch(InvalidOffsetException e) {
        throw new ExecutionException(e);
      }
    }

    fireProcessFinished();
    fireStatusChanged("Finished "
        + this.getName()
        + " on "
        + document.getName()
        + " in "
        + NumberFormat.getInstance().format(
            (double)(System.currentTimeMillis() - startTime) / 1000)
        + " seconds!");
  }

  private long nextStart(Span[] spans, int currentSpan) {
    if(currentSpan < (spans.length - 1)) { return spans[currentSpan + 1]
        .getStart(); }
    // implied else: we're working on the last sentence
    return this.document.getContent().size();
  }

  @Override
  public Resource init() throws ResourceInstantiationException {
    /* TODO: This isn't perfect; it matches all punctuation at the
     * end of a Sentence annotation, e.g., ").".     */
    punctEnding = Pattern.compile("\\p{Punct}+$");

    if(model == null) {
      InputStream modelInput = null;
      try {
        modelInput = modelUrl.openStream();
        this.model = new SentenceModel(modelInput);
        logger.info("OpenNLP Splitter: " + modelUrl.toString());
      } catch(IOException e) {
        throw new ResourceInstantiationException(e);
      } finally {
        if(modelInput != null) {
          try {
            modelInput.close();
          } catch(IOException e) {
            throw new ResourceInstantiationException(e);
          }
        }
      }
    }
    this.splitter = new SentenceDetectorME(model);

    super.init();
    return this;
  }

  
  @Override
  public void reInit() throws ResourceInstantiationException {
    model = null;
    init();
  }

  
  private void checkInterruption() throws ExecutionInterruptedException {
    if(isInterrupted()) { throw new ExecutionInterruptedException(
        "Execution of " + this.getName() + " has been abruptly interrupted!"); }
  }
	
  
	/* CREOLE PARAMETERS */

  @RunTime
  @CreoleParameter(defaultValue = "",
      comment = "annotation set for Sentences")
  public void setAnnotationSetName(String a) {
    annotationSetName = a;
  }

  public String getAnnotationSetName() {
    return annotationSetName;
  }
  
  @CreoleParameter(defaultValue = "models/english/en-sent.bin",
      comment = "location of the splitter model")
  public void setModel(URL model) {
    this.modelUrl = model;
  }

  public URL getModel() {
    return modelUrl;
  }

  /**
   * For internal use by the duplication mechanism.
   */
  @Sharable
  public void setSentenceModel(SentenceModel model) {
    this.model = model;
  }

  /**
   * For internal use by the duplication mechanism.
   */
  public SentenceModel getSentenceModel() {
    return model;
  }

}