GATE.ac.uk - releases/gate-8.4-build5748-ALL/plugins/GENIA/src/gate/creole/genia/splitter/GENIASentenceSplitter.java

/*
 * GENIASentenceSplitter
 * 
 * Copyright (c) 2011, The University of Sheffield.
 * 
 * This file is part of GATE (see http://gate.ac.uk/), and is free software,
 * licenced under the GNU Library General Public License, Version 3, June 2007
 * (in the distribution as file licence.html, and also available at
 * http://gate.ac.uk/gate/licence.html).
 * 
 * Mark A. Greenwood, 01/06/2011
 */
package gate.creole.genia.splitter;

import gate.AnnotationSet;
import gate.Factory;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.Files;
import gate.util.ProcessManager;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.net.URL;

import org.apache.commons.io.IOUtils;

@CreoleResource(name = "GENIA Sentence Splitter", icon = "sentence-splitter.png", helpURL = "http://gate.ac.uk/userguide/sec:domain-creole:biomed:genia")
public class GENIASentenceSplitter extends AbstractLanguageAnalyser {

  private static final long serialVersionUID = 142837355544158905L;

  private boolean debug = false;

  private String annotationSetName;

  private URL splitterBinary;

  private ProcessManager manager = new ProcessManager();

  public Boolean getDebug() {
    return debug;
  }

  @RunTime
  @CreoleParameter(defaultValue = "false", comment = "if true then debugging info is reported")
  public void setDebug(Boolean debug) {
    this.debug = debug;
  }

  public URL getSplitterBinary() {
    return splitterBinary;
  }

  @RunTime
  @CreoleParameter(comment = "the URL of the GENIA sentence splitter binary")
  public void setSplitterBinary(URL splitterBinary) {
    this.splitterBinary = splitterBinary;
  }

  public String getAnnotationSetName() {
    return annotationSetName;
  }

  @RunTime
  @Optional
  @CreoleParameter(comment = "the annotation set in which Sentence annotations will be created")
  public void setAnnotationSetName(String annotationSetName) {
    this.annotationSetName = annotationSetName;
  }

  public void execute() throws ExecutionException {

    // get the sentence splitter file from the URL provided
    File splitter = Files.fileFromURL(splitterBinary);

    // get the document content and replace non-breaking spaces with spaces
    // TODO replace new-lines with spaces so we don't get a sentence per line
    String docContent =
            document.getContent().toString().replace((char)160, ' ');

    //The reader that brings back the output from the sentence splitter
    BufferedReader in = null;
    
    try {
      // create temporary files to use with the external sentence splitter
      File tmpIn = File.createTempFile("GENIA", ".txt");
      File tmpOut = File.createTempFile("GENIA", ".txt");

      // store the document content in the input file
      FileOutputStream fos = new FileOutputStream(tmpIn);
      fos.write(docContent.getBytes("utf8"));
      fos.close();

      // setup the command line to run the sentence splitter
      String[] args =
              new String[]{splitter.getAbsolutePath(), tmpIn.getAbsolutePath(),
                  tmpOut.getAbsolutePath()};

      // run the sentence splitter over the docuement
      manager.runProcess(args, splitter.getParentFile(), (debug
              ? System.out
              : null), (debug ? System.err : null));

      // get the annotation set we are going to store results in
      AnnotationSet annotationSet = document.getAnnotations(annotationSetName);

      // we haven't found any sentence yet so start looking for the next one
      // from the beginning of the document
      int end = 0;

      // read in the output from the sentence splitter one line at a time
      in = new BufferedReader(new FileReader(tmpOut));
      String sentence = in.readLine();
      while(sentence != null) {

        // trim the sentence so we don't annotate extranious white space,
        // this isn't python code after all :)
        sentence = sentence.trim();

        //find the start of the sentence
        //TODO throw a sensible exception if the sentence can't be found?
        int start = docContent.indexOf(sentence, end);

        //work out where the sentence ends
        end = start + sentence.length();

        if(end > start) {
          //the sentence has a length so annotate it 
          annotationSet.add((long)start, (long)end, "Sentence",
                  Factory.newFeatureMap());
        }

        //get the next line from the output from the tagger
        sentence = in.readLine();
      }

      //delete the temp files
      if(!debug && !tmpIn.delete()) tmpIn.deleteOnExit();
      if(!debug && !tmpOut.delete()) tmpOut.deleteOnExit();

    } catch(Exception ioe) {
      throw new ExecutionException("An error occured running the splitter", ioe);
    }
    finally {
      IOUtils.closeQuietly(in);
    }
  }
}