Log in Help
Print
Homereleasesgate-8.4-build5748-ALLpluginsStanford_CoreNLPsrcgatestanford 〉 Tokenizer.java
 
/*
 * Copyright (c) 1995-2016, The University of Sheffield. See the file
 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 * 
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 * 
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 * 
 * Leon Derczynski, 22 Oct 2013
 * 
 * $Id: Tokenizer.java 15468 2013-10-22 21:13:15Z $
 */

package gate.stanford;


import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import gate.AnnotationSet;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.InvalidOffsetException;
import gate.util.SimpleFeatureMapImpl;

import java.io.StringReader;
import java.text.NumberFormat;

import org.apache.log4j.Logger;

/**
 * This class is a wrapper for the Stanford Tokenizer v3.2.0.
 */
@CreoleResource(name = "Stanford PTB Tokenizer", comment = "Stanford Penn Treebank v3 Tokenizer, for English", icon = "tokeniser", helpURL="http://gate.ac.uk/userguide/sec:misc:creole:stanford")
public class Tokenizer extends AbstractLanguageAnalyser {

  private static final long serialVersionUID = -6001371186847970080L;

  public static final String TAG_DOCUMENT_PARAMETER_NAME = "document";

  public static final String TAG_INPUT_AS_PARAMETER_NAME = "inputASName";

  public static final String TAG_ENCODING_PARAMETER_NAME = "encoding";

  public static final String TAG_OUTPUT_AS_PARAMETER_NAME = "outputASName";

  public static final String TOKEN_LABEL = "tokenLabel";

  public static final String SPACE_LABEL = "spaceLabel";

  public static final String TOKEN_STRING_FEATURE = "string";

  @RunTime
  @Optional
  @CreoleParameter(comment = "Throw an exception when there are none of the required input annotations", defaultValue = "false")
  public void setFailOnMissingInputAnnotations(Boolean fail) {
    failOnMissingInputAnnotations = fail;
  }

  public Boolean getFailOnMissingInputAnnotations() {
    return failOnMissingInputAnnotations;
  }

  protected Boolean failOnMissingInputAnnotations = true;

  protected Logger logger = Logger.getLogger(this.getClass().getName());

  @Override
  public Resource init() throws ResourceInstantiationException {
    return this;
  }

  @Override
  public void reInit() throws ResourceInstantiationException {
    init();
  }

  @Override
  public void execute() throws ExecutionException {
    // check the parameters
    if(document == null)
      throw new ExecutionException("No document to process!");

    AnnotationSet inputAS = document.getAnnotations(inputASName);
    AnnotationSet outputAS = document.getAnnotations(outputASName);


    long startTime = System.currentTimeMillis();
    fireStatusChanged("Tokenising " + document.getName());
    fireProgressChanged(0); 


    // tokenising goes here
    String rawText = "";
    try {
      rawText = document.getContent().getContent(new Long(0), document.getContent().size()).toString();
    } catch (Exception e) {
      System.out.println("Document content offsets wrong: " + e);
    }

    PTBTokenizer<CoreLabel> ptbt;
    try {
      ptbt = new PTBTokenizer<CoreLabel>(new StringReader(rawText), new CoreLabelTokenFactory(), "invertible=true");
    } catch (Exception e) {
      System.out.println("Failed when calling tokenizer: " + e);
      return;
    }

    Long tokenStart;
    Long tokenEnd;
    Long prevTokenEnd = new Long(0); // this default value lets us capture leading spaces

    for (CoreLabel label; ptbt.hasNext(); ) {
      label = ptbt.next();
      tokenStart = new Long(label.beginPosition());
      tokenEnd = new Long(label.endPosition());


      SimpleFeatureMapImpl tokenMap = new SimpleFeatureMapImpl();

      // add the token annotation
      try {
        tokenMap.put(TOKEN_STRING_FEATURE, document.getContent().getContent(tokenStart, tokenEnd).toString());
        outputAS.add(tokenStart, tokenEnd, tokenLabel, tokenMap);
      } catch (InvalidOffsetException e) {
        System.out.println("Token alignment problem:" + e);
      }

      // do we need to add a space annotation?
      if (tokenStart > prevTokenEnd) {
        try {
          outputAS.add(prevTokenEnd, tokenStart, spaceLabel, new SimpleFeatureMapImpl());
        } catch (InvalidOffsetException e) {
          System.out.println("Space token alignment problem:" + e);
        }

      }

      prevTokenEnd = tokenEnd;

    }


    fireProcessFinished();
    fireStatusChanged(document.getName() +
      " tokenised in " +
      NumberFormat.getInstance().format(
        (double)(System.currentTimeMillis() - startTime) / 1000) +
      " seconds!");
  }

  public void setEncoding(String encoding) {
    this.encoding = encoding;
  }

  @Optional
  @RunTime
  @CreoleParameter(comment = "Input annotation set name", defaultValue = "")
  public void setInputASName(String newInputASName) {
    inputASName = newInputASName;
  }

  public String getInputASName() {
    return inputASName;
  }

  public String getEncoding() {
    return this.encoding;
  }

  public String getOutputASName() {
    return this.outputASName;
  }

  @Optional
  @RunTime
  @CreoleParameter(comment = "Output annotation set name", defaultValue = "")
  public void setOutputASName(String outputASName) {
    this.outputASName = outputASName;
  }


  public String getTokenLabel() {
    return this.tokenLabel;
  }

  @Optional
  @RunTime
  @CreoleParameter(comment = "Annotation type for tokens", defaultValue = "Token")
  public void setTokenLabel(String tokenLabel) {
    this.tokenLabel = tokenLabel;
  }

  public String getSpaceLabel() {
    return this.spaceLabel;
  }

  @Optional
  @RunTime
  @CreoleParameter(comment = "Annotation type for spaces", defaultValue = "SpaceToken")
  public void setSpaceLabel(String spaceLabel) {
    this.spaceLabel = spaceLabel;
  }

  private String inputASName;

  private String encoding;

  private String outputASName;

  private String tokenLabel;

  private String spaceLabel;

}