GATE.ac.uk - releases/gate-5.1-beta2-build3402-ALL/plugins/LingPipe/src/gate/lingpipe/TokenizerPR.java

package gate.lingpipe;

import gate.ProcessingResource;
import gate.Resource;
import gate.creole.*;
import gate.util.*;
import gate.*;
import java.util.*;

import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.Tokenizer;

/**
 * LingPipe Tokenizer PR.
 * 
 * @author Ekaterina Stambolieva
 * 
 */
public class TokenizerPR extends AbstractLanguageAnalyser implements
                                                         ProcessingResource {
  /**
   * Name of the output annotation set
   */
  private String outputASName;

  /**
   * tokeniser factory used for tokenizing text
   */
  private com.aliasi.tokenizer.TokenizerFactory tf;

  /** Initialise this resource, and return it. */
  public Resource init() throws ResourceInstantiationException {

    // construct tokenizer
    tf = IndoEuropeanTokenizerFactory.INSTANCE;
    return this;
  }

  public void reInit() throws ResourceInstantiationException {
    init();
  }

  /**
   * execute method. Makes LingPipe API calls to tokenize the document.
   * It uses the document's string and passes it over to the LingPipe to
   * tokenize. It also generates space tokens as well.
   */
  public void execute() throws ExecutionException {

    if(document == null) {
      throw new ExecutionException("There is no loaded document");
    }

    super.fireProgressChanged(0);

    long startOffset = 0, endOffset = 0;
    AnnotationSet as = null;
    if(outputASName == null || outputASName.trim().length() == 0)
      as = document.getAnnotations();
    else as = document.getAnnotations(outputASName);

    String docContent = document.getContent().toString();

    List<String> tokenList = new ArrayList<String>();
    List<String> whiteList = new ArrayList<String>();
    Tokenizer tokenizer = tf.tokenizer(docContent.toCharArray(), 0, docContent
            .length());
    tokenizer.tokenize(tokenList, whiteList);

    for(int i = 0; i < whiteList.size(); i++) {
      try {

        startOffset = endOffset;
        endOffset = startOffset + whiteList.get(i).length();
        if((endOffset - startOffset) != 0) {
          FeatureMap fmSpaces = Factory.newFeatureMap();
          fmSpaces.put("length", "" + (endOffset - startOffset));
          as.add(new Long(startOffset), new Long(endOffset), "SpaceToken",
                  fmSpaces);
        }

        if(i < tokenList.size()) {
          startOffset = endOffset;
          endOffset = startOffset + tokenList.get(i).length();
          FeatureMap fmTokens = Factory.newFeatureMap();
          fmTokens.put("length", "" + (endOffset - startOffset));
          as.add(new Long(startOffset), new Long(endOffset), "Token", fmTokens);
        }
      }
      catch(InvalidOffsetException e) {
        throw new ExecutionException(e);
      }
    }
  }

  /**
   * gets name of the output annotation set
   * @return
   */
  public String getOutputASName() {
    return outputASName;
  }

  /**
   * sets name of the output annotaiton set
   * @param outputAS
   */
  public void setOutputASName(String outputAS) {
    this.outputASName = outputAS;
  }
}