GATE.ac.uk - releases/gate-8.4-build5748-ALL/plugins/OpenNLP/src/gate/opennlp/Sentence.java

/*
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  $Id: Sentence.java 16275 2012-11-14 12:52:58Z adamfunk $
 */
package gate.opennlp;

import java.util.*;
import opennlp.tools.util.Span;
import gate.*;


/**
 * A sequence of token annotations in document order, together with
 * parallel arrays holding each token's string and category (tag) features
 * in the plain {@code String[]} form. Also translates token-index spans
 * into character offsets taken from the tokens' start and end nodes.
 */
public class Sentence 
  implements Iterable<Annotation> {

  /** The tokens, as ordered by {@link Utils#inDocumentOrder}. */
  private final List<Annotation> tokens;
  /** Number of tokens; also the length of {@link #strings} and {@link #tags}. */
  private final int size;
  /** Value of the string feature of each token ("" where missing or null). */
  private final String[] strings;
  /** Value of the category feature of each token ("" where missing or null). */
  private final String[] tags;
  private final String stringFeature, categoryFeature;
  
  
  /**
   * Builds the sentence from a set of token annotations.
   * 
   * @param annotations the token annotations; they are put into document order
   * @param stringFeature name of the feature holding each token's string
   * @param categoryFeature name of the feature holding each token's tag;
   *        if null or empty, every tag is the empty string
   */
  public Sentence(AnnotationSet annotations, String stringFeature, String categoryFeature) {
    this.stringFeature = stringFeature;
    this.categoryFeature = categoryFeature;
    this.tokens = Utils.inDocumentOrder(annotations);
    this.size = this.tokens.size();
    this.strings = new String[size];
    this.tags = new String[size];
    
    boolean tagged = (categoryFeature != null) && (categoryFeature.length() > 0);
    
    for (int i=0 ; i < size ; i++) {
      FeatureMap fm = this.tokens.get(i).getFeatures();
      this.strings[i] = featureAsString(fm, this.stringFeature);
      // Tags are only looked up when a category feature name was supplied.
      this.tags[i] = tagged ? featureAsString(fm, this.categoryFeature) : "";
    }
  }
  
  
  /**
   * Returns the string form of a feature value, or the empty string when
   * the feature is absent or its value is null (so a null-valued feature
   * no longer causes a NullPointerException).
   */
  private static String featureAsString(FeatureMap fm, Object key) {
    Object value = fm.get(key);
    return (value == null) ? "" : value.toString();
  }
  
  
  /**
   * Start character offset of the first token in the given Span. Note that 
   * OpenNLP's chunker and NER return token offsets between tokens, so a 
   * chunk marked (1,3) covers the document content from the beginning 
   * of token 1 to the end of token 2.
   * Watch out for -1 (error code) output. 
   * @param span token-index span; its start is used as the token index
   * @return the GATE-style start offset in characters, or -1 if the
   *         span's start is not a valid token index in this sentence
   */
  public long getStartOffset(Span span) {
    int start = span.getStart();
    if ( (start >= 0) && (start < size) ) {
      return this.tokens.get(start).getStartNode().getOffset();
    }
    // error code
    return -1L;
  }


  /**
   * End character offset of the last token in the given Span. Note that
   * OpenNLP's chunker and NER return token offsets between tokens, so a 
   * chunk marked (1,3) covers the document content from the beginning 
   * of token 1 to the end of token 2. 
   * This method does the subtraction and range-checking for you,
   * but watch out for -1 (error code) output. 
   * @param span token-index span; its end minus one is used as the token index
   * @return the GATE-style end offset in characters, or -1 if the
   *         span's end minus one is not a valid token index in this sentence
   */
  public long getEndOffset(Span span) {
    // The span's end is exclusive, so the last covered token is end - 1.
    int end = span.getEnd() - 1;
    if ( (end >= 0) && (end < size) ) {
      return this.tokens.get(end).getEndNode().getOffset();
    }
    // error code
    return -1L;
  }


  /**
   * String array of the tokens' string features, in token order.
   * @return the internal array (not a copy), one entry per token; an
   *         entry is "" where the feature was missing or null
   */
  public String[] getStrings() {
    return this.strings;
  }

  /**
   * String array of the tokens' category features, in token order.
   * @return the internal array (not a copy), one entry per token; an
   *         entry is "" where the feature was missing or null, or where
   *         no category feature name was given to the constructor
   */
  public String[] getTags() {
    return this.tags;
  }
  
  /**
   * The token at the given position.
   * @param i zero-based token index
   * @return the i-th token annotation
   */
  public Annotation get(int i) {
    return this.tokens.get(i);
  }

  /**
   * @return the number of tokens in this sentence
   */
  public int size() {
    return this.size;
  }

  /**
   * Iterator over the Token annotations that went into
   * the constructor, but now in LR order. The iterator is backed by the
   * internal token list, which this class otherwise treats as fixed
   * (the size and feature arrays are computed once in the constructor).
   */
  @Override
  public Iterator<Annotation> iterator() {
    return this.tokens.listIterator();
  }
  
  
}