// GATE.ac.uk - releases/gate-5.1-beta2-build3402-ALL/plugins/Parser_Stanford/src/gate/stanford/StanfordSentence.java

package gate.stanford;

import java.util.*;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.ling.TaggedWord;
import gate.*;
import gate.creole.ANNIEConstants;
import gate.util.OffsetComparator;

/**
 * The Stanford Parser itself takes as input a List of edu.stanford.nlp.ling.Word.
 * This data structure is constructed from a Sentence Annotation, using the enclosed
 * Token Annotations, and yields the required List, as well as methods for
 * converting the parser's output spans into GATE Annotation offsets.
 *
 * <p>Tokens are numbered from 0 in document order within the sentence.  Token
 * number <i>i</i> starts at Stanford position <i>i</i> and ends at Stanford
 * position <i>i</i> + 1.</p>
 *
 * @author Adam Funk
 *
 */
public class StanfordSentence {

  private static final String  POS_TAG_FEATURE    = ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME;
  private static final String  STRING_FEATURE     = ANNIEConstants.TOKEN_STRING_FEATURE_NAME;

  /* This is probably dodgy, but I can't find an "unknown" tag
   * in the Penn documentation.    */
  private static final String  UNKNOWN_TAG     = "NN";

  /** Stanford start position of a token --> GATE start offset of that token. */
  private final Map<Integer, Long>       startPosToOffset;
  /** Stanford end position of a token --> GATE end offset of that token. */
  private final Map<Integer, Long>       endPosToOffset;
  /** Stanford start position of a token --> the token Annotation itself. */
  private final Map<Integer, Annotation> startPosToToken;
  /** The words handed to the parser, in the same order as {@link #tokens}. */
  private final List<Word>               words;
  private final Long                     sentenceStartOffset, sentenceEndOffset;
  /** The sentence's token Annotations, sorted with an OffsetComparator. */
  private final List<Annotation>         tokens;

  /* Deliberately left package-visible, as in the original class. */
  int nbrOfTokens, nbrOfMissingPosTags;


  /**
   * Collect the annotations of type <code>tokenType</code> that
   * <code>inputAS</code> reports as contained in the span of
   * <code>sentence</code>, sort them by offset, and build the word list and
   * the position-to-offset tables from them.
   *
   * @param sentence   the Sentence annotation whose span delimits the tokens
   * @param tokenType  annotation type of the tokens to use
   * @param inputAS    annotation set in which the tokens are looked up
   * @param usePosTags if true, each word is a TaggedWord carrying the token's
   *                   POS tag (or a default tag when the token has none);
   *                   if false, each word is a plain Word
   * @throws NullPointerException if a token has no string feature
   */
  @SuppressWarnings("unchecked")
  public StanfordSentence(Annotation sentence, String tokenType,
    AnnotationSet inputAS, boolean usePosTags) {

    startPosToOffset = new HashMap<Integer, Long>();
    endPosToOffset   = new HashMap<Integer, Long>();
    startPosToToken  = new HashMap<Integer, Annotation>();

    sentenceStartOffset = sentence.getStartNode().getOffset();
    sentenceEndOffset   = sentence.getEndNode().getOffset();

    nbrOfTokens         = 0;
    nbrOfMissingPosTags = 0;

    tokens = new ArrayList<Annotation>(
        inputAS.getContained(sentenceStartOffset, sentenceEndOffset).get(tokenType));
    java.util.Collections.sort(tokens, new OffsetComparator());

    words = new ArrayList<Word>(tokens.size());
    int tokenNo = 0;

    for (Annotation token : tokens) {
      String tokenString = escapeToken(requireTokenString(token));
      add(tokenNo, token);

      /* The FAQ says the parser will automatically use existing POS tags
       * if the List elements are of type TaggedWord.
       * http://nlp.stanford.edu/software/parser-faq.shtml#f
       */
      if (usePosTags) {
        words.add(new TaggedWord(tokenString, getEscapedPosTag(token)));
      }
      else {
        words.add(new Word(tokenString));
      }

      tokenNo++;
    }

    nbrOfTokens = tokenNo;
  }


  /**
   * Fetch the string feature of a token, failing with an informative message
   * (rather than an anonymous NullPointerException) when it is absent.
   *
   * @param token the token Annotation
   * @return the string form of the token's string feature
   * @throws NullPointerException if the feature is missing or null
   */
  private static String requireTokenString(Annotation token) {
    Object value = token.getFeatures().get(STRING_FEATURE);

    if (value == null) {
      throw new NullPointerException("Token annotation at offsets "
          + token.getStartNode().getOffset() + "-"
          + token.getEndNode().getOffset()
          + " has no '" + STRING_FEATURE + "' feature");
    }

    return value.toString();
  }


  /**
   * Get the escaped POS tag of a token.  If the token's POS feature is
   * missing or is not a String, the default tag is used instead and the
   * missing-tag counter is incremented.
   *
   * @param token the token Annotation
   * @return the escaped POS tag to give to the parser
   */
  private String getEscapedPosTag(Annotation token)  {
    // A single get() covers both "key absent" and "value not a String".
    Object value = token.getFeatures().get(POS_TAG_FEATURE);

    if (value instanceof String) {
      return escapePosTag((String) value);
    }

    nbrOfMissingPosTags++;
    return escapePosTag(UNKNOWN_TAG);
  }


  /**
   * Record one token in the position tables: token number n starts at
   * Stanford position n and ends at Stanford position n + 1.
   *
   * @param tokenNbr zero-based number of the token within the sentence
   * @param token    the token Annotation
   */
  private void add(int tokenNbr, Annotation token) {
    Integer startPos = Integer.valueOf(tokenNbr);
    Integer endPos   = Integer.valueOf(tokenNbr + 1);

    startPosToOffset.put(startPos, token.getStartNode().getOffset());
    endPosToOffset.put(endPos, token.getEndNode().getOffset());
    startPosToToken.put(startPos, token);
  }


  /**
   * @return the number of tokens found in the sentence
   */
  public int numberOfTokens() {
    return nbrOfTokens;
  }

  /**
   * @return the number of tokens for which the default POS tag had to be
   *         used; this is only counted when POS tags were requested in the
   *         constructor
   */
  public int numberOfMissingPosTags() {
    return nbrOfMissingPosTags;
  }

  /**
   * @return true if the sentence contains at least one token
   */
  public boolean isNotEmpty() {
    return (nbrOfTokens > 0);
  }


  /**
   * Change the Token's string to match the Penn Treebank's
   * escaping system.
   * See Stanford parser FAQ "How can I provide the correct tokenization of my
   * sentence to the parser?"
   *
   * @param token original string feature of Token
   * @return escaped version of string
   */
  protected static String escapeToken(String token) {
    //   (  -->  -LRB-
    if (token.equals("(")) {
      return "-LRB-";
    }

    //   )  -->  -RRB-
    if (token.equals(")")) {
      return "-RRB-";
    }

    //   /  -->  \/
    //   *  -->  \*
    // replace() leaves the string unchanged when there is nothing to escape.
    return token.replace("/", "\\/").replace("*", "\\*");
  }


  /**
   * Escape a POS tag for the parser: the bracket tags become -LRB- and -RRB-;
   * any other tag is returned unchanged.
   *
   * @param tag original POS tag
   * @return escaped version of the tag
   */
  protected static String escapePosTag(String tag) {
    //   (  -->  -LRB-
    if (tag.equals("(")) {
      return "-LRB-";
    }

    //   )  -->  -RRB-
    if (tag.equals(")")) {
      return "-RRB-";
    }

    return tag;
  }


  /**
   * Reverse of {@link #escapePosTag(String)}: -LRB- and -RRB- become the
   * bracket tags again; any other tag is returned unchanged.
   *
   * @param tag POS tag as escaped for the parser
   * @return unescaped version of the tag
   */
  protected static String unescapePosTag(String tag) {
    //   (  <--  -LRB-
    if (tag.equals("-LRB-")) {
      return "(";
    }

    //   )  <--  -RRB-
    if (tag.equals("-RRB-")) {
      return ")";
    }

    return tag;
  }


  /* Explanation of the position conversion:
   * The output of the Stanford Parser specifies each constituent's span in terms of
   * token boundaries re-numbered within each sentence, which we need to convert to
   * GATE character offsets within the whole document.
   *
   * Example: "This is a test." starting at offset 100, containing five tokens.
   * Stanford says "This" starts at 0 and ends at 1; GATE says 100 to 104.
   * Stanford says "is a test" starts at 1 and ends at 4;
   * GATE says 105 to 114.
   */

  /**
   * Convert a Stanford start position to the GATE Annotation of type
   * "Token" that starts there.
   *
   * @param startPos Stanford start position
   * @return the token Annotation, or null if no token starts at that position
   */
  public Annotation startPos2token(int startPos) {
    return startPosToToken.get(Integer.valueOf(startPos));
  }

  /**
   * Convert a Stanford start position to a GATE offset.
   *
   * @param startPos Stanford start position
   * @return the offset in the GATE document, or null if no token starts at
   *         that position
   */
  public Long startPos2offset(int startPos) {
    return startPosToOffset.get(Integer.valueOf(startPos));
  }

  /**
   * Convert a Stanford end position to a GATE offset.
   *
   * @param endPos Stanford end position
   * @return the offset in the GATE document, or null if no token ends at
   *         that position
   */
  public Long endPos2offset(int endPos) {
    return endPosToOffset.get(Integer.valueOf(endPos));
  }


  /**
   * @return The data structure that is passed to the Stanford Parser itself.
   */
  public List<Word> getWordList() {
    return words;
  }
}