package gate.stanford; import java.util.*; import edu.stanford.nlp.ling.Word; import edu.stanford.nlp.ling.TaggedWord; import gate.*; import gate.creole.ANNIEConstants; import gate.util.OffsetComparator; /** * The Stanford Parser itself takes as input a List of edu.stanford.nlp.ling.Word. * This data structure is constructed from a Sentence Annotation, using the enclosed * Token Annotations, and yields the required List, as well as methods for * converting the parser's output spans into GATE Annotation offsets. * * @author Adam Funk * */ public class StanfordSentence { private Map<Integer, Long> startPosToOffset; private Map<Integer, Long> endPosToOffset; private Map<Integer, Annotation> startPosToToken; private List<Word> words; private Long sentenceStartOffset, sentenceEndOffset; private List<Annotation> tokens; private static final String POS_TAG_FEATURE = ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME; private static final String STRING_FEATURE = ANNIEConstants.TOKEN_STRING_FEATURE_NAME; int nbrOfTokens, nbrOfMissingPosTags; /* This is probably dodgy, but I can't find an "unknown" tag * in the Penn documentation. */ private static final String UNKNOWN_TAG = "NN"; @SuppressWarnings("unchecked") public StanfordSentence(Annotation sentence, String tokenType, AnnotationSet inputAS, boolean usePosTags) { OffsetComparator offsetComparator = new OffsetComparator(); Annotation token; String tokenString; startPosToOffset = new HashMap<Integer, Long>(); endPosToOffset = new HashMap<Integer, Long>(); startPosToToken = new HashMap<Integer, Annotation>(); sentenceStartOffset = sentence.getStartNode().getOffset(); sentenceEndOffset = sentence.getEndNode().getOffset(); nbrOfTokens = 0; nbrOfMissingPosTags = 0; tokens = new ArrayList<Annotation>(inputAS.getContained(sentenceStartOffset, sentenceEndOffset).get(tokenType)); java.util.Collections.sort(tokens, offsetComparator); Iterator<Annotation> tokenIter = tokens.iterator(); words = new ArrayList<Word>(); int tokenNo = 0; while(tokenIter.hasNext()) { token = tokenIter.next(); tokenString = escapeToken(token.getFeatures().get(STRING_FEATURE).toString()); add(tokenNo, token); /* The FAQ says the parser will automatically use existing POS tags * if the List elements are of type TaggedWord. * http://nlp.stanford.edu/software/parser-faq.shtml#f */ if (usePosTags) { words.add(new TaggedWord(tokenString, getEscapedPosTag(token))); } else { words.add(new Word(tokenString)); } tokenNo++; } nbrOfTokens = tokenNo; } private String getEscapedPosTag(Annotation token) { String pos = UNKNOWN_TAG; FeatureMap tokenFeatures = token.getFeatures(); if (tokenFeatures.containsKey(POS_TAG_FEATURE)) { Object temp = tokenFeatures.get(POS_TAG_FEATURE); if (temp instanceof String) { pos = (String) temp; } else { nbrOfMissingPosTags++; } } else { nbrOfMissingPosTags++; } return escapePosTag(pos); } private void add(int tokenNbr, Annotation token) { Long tokenStartOffset = token.getStartNode().getOffset(); Long tokenEndOffset = token.getEndNode().getOffset(); Integer tokenNbrInt = new Integer(tokenNbr); startPosToOffset.put(tokenNbrInt, tokenStartOffset); endPosToOffset.put(new Integer(tokenNbr + 1), tokenEndOffset); startPosToToken.put(tokenNbrInt, token); } /* Explanation of the position conversion: * The output of the Stanford Parser specifies each constituent's span in terms of * token boundaries re-numbered within each sentence, which we need to convert to * GATE character offsets within the whole document. * * Example: "This is a test." starting at offset 100, containing five tokens. * Stanford says "This" starts at 0 and ends at 1; GATE says 100 to 104. * Stanford says "is a test" starts at 1 and ends at 4; * GATE says 105 to 114. */ public int numberOfTokens() { return nbrOfTokens; } public int numberOfMissingPosTags() { return nbrOfMissingPosTags; } public boolean isNotEmpty() { return (nbrOfTokens > 0); } /** * Change the Token's string to match the Penn Treebank's * escaping system. * See Stanford parser FAQ "How can I provide the correct tokenization of my * sentence to the parser?" * @param token original string feature of Token * @return escaped version of string */ protected static String escapeToken(String token) { // ( --> -LRB- if (token.equals("(")) { return "-LRB-"; } // ) --> -RRB- if (token.equals(")")) { return "-RRB-"; } // / --> \/ // * --> \* if (token.contains("/") || token.contains("*")) { return token.replace("/", "\\/").replace("*", "\\*"); } return token; } protected static String escapePosTag(String tag) { // ( --> -LRB- if (tag.equals("(")) { return "-LRB-"; } // ) --> -RRB- if (tag.equals(")")) { return "-RRB-"; } return tag; } protected static String unescapePosTag(String tag) { // ( <-- -LRB- if (tag.equals("-LRB-")) { return "("; } // ) <-- -RRB- if (tag.equals("-RRB-")) { return ")"; } return tag; } /** * Convert a Stanford start position to the GATE Annotation of type * "Token" that starts there. */ public Annotation startPos2token(int startPos) { return startPosToToken.get(new Integer(startPos)); } /** * Convert a Stanford start position to a GATE offset. * @param startPos * @return the offset in the GATE document */ public Long startPos2offset(int startPos) { return startPosToOffset.get(new Integer(startPos)); } /** * Convert a Stanford end position to a GATE offset. * @param endPos * @return the offset in the GATE document */ public Long endPos2offset(int endPos) { return endPosToOffset.get(new Integer(endPos)); } /** * @return The data structure that is passed to the Stanford Parser itself. */ public List<Word> getWordList() { return words; } }