package gate.stanford;
import edu.stanford.nlp.parser.lexparser.*;
import edu.stanford.nlp.trees.*;
import gate.*;
import gate.util.*;
import gate.creole.*;
import gate.creole.metadata.*;
import java.io.*;
import java.net.*;
import java.util.*;
import edu.stanford.nlp.ling.*;
/**
* GATE PR wrapper around the Stanford Parser. This class expects to find Token
* and Sentence annotations (such as those created by the ANNIE tokenizer and
* splitter) already in the annotation set named by the annotationSetName
* parameter and transforms them into suitable data structures, which it feeds
* to the LexicalizedParser. The parser's output is stored in that same
* annotation set in various ways, controlled by CREOLE run-time parameters.
* @author adam
*/
@CreoleResource(name = "StanfordParser", comment = "Stanford parser wrapper",
helpURL = "http://gate.ac.uk/userguide/sec:parsers:stanford")
public class Parser extends AbstractLanguageAnalyser
implements ProcessingResource {
private static final long serialVersionUID = -3062171258011850283L;
/* The wrapped Stanford parser; created by instantiateStanfordParser() from
* the file named by parserFile. */
protected edu.stanford.nlp.parser.lexparser.LexicalizedParser stanfordParser;
/* Type "SyntaxTreeNode" with feature "cat" is compatible with the
* classic SyntaxTreeViewer. */
/* Annotation type created for each phrase-structure constituent. */
private static final String OUTPUT_PHRASE_TYPE = "SyntaxTreeNode" ;
/* Feature on constituent annotations holding the (possibly mapped) label. */
private static final String PSG_TAG_FEATURE = "cat" ;
/* But "category" feature is compatible with the ANNIE POS tagger. */
/* Feature written on Token annotations when addPosTags is enabled. */
private static final String POS_TAG_FEATURE = ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME;
/* Annotation types read from the annotation set as parser input. */
private static final String inputSentenceType = ANNIEConstants.SENTENCE_ANNOTATION_TYPE;
private static final String inputTokenType = ANNIEConstants.TOKEN_ANNOTATION_TYPE;
/* Annotation type created for each dependency, and its features: "args"
* holds the list [governor token ID, dependent token ID] and "kind" holds
* the relation name. */
private static final String DEP_ANNOTATION_TYPE = "Dependency";
private static final String DEP_ARG_FEATURE = "args";
private static final String DEP_LABEL_FEATURE = "kind";
/* Name of the annotation set used for both input and output; null or ""
* selects the document's default annotation set (see convertASName). */
protected String annotationSetName;
/* The annotation set resolved from annotationSetName at the start of
* execute(). */
protected AnnotationSet annotationSet;
/* The document currently being processed. */
protected gate.Document document;
/* Location of the parser's grammar file. */
private URL parserFile;
/* When true, progress and dependency details are printed to System.out. */
protected boolean debugMode;
/* Passed to StanfordSentence when building the parser input for a sentence. */
private boolean reusePosTags;
/* Comparator used to sort the Sentence annotations before parsing. */
private OffsetComparator offsetComparator;
/* Tag translation table filled by loadTagMapping(). */
private Map<String, String> tagMap;
/* The sentence currently being processed; assigned in parseOneSentence()
* and read by the annotation methods to convert positions to offsets and
* tokens. */
protected StanfordSentence stanfordSentence;
/* Factory obtained in init() from the class named by tlppClass; used to
* extract dependencies from a parse tree. */
protected GrammaticalStructureFactory gsf;
/* CREOLE parameters for optional mapping */
private boolean useMapping = false;
private URL mappingFileURL;
/* internal variables for mapping */
/* File form of mappingFileURL (null if none was set or it could not be
* converted), and whether loadTagMapping() read at least one entry. */
private File mappingFile;
private boolean mappingLoaded = false;
/* CREOLE parameters: what are we going to annotate, and how? */
private boolean addConstituentAnnotations;
private boolean addDependencyFeatures;
private boolean addDependencyAnnotations;
private boolean addPosTags;
/**
* The {@link TreebankLangParserParams} implementation to use. This is
* where we get the language pack, and then the
* {@link GrammaticalStructureFactory} used to extract the
* dependencies from the parse. In most cases you should leave this at
* the default value, which is suitable for English text.
* <p>
* {@link #init()} rejects a null or empty value, and requires the named
* class to be assignable to {@link TreebankLangParserParams}.
*/
protected String tlppClass =
"edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams";
/**
 * Returns the configured {@link TreebankLangParserParams} class name.
 *
 * @return the current value of the tlppClass parameter
 */
public String getTlppClass() {
  return this.tlppClass;
}
/**
 * Sets the name of the {@link TreebankLangParserParams} implementation.
 * The value is only stored here; it is checked and loaded by
 * {@link #init()}.
 *
 * @param tlppClass fully qualified class name
 */
@CreoleParameter(
    comment = "Class name of the TreebankLangParserParams "
        + "implementation used to extract the dependencies",
    defaultValue =
        "edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams")
public void setTlppClass(final String tlppClass) {
  this.tlppClass = tlppClass;
}
/**
* The name of the feature to add to tokens. The feature value is a
* {@link List} of {@link DependencyRelation} objects giving the
* dependencies from this token to other tokens. The feature is only written
* when addDependencyFeatures is enabled, and the list is stored on the
* governor token of each dependency.
*/
protected String dependenciesFeature = "dependencies";
/**
 * Parse the current document. (This is the principal
 * method called by a CorpusController.)
 * <p>
 * Resolves the working annotation set from annotationSetName, then parses
 * every sentence if at least one of the "add..." output options
 * (constituent annotations, dependency features, dependency annotations or
 * POS tags) is enabled. If none is enabled a notice is printed to
 * System.err and nothing else happens. A warning is also printed when
 * useMapping is set but no mapping has been loaded.
 *
 * @throws ExecutionException if no document has been set
 */
public void execute() throws ExecutionException {
  /* Report a missing document explicitly instead of letting
   * convertASName() fail with a NullPointerException. */
  if (document == null) {
    throw new ExecutionException("No document to process!");
  }

  annotationSet = convertASName(annotationSetName);

  if (debugMode) {
    System.out.println("Parsing document: " + document.getName());
  }

  if (useMapping && (! mappingLoaded) ) {
    System.err.println("Warning: no mapping loaded!");
  }

  /* addPosTags counts as work too: parseSentences() writes POS tags onto
   * the Tokens even when no constituent annotations are requested. */
  if (addConstituentAnnotations || addDependencyFeatures
      || addDependencyAnnotations || addPosTags) {
    parseSentences();
  }
  else {
    System.err.println("There is nothing for the parser to do.");
    System.err.println("Please enable at least one of the \"add...\" options.");
  }
}
/**
 * Initialize the Parser resource. In particular, load the trained data
 * file, load the optional tag mapping, and obtain the
 * {@link GrammaticalStructureFactory} from the class named by tlppClass.
 *
 * @return this resource
 * @throws ResourceInstantiationException if tlppClass is null or empty,
 *         cannot be found, does not implement
 *         {@link TreebankLangParserParams} or cannot be instantiated, or
 *         if the parser itself cannot be created
 */
public Resource init() throws ResourceInstantiationException {
  /* Check the cheap precondition first so that a misconfiguration is
   * reported before the grammar file is loaded. */
  if(tlppClass == null || tlppClass.equals("")) {
    throw new ResourceInstantiationException(
        "TLPP class name must be specified");
  }

  instantiateStanfordParser();

  if (mappingFile != null) {
    loadTagMapping(mappingFile);
  }
  else {
    /* This method is also reached through reInit(); forget any mapping
     * loaded earlier if the mapping file has since been removed. */
    tagMap = null;
    mappingLoaded = false;
  }

  offsetComparator = new OffsetComparator();
  super.init();

  try {
    Class<?> tlppClassObj =
        Class.forName(tlppClass, true, Gate.getClassLoader());
    if(!TreebankLangParserParams.class.isAssignableFrom(tlppClassObj)) {
      throw new ResourceInstantiationException(tlppClassObj
          + " does not implement "
          + TreebankLangParserParams.class.getName());
    }
    /* Constructor.newInstance wraps exceptions thrown by the constructor
     * in InvocationTargetException, whereas Class.newInstance would let
     * them propagate undeclared. */
    TreebankLangParserParams tlpp = TreebankLangParserParams.class.cast(
        tlppClassObj.getDeclaredConstructor().newInstance());
    gsf = tlpp.treebankLanguagePack().grammaticalStructureFactory();
  }
  catch(ClassNotFoundException e) {
    throw new ResourceInstantiationException("Class " + tlppClass
        + " not found", e);
  }
  catch(NoSuchMethodException e) {
    throw new ResourceInstantiationException("Error creating TLPP object", e);
  }
  catch(java.lang.reflect.InvocationTargetException e) {
    throw new ResourceInstantiationException("Error creating TLPP object", e);
  }
  catch(InstantiationException e) {
    throw new ResourceInstantiationException("Error creating TLPP object", e);
  }
  catch(IllegalAccessException e) {
    throw new ResourceInstantiationException("Error creating TLPP object", e);
  }
  return this;
}
/**
 * Re-initialize the Parser resource by running {@link #init()} again,
 * which in particular reloads the trained data file.
 *
 * @throws ResourceInstantiationException if initialization fails
 */
@Override
public void reInit() throws ResourceInstantiationException {
  this.init();
}
/**
 * Parse every Sentence annotation in the working annotation set, in the
 * order given by offsetComparator, and store the requested output for each
 * one. (Sentences are scanned for Tokens, so the ANNIE tokenizer and
 * splitter have to be run before this PR.)
 * <p>
 * A sentence for which parseOneSentence returns null (an empty sentence)
 * is skipped.
 */
@SuppressWarnings("unchecked")
private void parseSentences() {
  List<Annotation> orderedSentences =
      new ArrayList<Annotation>(annotationSet.get(inputSentenceType));
  java.util.Collections.sort(orderedSentences, offsetComparator);

  final int sentenceCount = orderedSentences.size();
  int sentenceIndex = 0;

  for (Annotation sentence : orderedSentences) {
    sentenceIndex++;
    Tree parse = parseOneSentence(sentence, sentenceIndex);

    /* A null parse is the result of an empty Sentence. */
    if (parse == null) {
      if (debugMode) {
        System.out.println("Ignored empty sentence " + sentenceIndex + " of " + sentenceCount);
      }
      continue;
    }

    if (addConstituentAnnotations || addPosTags) {
      annotatePhraseStructureRecursively(parse, parse);
    }

    if (addDependencyFeatures || addDependencyAnnotations) {
      annotateDependencies(parse);
    }

    if (debugMode) {
      System.out.println("Parsed sentence " + sentenceIndex + " of " + sentenceCount);
    }
  }
}
/**
 * Build the {@link StanfordSentence} for one Sentence annotation (stored in
 * the stanfordSentence field for later use by the annotation methods) and
 * pass its list of Word to the parser.
 * <p>
 * When reusePosTags is on and some tokens lack a POS tag, a warning giving
 * the percentage of such tokens (rounded up) is printed to System.err.
 *
 * @param sentence
 *          the Sentence annotation
 * @param sentenceNo
 *          sentence number, used only in the warning message
 * @return the parser's best parse, or null if the sentence is empty
 */
private Tree parseOneSentence(Annotation sentence, int sentenceNo) {
  stanfordSentence = new StanfordSentence(sentence, inputTokenType, annotationSet, reusePosTags);

  /* Ignore an empty Sentence (sometimes the regex splitter can create one
   * with no Token annotations in it).
   */
  if (! stanfordSentence.isNotEmpty()) {
    return null;
  }

  List<Word> words = stanfordSentence.getWordList();

  if (reusePosTags) {
    int untagged = stanfordSentence.numberOfMissingPosTags();
    if (untagged > 0) {
      double percentMissing = Math.ceil(100.0 * ((float) untagged) /
          ((float) stanfordSentence.numberOfTokens()) );
      System.err.println("Warning (sentence " + sentenceNo + "): " + (int) percentMissing
          + "% of the Tokens are missing POS tags." );
    }
  }

  stanfordParser.parse(words);
  return stanfordParser.getBestParse();
}
/**
 * Process one subtree of a parse and, through
 * annotatePhraseStructureConstituent, record it in the annotation set.
 * Children are processed before their parent so that the parent's
 * "consists" list can be filled with the IDs of the child annotations.
 *
 * @param tree the current subtree
 * @param rootTree the whole sentence, used to find the span of the current subtree
 * @return the annotation returned by annotatePhraseStructureConstituent for
 *         this subtree, or null if the subtree has no children (a leaf)
 */
protected Annotation annotatePhraseStructureRecursively(Tree tree, Tree rootTree) {
  String label = tree.value();
  List<Tree> subtrees = tree.getChildrenAsList();

  /* Leaves produce no annotation. */
  if (subtrees.isEmpty()) {
    return null;
  }

  /* tree.getSpan() generated a ClassCastException
   * (edu.stanford.nlp.ling.CategoryWordTag, at
   * edu.stanford.nlp.trees.Tree.getSpan(Tree.java:393)), which looks like a
   * bug in the parser, so the span is computed from the subtree's edges
   * within the root tree instead. */
  int leftEdge = Trees.leftEdge(tree, rootTree);
  int rightEdge = Trees.rightEdge(tree, rootTree);

  Long startOffset = stanfordSentence.startPos2offset(leftEdge);
  Long endOffset = stanfordSentence.endPos2offset(rightEdge);

  List<Integer> childIds = new ArrayList<Integer>();
  for (Tree subtree : subtrees) {
    Annotation childAnnotation = annotatePhraseStructureRecursively(subtree, rootTree);
    boolean isToken = (childAnnotation != null)
        && childAnnotation.getType().equals(inputTokenType);
    if (childAnnotation != null && ! isToken) {
      childIds.add(childAnnotation.getId());
    }
  }

  return annotatePhraseStructureConstituent(startOffset, endOffset, label, childIds, tree.depth());
}
/**
 * Record one constituent in the annotation set.
 * <p>
 * If addConstituentAnnotations is on, a "SyntaxTreeNode" annotation is
 * created with the features "cat" (the label, translated through the tag
 * mapping when one is in use), "text" (the covered document text), "ID"
 * and, when non-empty, "consists". If addPosTags is on and depth is 1, the
 * label is also written as the POS tag of the single Token found between
 * the offsets; any other number of Tokens is reported on System.err.
 *
 * @param startOffset start offset of the constituent in the document
 * @param endOffset end offset of the constituent in the document
 * @param label the parser's label for the constituent
 * @param consists IDs of the child constituent annotations
 * @param depth depth of the subtree, as given by the caller
 * @return the new constituent annotation, or null if none was created
 *         (addConstituentAnnotations is off, or the offsets were invalid)
 */
private Annotation annotatePhraseStructureConstituent(Long startOffset, Long endOffset, String label,
    List<Integer> consists, int depth) {
  Annotation created = null;

  try {
    String cat = (useMapping && mappingLoaded) ? translateTag(label) : label;

    if (addConstituentAnnotations) {
      String coveredText = document.getContent().getContent(startOffset, endOffset).toString();
      FeatureMap features = gate.Factory.newFeatureMap();
      features.put(PSG_TAG_FEATURE, cat);
      features.put("text", coveredText);

      /* Ignore empty list features on the token-equivalent annotations. */
      if (! consists.isEmpty()) {
        features.put("consists", consists);
      }

      Integer newId = annotationSet.add(startOffset, endOffset, OUTPUT_PHRASE_TYPE, features);
      created = annotationSet.get(newId);
      recordID(annotationSet, newId);
    }

    if (addPosTags && (depth == 1)) {
      /* Expected to be a singleton set! */
      AnnotationSet coveredTokens = annotationSet.get(inputTokenType, startOffset, endOffset);

      if (coveredTokens.size() != 1) {
        System.err.println("Found a tokenSet with " + coveredTokens.size() + " members!");
      }
      else {
        /* Add POS tag to token.
         * (Note: GATE/Hepple uses "(" and ")" for Penn/Stanford's
         * "-LRB-" and "-RRB-".) */
        Annotation token = coveredTokens.iterator().next();
        token.getFeatures().put(POS_TAG_FEATURE, StanfordSentence.unescapePosTag(cat));
      }
    }
  }
  catch (InvalidOffsetException e) {
    e.printStackTrace();
  }

  return created;
}
/**
 * Extract the typed dependencies from a parse tree and record them.
 * <p>
 * For each dependency the governor and dependent Tokens are looked up in
 * the current stanfordSentence (the parser's indices are 1-based, hence the
 * "- 1"). If addDependencyFeatures is on, a {@link DependencyRelation} is
 * appended to the list held in the governor Token's dependenciesFeature. If
 * addDependencyAnnotations is on, a "Dependency" annotation spanning both
 * Tokens is added, with the Token IDs in "args" and the relation name in
 * "kind".
 *
 * @param tree the parse of the current sentence
 */
@SuppressWarnings("unchecked")
private void annotateDependencies(Tree tree) {
  GrammaticalStructure structure = gsf.newGrammaticalStructure(tree);
  Collection<TypedDependency> dependencies = structure.typedDependencies();

  for (TypedDependency dependency : dependencies) {
    if (debugMode) {
      System.out.println(dependency);
    }

    int governorPos = ((HasIndex) dependency.gov().label()).index() - 1;
    Annotation governorToken = stanfordSentence.startPos2token(governorPos);

    int dependentPos = ((HasIndex) dependency.dep().label()).index() - 1;
    Annotation dependentToken = stanfordSentence.startPos2token(dependentPos);

    String relationName = dependency.reln().toString();
    Integer governorId = governorToken.getId();
    Integer dependentId = dependentToken.getId();

    if (addDependencyFeatures) {
      FeatureMap governorFeatures = governorToken.getFeatures();
      List<DependencyRelation> relations =
          (List<DependencyRelation>) governorFeatures.get(dependenciesFeature);

      /* First dependency for this governor: create and attach the list. */
      if (relations == null) {
        relations = new ArrayList<DependencyRelation>();
        governorFeatures.put(dependenciesFeature, relations);
      }

      relations.add(new DependencyRelation(relationName, dependentId));
    }

    if (addDependencyAnnotations) {
      List<Integer> arguments = new ArrayList<Integer>();
      arguments.add(governorId);
      arguments.add(dependentId);

      FeatureMap annotationFeatures = gate.Factory.newFeatureMap();
      annotationFeatures.put(DEP_ARG_FEATURE, arguments);
      annotationFeatures.put(DEP_LABEL_FEATURE, relationName);

      Long governorStart = governorToken.getStartNode().getOffset();
      Long governorEnd = governorToken.getEndNode().getOffset();
      Long dependentStart = dependentToken.getStartNode().getOffset();
      Long dependentEnd = dependentToken.getEndNode().getOffset();

      /* The annotation covers both tokens, whichever comes first. */
      Long spanStart = Math.min(governorStart, dependentStart);
      Long spanEnd = Math.max(governorEnd, dependentEnd);

      try {
        annotationSet.add(spanStart, spanEnd, DEP_ANNOTATION_TYPE, annotationFeatures);
      }
      catch(InvalidOffsetException e) {
        e.printStackTrace();
      }
    }
  }
}
/**
 * Create the Stanford parser from the grammar file named by parserFile and
 * store it in the stanfordParser field.
 *
 * @throws ResourceInstantiationException if parserFile has not been set, or
 *         wrapping any exception raised while locating the file or
 *         constructing the parser
 */
private void instantiateStanfordParser()
    throws ResourceInstantiationException {
  /* parserFile is an optional parameter, so it may legitimately be unset;
   * say so clearly rather than surfacing a wrapped NullPointerException. */
  if (parserFile == null) {
    throw new ResourceInstantiationException(
        "The parserFile parameter must be set to the location of a parser grammar file");
  }

  try {
    String filepath = Files.fileFromURL(parserFile).getAbsolutePath();
    stanfordParser = new LexicalizedParser(filepath);
  }
  catch(Exception e) {
    throw new ResourceInstantiationException(e);
  }
}
/**
 * Load the tag mapping table from a text file into tagMap and set
 * mappingLoaded accordingly.
 * <p>
 * Each useful line holds two columns separated by whitespace: the tag to
 * translate and its replacement (the rest of the line). Lines with fewer
 * than two columns are skipped. Problems reading the file are reported on
 * System.err and leave whatever was read so far in the map; mappingLoaded
 * ends up true only if at least one entry was read.
 *
 * @param mappingFile the file to read
 */
private void loadTagMapping(File mappingFile) {
  tagMap = new HashMap<String, String>();
  mappingLoaded = false;

  try {
    if (mappingFile.exists() && mappingFile.canRead()) {
      /* NOTE: FileReader decodes with the platform default charset. */
      BufferedReader br = new BufferedReader(new FileReader(mappingFile));
      try {
        String line;
        // read until the end of the file
        while ((line = br.readLine()) != null) {
          // Trim first so that leading whitespace does not yield an empty
          // key; then split into two columns delimited by whitespace.
          String[] data = line.trim().split("\\s+", 2);

          // skip lines that do not provide both a key and a value
          if (data.length < 2) {
            continue;
          }

          tagMap.put(data[0], data[1].trim());
        }
      }
      finally {
        // release the file handle even if reading failed part-way
        br.close();
      }
    }
    else {
      System.err.println("Can't find or read mapping file "
          + mappingFile.getPath() + " so no mappings will be used.");
    }
  }
  catch(Exception e) {
    System.err.println("Exception trying to load mapping file "
        + mappingFile.getPath());
    e.printStackTrace();
  }

  int nbrMapped = tagMap.size();
  System.out.println("Loaded " + nbrMapped + " mappings from file " + mappingFile);
  mappingLoaded = (nbrMapped > 0);
}
/**
 * Copy an annotation's ID into a feature named "ID" on that same
 * annotation. (Mainly to make the ID visible in the GUI for debugging.)
 *
 * @param annSet the annotation set containing the annotation
 * @param annotationID the ID of the annotation to label
 */
private void recordID(AnnotationSet annSet, Integer annotationID) {
  Annotation target = annSet.get(annotationID);
  FeatureMap targetFeatures = target.getFeatures();
  targetFeatures.put("ID", annotationID);
}
/**
 * Look a tag up in the mapping table.
 *
 * @param stanfordTag the tag produced by the parser
 * @return the mapped tag if tagMap has an entry for stanfordTag, otherwise
 *         stanfordTag itself
 */
private String translateTag(String stanfordTag) {
  return tagMap.containsKey(stanfordTag)
      ? tagMap.get(stanfordTag)
      : stanfordTag;
}
/**
 * Resolve an annotation set name against the current document.
 *
 * @param name the annotation set name; null or "" means the default set
 * @return the document's default annotation set when name is null or
 *         empty, otherwise the set with the given name
 */
protected AnnotationSet convertASName(String name) {
  boolean useDefaultSet = (name == null) || "".equals(name);
  return useDefaultSet
      ? document.getAnnotations()
      : document.getAnnotations(name);
}
/* get & set methods for the CREOLE parameters */
/**
 * Sets the name of the annotation set that is read for Token and Sentence
 * annotations and that receives the output.
 *
 * @param annotationSetName the annotation set name
 */
@Optional
@RunTime
@CreoleParameter(
    comment = "annotationSet used for input (Token and "
        + "Sentence annotations) and output")
public void setAnnotationSetName(final String annotationSetName) {
  this.annotationSetName = annotationSetName;
}
/**
 * Returns the name of the annotation set used for input and output.
 *
 * @return the current value of the annotationSetName parameter
 */
public String getAnnotationSetName() {
  return annotationSetName;
}
/**
 * Sets the location of the parser's grammar file. The value is only stored
 * here; the file is loaded by instantiateStanfordParser().
 *
 * @param parserFile URL of the grammar file
 */
@Optional
@CreoleParameter(
    comment = "path to the parser's grammar file",
    defaultValue = "resources/englishPCFG.ser.gz")
public void setParserFile(final URL parserFile) {
  this.parserFile = parserFile;
}
/**
 * Returns the location of the parser's grammar file.
 *
 * @return the current value of the parserFile parameter
 */
public URL getParserFile() {
  return parserFile;
}
/**
 * Sets the document that the next call to execute() will process.
 *
 * @param document the document to be processed
 */
@RunTime
@CreoleParameter(comment = "The document to be processed")
public void setDocument(final gate.Document document) {
  this.document = document;
}
/**
 * Returns the document currently set for processing.
 *
 * @return the current value of the document parameter
 */
public gate.Document getDocument() {
  return document;
}
/**
 * Switches verbose debugging output on or off.
 *
 * @param debug the new setting; must not be null, since it is unboxed
 */
@RunTime
@CreoleParameter(
    comment = "verbose mode for debugging",
    defaultValue = "false")
public void setDebug(Boolean debug) {
  // auto-unboxing; equivalent to debug.booleanValue()
  this.debugMode = debug;
}
/**
 * Returns whether verbose debugging output is enabled.
 *
 * @return the debug flag, boxed with {@link Boolean#valueOf(boolean)}
 *         rather than the deprecated Boolean constructor, so no new
 *         object is allocated
 */
public Boolean getDebug() {
  return Boolean.valueOf(this.debugMode);
}
/**
 * Sets whether existing POS tags on the Tokens are passed on when the
 * parser input is built.
 *
 * @param reusePosTags the new setting; must not be null, since it is
 *          unboxed
 */
@RunTime
@CreoleParameter(
    comment = "Re-use existing POS tags on tokens",
    defaultValue = "false")
public void setReusePosTags(Boolean reusePosTags) {
  // auto-unboxing; equivalent to reusePosTags.booleanValue()
  this.reusePosTags = reusePosTags;
}
/**
 * Returns whether existing POS tags on the Tokens are re-used.
 *
 * @return the reusePosTags flag, boxed with
 *         {@link Boolean#valueOf(boolean)} rather than the deprecated
 *         Boolean constructor, so no new object is allocated
 */
public Boolean getReusePosTags() {
  return Boolean.valueOf(this.reusePosTags);
}
/**
 * Sets whether the parser's POS tags are written onto the Token
 * annotations.
 *
 * @param posTagTokens the new setting; must not be null, since it is
 *          unboxed
 */
@RunTime
@CreoleParameter(
    comment = "Create POS tags on the Token annotations",
    defaultValue = "false")
public void setAddPosTags(Boolean posTagTokens) {
  // auto-unboxing; equivalent to posTagTokens.booleanValue()
  this.addPosTags = posTagTokens;
}
/**
 * Returns whether POS tags are written onto the Token annotations.
 *
 * @return the addPosTags flag, boxed with {@link Boolean#valueOf(boolean)}
 *         rather than the deprecated Boolean constructor, so no new
 *         object is allocated
 */
public Boolean getAddPosTags() {
  return Boolean.valueOf(this.addPosTags);
}
/**
 * Sets whether constituent labels are translated through the tag mapping.
 * The translation is only applied if a mapping has actually been loaded.
 *
 * @param useMapping the new setting; must not be null, since it is unboxed
 */
@RunTime
@CreoleParameter(
    comment = "use tag mapping",
    defaultValue = "false")
public void setUseMapping(Boolean useMapping) {
  // auto-unboxing; equivalent to useMapping.booleanValue()
  this.useMapping = useMapping;
}
/**
 * Returns whether tag mapping has been requested.
 *
 * @return the useMapping flag, boxed with {@link Boolean#valueOf(boolean)}
 *         rather than the deprecated Boolean constructor, so no new
 *         object is allocated
 */
public Boolean getUseMapping() {
  return Boolean.valueOf(this.useMapping);
}
/**
 * Sets whether dependencies are recorded as a feature on the governor
 * Token annotations.
 *
 * @param useDependency the new setting; must not be null, since it is
 *          unboxed
 */
@RunTime
@CreoleParameter(
    comment = "Create dependency features on Token annotations",
    defaultValue = "true")
public void setAddDependencyFeatures(Boolean useDependency) {
  // auto-unboxing; equivalent to useDependency.booleanValue()
  this.addDependencyFeatures = useDependency;
}
/**
 * Returns whether dependency features are added to Token annotations.
 *
 * @return the addDependencyFeatures flag, boxed with
 *         {@link Boolean#valueOf(boolean)} rather than the deprecated
 *         Boolean constructor, so no new object is allocated
 */
public Boolean getAddDependencyFeatures() {
  return Boolean.valueOf(this.addDependencyFeatures);
}
/**
 * Sets whether a "Dependency" annotation is created for each dependency.
 *
 * @param useDependency the new setting; must not be null, since it is
 *          unboxed
 */
@RunTime
@CreoleParameter(
    comment = "Create annotations to show dependencies",
    defaultValue = "true")
public void setAddDependencyAnnotations(Boolean useDependency) {
  // auto-unboxing; equivalent to useDependency.booleanValue()
  this.addDependencyAnnotations = useDependency;
}
/**
 * Returns whether dependency annotations are created.
 *
 * @return the addDependencyAnnotations flag, boxed with
 *         {@link Boolean#valueOf(boolean)} rather than the deprecated
 *         Boolean constructor, so no new object is allocated
 */
public Boolean getAddDependencyAnnotations() {
  return Boolean.valueOf(this.addDependencyAnnotations);
}
/**
 * Sets whether a "SyntaxTreeNode" annotation is created for each
 * phrase-structure constituent.
 *
 * @param usePhraseStructure the new setting; must not be null, since it is
 *          unboxed
 */
@RunTime
@CreoleParameter(
    comment = "Create annotations to show phrase structures",
    defaultValue = "true")
public void setAddConstituentAnnotations(Boolean usePhraseStructure) {
  // auto-unboxing; equivalent to usePhraseStructure.booleanValue()
  this.addConstituentAnnotations = usePhraseStructure;
}
/**
 * Returns whether phrase-structure annotations are created.
 *
 * @return the addConstituentAnnotations flag, boxed with
 *         {@link Boolean#valueOf(boolean)} rather than the deprecated
 *         Boolean constructor, so no new object is allocated
 */
public Boolean getAddConstituentAnnotations() {
  return Boolean.valueOf(this.addConstituentAnnotations);
}
/* Made mappingFile an init parameter to simplify things.
 * The CREOLE parameter is called "mappingFile" but it's actually a URL.
 */
/**
 * Sets the location of the tag mapping file. The URL is stored as given
 * and, when it is non-null and non-blank, also converted to a
 * {@link File} for later loading by {@link #init()}. If the conversion
 * fails (malformed URI, or a URL that does not denote a local file) the
 * problem is reported on System.err and no mapping file is used.
 *
 * @param mappingFileURL URL of the mapping file; may be null
 */
@Optional
@CreoleParameter(comment = "path to the tag mapping file")
public void setMappingFile(URL mappingFileURL) {
  this.mappingFile = null; // override below
  this.mappingFileURL = mappingFileURL;

  if ( (this.mappingFileURL != null) &&
      (! this.mappingFileURL.toString().trim().equals("")) ) {
    try {
      this.mappingFile = new File(this.mappingFileURL.toURI());
    }
    catch(URISyntaxException e) {
      e.printStackTrace();
    }
    catch(IllegalArgumentException e) {
      /* File(URI) rejects URIs that are not hierarchical "file:" URIs
       * (e.g. http: or jar: URLs). Treat that like any other unusable
       * mapping location instead of letting the setter throw. */
      System.err.println("Mapping file URL " + this.mappingFileURL
          + " is not a local file so no mappings will be used.");
      e.printStackTrace();
    }
  }
}
/**
 * Returns the location of the tag mapping file exactly as it was set.
 *
 * @return the mapping file URL, or null if none was set
 */
public URL getMappingFile() {
  return mappingFileURL;
}
}