/*
 * Copyright (c) 2006-2016, The University of Sheffield. See the file
 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * $Id: Parser.java 17831 2014-04-15 09:37:23Z ian_roberts $
 */
package gate.stanford;

import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.parser.lexparser.TreebankLangParserParams;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Trees;
import edu.stanford.nlp.trees.TypedDependency;
import gate.Annotation;
import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.ProcessingResource;
import gate.Resource;
import gate.creole.ANNIEConstants;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ExecutionInterruptedException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.creole.metadata.Sharable;
import gate.util.Files;
import gate.util.InvalidOffsetException;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.net.URISyntaxException;
import java.net.URL;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

/**
 * GATE PR wrapper around the Stanford Parser. This class expects to find Token
 * and Sentence annotations (such as those created by the ANNIE tokenizer and
 * splitter) already in the inputAS and transforms them into suitable data
 * structures, which it feeds to the LexicalizedParser. The parser's output can
 * be stored in the outputAS in various ways, controlled by CREOLE run-time
 * parameters.
 */
@CreoleResource(name = "StanfordParser", comment = "Stanford parser wrapper", helpURL = "http://gate.ac.uk/userguide/sec:parsers:stanford")
public class Parser extends AbstractLanguageAnalyser implements ProcessingResource {

  private static final long serialVersionUID = -3062171258011850283L;

  /** The wrapped parser; may be injected via the {@code @Sharable} setter. */
  protected LexicalizedParser stanfordParser;

  /* Type "SyntaxTreeNode" with feature "cat" is compatible with the
   * classic SyntaxTreeViewer. */
  public static final String PHRASE_ANNOTATION_TYPE = "SyntaxTreeNode";
  public static final String PHRASE_CAT_FEATURE = "cat";

  /* But "category" feature is compatible with the ANNIE POS tagger. */
  private static final String POS_TAG_FEATURE = ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME;

  /** Annotation type used for the dependency annotations this PR creates. */
  public static final String DEPENDENCY_ANNOTATION_TYPE = "Dependency";
  /** Feature holding the [governor, dependent] annotation IDs. */
  public static final String DEPENDENCY_ARG_FEATURE = "args";
  /** Feature holding the dependency relation name. */
  public static final String DEPENDENCY_LABEL_FEATURE = "kind";

  // Annotation set used for both input (Token/Sentence) and output.
  protected String annotationSetName;
  // URL of the serialized grammar file loaded into the LexicalizedParser.
  private URL parserFile;
  // Verbose output on stdout/stderr when true.
  protected boolean debugMode;
  // Re-use existing POS tags on Tokens instead of letting the parser tag.
  private boolean reusePosTags;
  // Stanford tag -> output tag translations loaded by loadTagMapping().
  private Map<String, String> tagMap;
  // Factory used to extract grammatical structures (dependencies) from parses.
  protected GrammaticalStructureFactory gsf;

  /* CREOLE parameters for optional mapping */
  private boolean useMapping = false;
  private URL mappingFileURL;

  /* internal variables for mapping */
  private File mappingFile;
  private boolean mappingLoaded = false;

  /* CREOLE parameters: what are we going to annotate, and how? */
  private String inputSentenceType;
  private String inputTokenType;
  private boolean addConstituentAnnotations;
  private boolean addDependencyFeatures;
  private boolean addDependencyAnnotations;
  private boolean addPosTags;
  private boolean includeExtraDependencies;
  private DependencyMode dependencyMode;

  /**
   * The {@link TreebankLangParserParams} implementation to use. This is
   * where we get the language pack, and then the
   * {@link GrammaticalStructureFactory} used to extract the
   * dependencies from the parse. In most cases you should leave this at
   * the default value, which is suitable for English text.
   */
  private String tlppClass;

  /**
   * The name of the feature to add to tokens. The feature value is a
   * {@link List} of {@link DependencyRelation} objects giving the
   * dependencies from this token to other tokens.
   */
  protected String dependenciesFeature = "dependencies";

  /**
   * Parse the current document. (This is the principal
   * method called by a CorpusController.)
*/
  public void execute() throws ExecutionException {
    interrupted = false;
    final long begunAt = System.currentTimeMillis();

    if (document == null) {
      throw new ExecutionException("No document to process!");
    }

    fireStatusChanged("Running " + this.getName() + " on " + document.getName());
    fireProgressChanged(0);

    if (debugMode) {
      System.out.println("Parsing document: " + document.getName());
    }
    // A mapping was requested but loadTagMapping() never succeeded.
    if (useMapping && !mappingLoaded) {
      System.err.println("Warning: no mapping loaded!");
    }
    checkInterruption();

    // Only run the parser if at least one kind of output is enabled.
    boolean anythingToDo = addConstituentAnnotations || addDependencyFeatures
        || addDependencyAnnotations || addPosTags;
    if (anythingToDo) {
      parseSentences(document.getAnnotations(annotationSetName));
    } else {
      System.err.println("There is nothing for the parser to do.");
      System.err.println("Please enable at least one of the \"add...\" options.");
    }

    fireProcessFinished();
    double elapsedSeconds = (double) (System.currentTimeMillis() - begunAt) / 1000;
    fireStatusChanged("Finished " + this.getName() + " on " + document.getName()
        + " in " + NumberFormat.getInstance().format(elapsedSeconds)
        + " seconds!");
  }

  /**
   * Initialize the Parser resource. In particular, load the trained data
   * file.
*/
  public Resource init() throws ResourceInstantiationException {
    instantiateStanfordParser();
    if (mappingFile != null) {
      loadTagMapping(mappingFile);
    }
    super.init();

    if (tlppClass == null || tlppClass.isEmpty()) {
      throw new ResourceInstantiationException("TLPP class name must be specified");
    }

    try {
      // Reflectively instantiate the language-pack parameters class, then
      // build the GrammaticalStructureFactory used for dependency extraction.
      Class<?> paramsClass = Class.forName(tlppClass);
      if (!TreebankLangParserParams.class.isAssignableFrom(paramsClass)) {
        throw new ResourceInstantiationException(paramsClass
            + " does not implement " + TreebankLangParserParams.class.getName());
      }
      TreebankLangParserParams params =
          TreebankLangParserParams.class.cast(paramsClass.newInstance());
      gsf = params.treebankLanguagePack().grammaticalStructureFactory();
    } catch (UnsupportedOperationException e) {
      throw new ResourceInstantiationException(e);
    } catch (ClassNotFoundException e) {
      throw new ResourceInstantiationException("Class " + tlppClass + " not found", e);
    } catch (InstantiationException e) {
      throw new ResourceInstantiationException("Error creating TLPP object", e);
    } catch (IllegalAccessException e) {
      throw new ResourceInstantiationException("Error creating TLPP object", e);
    }
    return this;
  }

  /**
   * Re-initialize the Parser resource. In particular, reload the trained
   * data file.
   */
  @Override
  public void reInit() throws ResourceInstantiationException {
    // Drop the cached parser so instantiateStanfordParser() reloads it.
    stanfordParser = null;
    init();
  }

  /**
   * Find all the Sentence annotations and iterate through them, parsing one
   * sentence at a time and storing the result in the output AS. (Sentences are
   * scanned for Tokens. You have to run the ANNIE tokenizer and splitter before
   * this PR.)
* @throws ExecutionInterruptedException */ private void parseSentences(AnnotationSet annotationSet) throws ExecutionInterruptedException { List<Annotation> sentences = gate.Utils.inDocumentOrder(annotationSet.get(inputSentenceType)); int sentencesDone = 0; int nbrSentences = sentences.size(); for (Annotation sentence : sentences) { parseOneSentence(annotationSet, sentence, sentencesDone, nbrSentences); sentencesDone++; checkInterruption(); } sentencesDone++; fireProgressChanged(100 * sentencesDone / nbrSentences); } /** * Generate the special data structure for one sentence and pass the List of * Word to the parser. Apply the annotations back to the document. * * @param sentence * the Sentence annotation * @param s * sentence number of debugging output * @param ofS * total number of sentences for debugging output * @return null if the sentence is empty * @throws ExecutionInterruptedException */ private void parseOneSentence(AnnotationSet annotationSet, Annotation sentence, int sentCtr, int sentCount) throws ExecutionInterruptedException { Tree tree; StanfordSentence stanfordSentence = new StanfordSentence(sentence, inputTokenType, annotationSet, reusePosTags); if (debugMode) { System.out.println(stanfordSentence.toString()); } /* Ignore an empty Sentence (sometimes the regex splitter can create one * with no Token annotations in it). */ if ( stanfordSentence.isNotEmpty() ) { List<Word> wordList = stanfordSentence.getWordList(); if (reusePosTags) { int nbrMissingTags = stanfordSentence.numberOfMissingPosTags(); if (nbrMissingTags > 0) { double percentMissing = Math.ceil(100.0 * (nbrMissingTags) / (stanfordSentence.numberOfTokens()) ); System.err.println("Warning (sentence " + sentCtr + "): " + (int) percentMissing + "% of the Tokens are missing POS tags." 
); } } tree = stanfordParser.parse(wordList); checkInterruption(); if (addConstituentAnnotations || addPosTags) { annotatePhraseStructureRecursively(annotationSet, stanfordSentence, tree, tree); } checkInterruption(); if (addDependencyFeatures || addDependencyAnnotations) { annotateDependencies(annotationSet, stanfordSentence, tree); } if (debugMode) { System.out.println("Parsed sentence " + sentCtr + " of " + sentCount); } } else if (debugMode) { System.out.println("Ignored empty sentence " + sentCtr + " of " + sentCount); } } /** * Generate a SyntaxTreeNode Annotation corresponding to this Tree. Work * recursively so that the annotations are actually generated from the * bottom up, in order to build the consists list of annotation IDs. * * @param tree the current subtree * @param rootTree the whole sentence, used to find the span of the current subtree * @return a GATE Annotation of type "SyntaxTreeNode" */ protected Annotation annotatePhraseStructureRecursively(AnnotationSet annotationSet, StanfordSentence stanfordSentence, Tree tree, Tree rootTree) { Annotation annotation = null; Annotation child; String label = tree.value(); List<Tree> children = tree.getChildrenAsList(); if (children.size() == 0) { return null; } /* implied else */ /* following line generates ClassCastException * IntPair span = tree.getSpan(); * edu.stanford.nlp.ling.CategoryWordTag * at edu.stanford.nlp.trees.Tree.getSpan(Tree.java:393) * but I think it's a bug in the parser, so I'm hacking * around it as follows. */ int startPos = Trees.leftEdge(tree, rootTree); int endPos = Trees.rightEdge(tree, rootTree); Long startNode = stanfordSentence.startPos2offset(startPos); Long endNode = stanfordSentence.endPos2offset(endPos); List<Integer> consists = new ArrayList<Integer>(); Iterator<Tree> childIter = children.iterator(); while (childIter.hasNext()) { child = annotatePhraseStructureRecursively(annotationSet, stanfordSentence, childIter.next(), rootTree); if ( (child != null) && (! 
child.getType().equals(inputTokenType) )) { consists.add(child.getId()); } } annotation = annotatePhraseStructureConstituent(annotationSet, startNode, endNode, label, consists, tree.depth()); return annotation; } /** * Record one constituent as an annotation. * * @param startOffset * @param endOffset * @param label * @param consists * @param depth * @return */ private Annotation annotatePhraseStructureConstituent(AnnotationSet annotationSet, Long startOffset, Long endOffset, String label, List<Integer> consists, int depth) { Annotation phrAnnotation = null; Integer phrID; try { String cat; if (useMapping && mappingLoaded) { cat = translateTag(label); } else { cat = label; } if (addConstituentAnnotations) { String text = document.getContent().getContent(startOffset, endOffset).toString(); FeatureMap fm = gate.Factory.newFeatureMap(); fm.put(PHRASE_CAT_FEATURE, cat); fm.put("text", text); /* Ignore empty list features on the token-equivalent annotations. */ if (consists.size() > 0) { fm.put("consists", consists); } phrID = annotationSet.add(startOffset, endOffset, PHRASE_ANNOTATION_TYPE, fm); phrAnnotation = annotationSet.get(phrID); recordID(annotationSet, phrID); } if ( addPosTags && (depth == 1) ) { /* Expected to be a singleton set! */ AnnotationSet tokenSet = annotationSet.get(inputTokenType, startOffset, endOffset); if (tokenSet.size() == 1) { Annotation token = tokenSet.iterator().next(); /* Add POS tag to token. * (Note: GATE/Hepple uses "(" and ")" for Penn/Stanford's * "-LRB-" and "-RRB-". 
*/ String hepCat = StanfordSentence.unescapePosTag(cat); token.getFeatures().put(POS_TAG_FEATURE, hepCat); } else { System.err.println("Found a tokenSet with " + tokenSet.size() + " members!"); } } } catch (InvalidOffsetException e) { e.printStackTrace(); } return phrAnnotation; } @SuppressWarnings("unchecked") private void annotateDependencies(AnnotationSet annotationSet, StanfordSentence stanfordSentence, Tree tree) { GrammaticalStructure gs = gsf.newGrammaticalStructure(tree); Collection<TypedDependency> dependencies = DependencyMode.getDependencies(gs, dependencyMode, includeExtraDependencies); if (dependencies == null) { if (debugMode) { System.out.println("dependencies == null"); } return; } String dependencyKind; FeatureMap depFeatures; Integer dependentTokenID, governorTokenID; List<Integer> argList; Long offsetLH0, offsetRH0, offsetLH1, offsetRH1, depLH, depRH; Annotation governor, dependent; for(TypedDependency dependency : dependencies) { if(debugMode) { System.out.println(dependency); } // Does not work in version 3.5.2 any more //int governorIndex = dependency.gov().label().index() - 1; int governorIndex = dependency.gov().index()-1; governor = stanfordSentence.startPos2token(governorIndex); //int dependentIndex = dependency.dep().label().index() - 1; int dependentIndex = dependency.dep().index()-1; dependent = stanfordSentence.startPos2token(dependentIndex); dependencyKind = dependency.reln().toString(); governorTokenID = governor.getId(); dependentTokenID = dependent.getId(); if (addDependencyFeatures) { List<DependencyRelation> depsForTok = (List<DependencyRelation>) governor.getFeatures().get(dependenciesFeature); if(depsForTok == null) { depsForTok = new ArrayList<DependencyRelation>(); governor.getFeatures().put(dependenciesFeature, depsForTok); } depsForTok.add(new DependencyRelation(dependencyKind, dependentTokenID)); } if (addDependencyAnnotations) { depFeatures = gate.Factory.newFeatureMap(); argList = new ArrayList<Integer>(); 
argList.add(governorTokenID); argList.add(dependentTokenID); depFeatures.put(DEPENDENCY_ARG_FEATURE, argList); depFeatures.put(DEPENDENCY_LABEL_FEATURE, dependencyKind); offsetLH0 = governor.getStartNode().getOffset(); offsetRH0 = governor.getEndNode().getOffset(); offsetLH1 = dependent.getStartNode().getOffset(); offsetRH1 = dependent.getEndNode().getOffset(); depLH = Math.min(offsetLH0, offsetLH1); depRH = Math.max(offsetRH0, offsetRH1); try { annotationSet.add(depLH, depRH, DEPENDENCY_ANNOTATION_TYPE, depFeatures); } catch(InvalidOffsetException e) { e.printStackTrace(); } } } } private void instantiateStanfordParser() throws ResourceInstantiationException { if(stanfordParser != null) return; try { //String filepath = Files.fileFromURL(parserFile).getAbsolutePath(); stanfordParser = LexicalizedParser.getParserFromSerializedFile(parserFile.toExternalForm()); } catch(Exception e) { throw new ResourceInstantiationException(e); } } private void loadTagMapping(File mappingFile) { tagMap = new HashMap<String, String>(); mappingLoaded = false; try { if (mappingFile.exists() && mappingFile.canRead()) { BufferedReader br = new BufferedReader(new FileReader(mappingFile)); String line = ""; // read until it reaches to an end of the file while((line = br.readLine()) != null) { // two columns delimited by whitespace String [] data = line.split("\\s+", 2); // are there key and value available if(data == null || data.length < 2) { continue; } else { // and add it to the map tagMap.put(data[0].trim(), data[1].trim()); } } br.close(); } else { System.err.println("Can't find or read mapping file " + mappingFile.getPath() + " so no mappings will be used."); } } catch(Exception e) { System.err.println("Exception trying to load mapping file " + mappingFile.getPath()); e.printStackTrace(); } int nbrMapped = tagMap.size(); System.out.println("Loaded " + nbrMapped + " mappings from file " + mappingFile); mappingLoaded = (nbrMapped > 0); } /** * This method stores the annotation ID as a 
value of feature "ID" on the * relevant annotation. (Mainly to make the ID visible in the GUI for * debugging.) * * @param annSet * @param annotationID */ private void recordID(AnnotationSet annSet, Integer annotationID) { annSet.get(annotationID).getFeatures().put("ID", annotationID); } private void checkInterruption() throws ExecutionInterruptedException { if(isInterrupted()) { throw new ExecutionInterruptedException( "Execution of " + this.getName() + " has been abruptly interrupted!"); } } /** * Translate the tag in the map, or leave it the same if there is no * translation. * * @param stanfordTag * @return */ private String translateTag(String stanfordTag) { String translatedTag = stanfordTag; if (tagMap.containsKey(stanfordTag)) { translatedTag = tagMap.get(stanfordTag); } return translatedTag; } /* get & set methods for the CREOLE parameters */ @CreoleParameter(comment = "TreebankLangParserParams implementation used to extract the dependencies", defaultValue = "edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams") public void setTlppClass(String tlppClass) { this.tlppClass = tlppClass; } public String getTlppClass() { return tlppClass; } @Optional @RunTime @CreoleParameter(comment = "annotationSet used for input (Token and " + "Sentence annotations) and output") public void setAnnotationSetName(String annotationSetName) { this.annotationSetName = annotationSetName; } public String getAnnotationSetName() { return this.annotationSetName; } @CreoleParameter(comment = "path to the parser's grammar file", defaultValue = "resources/englishRNN.ser.gz") public void setParserFile(URL parserFile) { this.parserFile = parserFile; } public URL getParserFile() { return this.parserFile; } @RunTime @CreoleParameter(comment = "The document to be processed") public void setDocument(gate.Document document) { this.document = document; } public gate.Document getDocument() { return this.document; } @RunTime @CreoleParameter(comment = "verbose mode for debugging", 
defaultValue = "false") public void setDebug(Boolean debug) { this.debugMode = debug.booleanValue(); } public Boolean getDebug() { return new Boolean(this.debugMode); } @RunTime @CreoleParameter(comment = "Re-use existing POS tags on tokens", defaultValue = "false") public void setReusePosTags(Boolean reusePosTags) { this.reusePosTags = reusePosTags.booleanValue(); } public Boolean getReusePosTags() { return new Boolean(this.reusePosTags); } @RunTime @CreoleParameter(comment = "Create POS tags on the Token annotations", defaultValue = "false") public void setAddPosTags(Boolean posTagTokens) { this.addPosTags = posTagTokens.booleanValue(); } public Boolean getAddPosTags() { return new Boolean(this.addPosTags); } @RunTime @CreoleParameter(comment = "use tag mapping", defaultValue = "false") public void setUseMapping(Boolean useMapping) { this.useMapping = useMapping.booleanValue(); } public Boolean getUseMapping() { return new Boolean(this.useMapping); } @RunTime @CreoleParameter(comment = "Create dependency features on Token annotations", defaultValue = "true") public void setAddDependencyFeatures(Boolean useDependency) { this.addDependencyFeatures = useDependency.booleanValue(); } public Boolean getAddDependencyFeatures() { return new Boolean(this.addDependencyFeatures); } @RunTime @CreoleParameter(comment = "Create annotations to show dependencies", defaultValue = "true") public void setAddDependencyAnnotations(Boolean useDependency) { this.addDependencyAnnotations = useDependency.booleanValue(); } public Boolean getAddDependencyAnnotations() { return new Boolean(this.addDependencyAnnotations); } @RunTime @CreoleParameter(comment = "input annotation type for each sentence", defaultValue = ANNIEConstants.SENTENCE_ANNOTATION_TYPE ) public void setInputSentenceType(String sType) { this.inputSentenceType = sType; } public String getInputSentenceType() { return this.inputSentenceType; } @RunTime @CreoleParameter(comment = "input annotation type for each token", 
defaultValue = ANNIEConstants.TOKEN_ANNOTATION_TYPE ) public void setInputTokenType(String tType) { this.inputTokenType = tType; } public String getInputTokenType() { return this.inputTokenType; } @RunTime @CreoleParameter(comment = "Create annotations to show phrase structures", defaultValue = "true") public void setAddConstituentAnnotations(Boolean usePhraseStructure) { this.addConstituentAnnotations = usePhraseStructure.booleanValue(); } public Boolean getAddConstituentAnnotations() { return new Boolean(this.addConstituentAnnotations); } @RunTime @CreoleParameter(comment = "Dependency Mode", defaultValue = "Typed") public void setDependencyMode(DependencyMode mode) { this.dependencyMode = mode; } public DependencyMode getDependencyMode() { return this.dependencyMode; } @RunTime @CreoleParameter(comment = "include extra dependencies", defaultValue = "false") public void setIncludeExtraDependencies(Boolean include) { this.includeExtraDependencies = include; } public Boolean getIncludeExtraDependencies() { return this.includeExtraDependencies; } /* Made mappingFile an init parameter to simplify things. * The CREOLE parameter is called "mappingFile" but it's actually a URL. */ @Optional @CreoleParameter(comment = "path to the tag mapping file") public void setMappingFile(URL mappingFileURL) { this.mappingFile = null; // override below this.mappingFileURL = mappingFileURL; if ( (this.mappingFileURL != null) && (! this.mappingFileURL.toString().trim().equals("")) ) { try { this.mappingFile = new File(this.mappingFileURL.toURI()); } catch(URISyntaxException e) { e.printStackTrace(); } } } public URL getMappingFile() { return this.mappingFileURL; } /** * Inject an existing instance of the LexicalizedParser. 
* <b>This method is intended for use by {@link Factory#duplicate}
   * and should not be called directly.</b>
   */
  @Sharable
  public void setStanfordParser(LexicalizedParser parser) {
    this.stanfordParser = parser;
  }

  /**
   * Get the LexicalizedParser used internally by this PR.
   * <b>This method is intended for use by {@link Factory#duplicate}
   * and should not be called directly.</b>
   */
  public LexicalizedParser getStanfordParser() {
    return stanfordParser;
  }
}