/* * Copyright (c) 2006-2016, The University of Sheffield. See the file * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see <http://www.gnu.org/licenses/>. * * Leon Derczynski, 22 Oct 2013 * * $Id: NER.java 15468 2013-10-22 21:13:15Z $ */ package gate.stanford; import java.io.InputStream; import java.net.URL; import java.text.NumberFormat; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.ListIterator; import java.util.zip.GZIPInputStream; import org.apache.log4j.Level; import org.apache.log4j.Logger; import edu.stanford.nlp.ie.AbstractSequenceClassifier; import edu.stanford.nlp.ie.crf.CRFClassifier; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreLabel; import gate.Annotation; import gate.AnnotationSet; import gate.Resource; import gate.Utils; import gate.creole.AbstractLanguageAnalyser; import gate.creole.ExecutionException; import gate.creole.ResourceInstantiationException; import gate.creole.metadata.CreoleParameter; import gate.creole.metadata.CreoleResource; import gate.creole.metadata.Optional; import gate.creole.metadata.RunTime; import gate.creole.metadata.Sharable; import gate.util.InvalidOffsetException; import gate.util.OffsetComparator; import gate.util.SimpleFeatureMapImpl; /** * This class is a wrapper for the Stanford NER tool v3.2.0. */ @CreoleResource(name = "Stanford NER", comment = "Stanford Named Entity Recogniser", icon = "ne-transducer", helpURL="http://gate.ac.uk/userguide/sec:misc:creole:stanford") public class NER extends AbstractLanguageAnalyser { private static final long serialVersionUID = -6001372186847970080L; public static final String TAG_DOCUMENT_PARAMETER_NAME = "document"; public static final String TAG_INPUT_AS_PARAMETER_NAME = "inputASName"; public static final String TAG_ENCODING_PARAMETER_NAME = "encoding"; public static final String BASE_TOKEN_ANNOTATION_TYPE_PARAMETER_NAME = "baseTokenAnnotationType"; public static final String BASE_SENTENCE_ANNOTATION_TYPE_PARAMETER_NAME = "baseSentenceAnnotationType"; public static final String TAG_OUTPUT_AS_PARAMETER_NAME = "outputASName"; public static final String TAG_OUTSIDE_LABEL = "outsideLabel"; @RunTime @Optional @CreoleParameter(comment = "Throw an exception when there are none of the required input annotations", defaultValue = "true") public void setFailOnMissingInputAnnotations(Boolean fail) { failOnMissingInputAnnotations = fail; } public Boolean getFailOnMissingInputAnnotations() { return failOnMissingInputAnnotations; } protected Boolean failOnMissingInputAnnotations = true; protected Logger logger = Logger.getLogger(this.getClass().getName()); @Override public Resource init() throws ResourceInstantiationException { if(tagger == null) { fireStatusChanged("Loading Stanford NER model"); try (InputStream in = modelFile.openStream(); GZIPInputStream gzipIn = new GZIPInputStream(in)){ tagger = CRFClassifier.getClassifier(gzipIn); } catch(Exception e) { throw new ResourceInstantiationException(e); } } return this; } @Override public void reInit() throws ResourceInstantiationException { tagger = null; init(); } @Override public void execute() throws ExecutionException { // check the parameters if(document == null) throw new ExecutionException("No document to process!"); AnnotationSet inputAS = document.getAnnotations(inputASName); AnnotationSet outputAS = document.getAnnotations(outputASName); if(baseTokenAnnotationType == null || baseTokenAnnotationType.trim().length() == 0) { throw new ExecutionException( "No base Token Annotation Type provided!"); } if(baseSentenceAnnotationType == null || baseSentenceAnnotationType.trim().length() == 0) { throw new ExecutionException( "No base Sentence Annotation Type provided!"); } AnnotationSet sentencesAS = inputAS.get(baseSentenceAnnotationType); AnnotationSet tokensAS = inputAS.get(baseTokenAnnotationType); if(sentencesAS != null && sentencesAS.size() > 0 && tokensAS != null && tokensAS.size() > 0) { long startTime = System.currentTimeMillis(); fireStatusChanged("NER searching " + document.getName()); fireProgressChanged(0); // prepare the input for CRFClassifier List<CoreLabel> sentenceForTagger = new ArrayList<CoreLabel>(); // define a comparator for annotations by start offset OffsetComparator offsetComparator = new OffsetComparator(); // read all the tokens and all the sentences List<Annotation> sentencesList = new ArrayList<Annotation>(sentencesAS); Collections.sort(sentencesList, offsetComparator); List<Annotation> tokensList = new ArrayList<Annotation>(tokensAS); Collections.sort(tokensList, offsetComparator); Iterator<Annotation> sentencesIter = sentencesList.iterator(); ListIterator<Annotation> tokensIter = tokensList.listIterator(); List<Annotation> tokensInCurrentSentence = new ArrayList<Annotation>(); Annotation currentToken = tokensIter.next(); int sentIndex = 0; int sentCnt = sentencesAS.size(); // go through sentence annotations in the document while(sentencesIter.hasNext()) { Annotation currentSentence = sentencesIter.next(); // reset sentence-level processing variables tokensInCurrentSentence.clear(); sentenceForTagger.clear(); // while we have sane tokens while(currentToken != null && currentToken.getEndNode().getOffset() .compareTo(currentSentence.getEndNode().getOffset()) <= 0) { // If we're only labelling Tokens within baseSentenceAnnotationType, // don't add the sentence if the Tokens aren't within the span of // baseSentenceAnnotationType if(currentToken.withinSpanOf(currentSentence)) { tokensInCurrentSentence.add(currentToken); // build a stanford nlp representation of the token and add it to the sequence CoreLabel currentLabel = new CoreLabel(); currentLabel.setWord((String)currentToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME)); sentenceForTagger.add(currentLabel); } currentToken = (tokensIter.hasNext() ? tokensIter.next() : null); } // if the sentence doesn't contain any tokens (which is a bit weird but // is possible) then don't try running the labeller if(sentenceForTagger.isEmpty()) continue; // run the labeller List<CoreLabel> taggerResults = tagger.classifySentence(sentenceForTagger); // add the results // make sure no malfunction occurred if(taggerResults.size() != tokensInCurrentSentence.size()) throw new ExecutionException( "NER labeller malfunction: the output size (" + taggerResults.size() + ") is different from the input size (" + tokensInCurrentSentence.size() + ")!"); // proceed through the annotated sequence Iterator<CoreLabel> resIter = taggerResults.iterator(); Iterator<Annotation> tokIter = tokensInCurrentSentence.iterator(); String previousLabel = outsideLabel; Long previousEnd = new Long(-1); Long entityStart = new Long(-1); //No idea why this was there so lets comment it out //Long entityEnd = new Long(-1); Annotation annot; String nerLabel = ""; while(resIter.hasNext()) { // for each labelled token.. annot = tokIter.next(); CoreLabel word = resIter.next(); nerLabel = word.get(CoreAnnotations.AnswerAnnotation.class); // falling edge transition: entity ends // guard against this triggering at document start if (!nerLabel.equals(previousLabel) && !previousLabel.equals(outsideLabel) && entityStart != -1) { // System.out.println("falling edge"); // get final bound; add new annotation in output AS try { outputAS.add(entityStart, previousEnd, previousLabel, new SimpleFeatureMapImpl()); } catch (InvalidOffsetException e) { System.out.println("Token alignment problem:" + e); } } // rising edge transition: entity starts if (!nerLabel.equals(previousLabel) && !nerLabel.equals(outsideLabel)) { // System.out.println("rising edge"); entityStart = annot.getStartNode().getOffset(); } // System.out.println(word.word() + "/" + nerLabel); previousLabel = nerLabel; previousEnd = annot.getEndNode().getOffset(); } // clean up, in case last token in sentence was in an entity if (!nerLabel.equals(outsideLabel)) { try { outputAS.add(entityStart, previousEnd, previousLabel, new SimpleFeatureMapImpl()); } catch (InvalidOffsetException e) { System.out.println("Token alignment problem:" + e); } } fireProgressChanged(sentIndex++ * 100 / sentCnt); } fireProcessFinished(); fireStatusChanged(document.getName() + " tagged in " + NumberFormat.getInstance().format( (double)(System.currentTimeMillis() - startTime) / 1000) + " seconds!"); } else { if(failOnMissingInputAnnotations) { throw new ExecutionException( "No sentences or tokens to process in document " + document.getName() + "\n" + "Please run a sentence splitter " + "and tokeniser first!"); } else { Utils .logOnce( logger, Level.INFO, "NE labeller: no sentence or token annotations in input document - see debug log for details."); logger.debug("No input annotations in document " + document.getName()); } } } public void setEncoding(String encoding) { this.encoding = encoding; } @Optional @RunTime @CreoleParameter(comment = "Input annotation set name", defaultValue = "") public void setInputASName(String newInputASName) { inputASName = newInputASName; } public String getInputASName() { return inputASName; } public String getEncoding() { return this.encoding; } public String getBaseTokenAnnotationType() { return this.baseTokenAnnotationType; } public String getBaseSentenceAnnotationType() { return this.baseSentenceAnnotationType; } @RunTime @CreoleParameter(comment = "Annotation type for what should be considered as atomic words to label with NEs", defaultValue = "Token") public void setBaseTokenAnnotationType(String baseTokenAnnotationType) { this.baseTokenAnnotationType = baseTokenAnnotationType; } @RunTime @CreoleParameter(comment = "Sentence-level annotation type", defaultValue = "Sentence") public void setBaseSentenceAnnotationType(String baseSentenceAnnotationtype) { this.baseSentenceAnnotationType = baseSentenceAnnotationtype; } public String getOutputASName() { return this.outputASName; } @Optional @RunTime @CreoleParameter(comment = "Output annotation set name", defaultValue = "") public void setOutputASName(String outputASName) { this.outputASName = outputASName; } @RunTime @CreoleParameter(comment = "Label used by model for tokens outside entities", defaultValue = "O") public void setOutsideLabel(String outsideLabel) { this.outsideLabel = outsideLabel; } public String getOutsideLabel() { return this.outsideLabel; } @CreoleParameter(comment = "Path to the NER model file", defaultValue = "resources/english.all.3class.distsim.crf.ser.gz", suffixes="tagger;model;gz") public void setModelFile(URL modelFile) { this.modelFile = modelFile; } public URL getModelFile() { return this.modelFile; } /** * For internal use by the duplication mechanism only. */ @Sharable public void setTagger(AbstractSequenceClassifier<CoreLabel> tagger) { this.tagger = tagger; } /** * For internal use by the duplication mechanism only. */ public AbstractSequenceClassifier<CoreLabel> getTagger() { return this.tagger; } protected AbstractSequenceClassifier<CoreLabel> tagger; private String inputASName; private String encoding; private String baseTokenAnnotationType; private String baseSentenceAnnotationType; private String outputASName; private String outsideLabel; private URL modelFile; }