// POSTagger.java
/*
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Valentin Tablan, 01 Feb 2000
 *
 *  $Id: POSTagger.java 17699 2014-03-19 09:11:55Z markagreenwood $
 */
014 
015 package gate.creole;
016 
017 import gate.Annotation;
018 import gate.AnnotationSet;
019 import gate.Factory;
020 import gate.FeatureMap;
021 import gate.Resource;
022 import gate.Utils;
023 import gate.creole.metadata.CreoleParameter;
024 import gate.creole.metadata.CreoleResource;
025 import gate.creole.metadata.Optional;
026 import gate.creole.metadata.RunTime;
027 import gate.util.GateRuntimeException;
028 import gate.util.OffsetComparator;
029 
030 import java.text.NumberFormat;
031 import java.util.ArrayList;
032 import java.util.Collections;
033 import java.util.Comparator;
034 import java.util.Iterator;
035 import java.util.List;
036 import java.util.ListIterator;
037 
038 import org.apache.log4j.Level;
039 import org.apache.log4j.Logger;
040 /**
041  * This class is a wrapper for HepTag, Mark Hepple's POS tagger.
042  */
043 @CreoleResource(name = "ANNIE POS Tagger",
044         helpURL = "http://gate.ac.uk/userguide/sec:annie:tagger",
045         comment = "Mark Hepple's Brill-style POS tagger", icon="pos-tagger")
046 public class POSTagger extends AbstractLanguageAnalyser {
047 
048   private static final long serialVersionUID = 7680938864165071808L;
049 
050   public static final String
051     TAG_DOCUMENT_PARAMETER_NAME = "document";
052 
053   public static final String
054     TAG_INPUT_AS_PARAMETER_NAME = "inputASName";
055 
056   public static final String
057     TAG_LEXICON_URL_PARAMETER_NAME = "lexiconURL";
058 
059   public static final String
060     TAG_RULES_URL_PARAMETER_NAME = "rulesURL";
061 
062   public static final String
063       TAG_ENCODING_PARAMETER_NAME = "encoding";
064 
065   
066   public static final String
067     BASE_TOKEN_ANNOTATION_TYPE_PARAMETER_NAME = "baseTokenAnnotationType";
068 
069   public static final String
070   OUTPUT_ANNOTATION_TYPE_PARAMETER_NAME = "outputAnnotationType";
071   
072   public static final String
073   BASE_SENTENCE_ANNOTATION_TYPE_PARAMETER_NAME = "baseSentenceAnnotationType";
074 
075   public static final String
076     TAG_OUTPUT_AS_PARAMETER_NAME = "outputASName";
077 
078   @RunTime
079   @Optional
080   @CreoleParameter(
081     comment = "Throw an exception when there are none of the required input annotations",
082     defaultValue = "true")  
083   public void setFailOnMissingInputAnnotations(Boolean fail) {
084     failOnMissingInputAnnotations = fail;
085   }
086   public Boolean getFailOnMissingInputAnnotations() {
087     return failOnMissingInputAnnotations;
088   }
089   protected Boolean failOnMissingInputAnnotations = true;
090   
091   
092   @RunTime
093   @Optional
094   @CreoleParameter(
095     comment = "Should all Tokens be POS tagged or just those within baseSentenceAnnotationType?",
096     defaultValue = "true")  
097   public void setPosTagAllTokens(Boolean allTokens) {
098     posTagAllTokens = allTokens;
099   }
100   public Boolean getPosTagAllTokens() {
101     return posTagAllTokens;
102   }
103   protected Boolean posTagAllTokens = true;  // should all Tokens be POS tagged or just those within baseSentenceAnnotationType
104 
105   public POSTagger() {
106   }
107 
108   protected Logger logger = Logger.getLogger(this.getClass().getName());
109   
110   @Override
111   public Resource init()throws ResourceInstantiationException{
112     if(lexiconURL == null){
113       throw new ResourceInstantiationException(
114         "NoURL provided for the lexicon!");
115     }
116     if(rulesURL == null){
117       throw new ResourceInstantiationException(
118         "No URL provided for the rules!");
119     }
120     try{
121       tagger = new hepple.postag.POSTagger(lexiconURL,rulesURL, encoding);
122     }catch(Exception e){
123       throw new ResourceInstantiationException(e);
124     }
125     return this;
126   }
127 
128 
129   @Override
130   public void execute() throws ExecutionException{
131     //check the parameters
132     if(document == nullthrow new ExecutionException(
133       "No document to process!");
134     if(inputASName != null && inputASName.equals("")) inputASName = null;
135     AnnotationSet inputAS = (inputASName == null?
136                             document.getAnnotations() :
137                             document.getAnnotations(inputASName);
138 
139                            
140     if(baseTokenAnnotationType == null || baseTokenAnnotationType.trim().length()==0) {
141         throw new ExecutionException("No base Token Annotation Type provided!");
142     }
143 
144     if(outputASName != null && outputASName.equals("")) outputASName = null;
145         
146     if(baseSentenceAnnotationType == null || baseSentenceAnnotationType.trim().length()==0) {
147         throw new ExecutionException("No base Sentence Annotation Type provided!");
148     }
149     
150     if(outputAnnotationType == null || outputAnnotationType.trim().length()==0) {
151         throw new ExecutionException("No AnnotationType provided to store the new feature!");
152     }
153 
154     AnnotationSet sentencesAS = inputAS.get(baseSentenceAnnotationType);
155     AnnotationSet tokensAS = inputAS.get(baseTokenAnnotationType);
156     if(sentencesAS != null && sentencesAS.size() 0
157        && tokensAS != null && tokensAS.size() 0){
158       long startTime = System.currentTimeMillis();
159       fireStatusChanged("POS tagging " + document.getName());
160       fireProgressChanged(0);
161       //prepare the input for HepTag
162       List<String> sentenceForTagger = new ArrayList<String>();
163       List<List<String>> sentencesForTagger = new ArrayList<List<String>>(1);
164       sentencesForTagger.add(sentenceForTagger);
165 
166       //define a comparator for annotations by start offset
167       Comparator<Annotation> offsetComparator = new OffsetComparator();
168 
169       //read all the tokens and all the sentences
170       List<Annotation> sentencesList = new ArrayList<Annotation>(sentencesAS);
171       Collections.sort(sentencesList, offsetComparator);
172       List<Annotation> tokensList = new ArrayList<Annotation>(tokensAS);
173       Collections.sort(tokensList, offsetComparator);
174 
175       Iterator<Annotation> sentencesIter = sentencesList.iterator();
176       ListIterator<Annotation> tokensIter = tokensList.listIterator();
177 
178       List<Annotation> tokensInCurrentSentence = new ArrayList<Annotation>();
179       Annotation currentToken = tokensIter.next();
180       int sentIndex = 0;
181       int sentCnt = sentencesAS.size();
182       while(sentencesIter.hasNext()){
183         Annotation currentSentence = sentencesIter.next();
184         tokensInCurrentSentence.clear();
185         sentenceForTagger.clear();
186         while(currentToken != null
187               &&
188               currentToken.getEndNode().getOffset().compareTo(
189               currentSentence.getEndNode().getOffset()) <= 0){
190           // If we're only POS tagging Tokens within baseSentenceAnnotationType, don't add the sentence if the Tokens aren't within the span of baseSentenceAnnotationType
191           if (posTagAllTokens || currentToken.withinSpanOf(currentSentence)) {
192             tokensInCurrentSentence.add(currentToken);
193             sentenceForTagger.add((String)currentToken.getFeatures().
194                                 get(TOKEN_STRING_FEATURE_NAME));
195           }
196           currentToken = (tokensIter.hasNext() ?
197                                      tokensIter.next() null);
198         }
199         //run the POS tagger
200         List<List<String[]>> taggerList = tagger.runTagger(sentencesForTagger);
201         if(taggerList != null && taggerList.size() 0){
202           List<String[]> taggerResults = taggerList.get(0);
203           //add the results
204           //make sure no malfunction occurred
205           if(taggerResults.size() != tokensInCurrentSentence.size())
206             throw new ExecutionException(
207                 "POS Tagger malfunction: the output size (" +
208                 taggerResults.size() +
209                 ") is different from the input size (" +
210                 tokensInCurrentSentence.size() ")!");
211           Iterator<String[]> resIter = taggerResults.iterator();
212           Iterator<Annotation> tokIter = tokensInCurrentSentence.iterator();
213           while(resIter.hasNext()){
214               Annotation annot = tokIter.next();
215               addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, resIter.next()[1]);
216           }
217         }
218         fireProgressChanged(sentIndex++ * 100 / sentCnt);
219       }//while(sentencesIter.hasNext())
220 
221       if(currentToken != null && posTagAllTokens){ // Tag remaining Tokens if we are not considering those only within baseSentenceAnnotationType
222         //we have remaining tokens after the last sentence
223         tokensInCurrentSentence.clear();
224         sentenceForTagger.clear();
225         while(currentToken != null){
226           tokensInCurrentSentence.add(currentToken);
227           sentenceForTagger.add((String)currentToken.getFeatures().
228                                 get(TOKEN_STRING_FEATURE_NAME));
229           currentToken = (tokensIter.hasNext() ?
230                                       tokensIter.next() null);
231         }
232         //run the POS tagger
233         List<String[]> taggerResults = tagger.runTagger(sentencesForTagger).get(0);
234         //add the results
235         //make sure no malfunction occurred
236         if(taggerResults.size() != tokensInCurrentSentence.size())
237           throw new ExecutionException(
238               "POS Tagger malfunction: the output size (" +
239               taggerResults.size() +
240               ") is different from the input size (" +
241               tokensInCurrentSentence.size() ")!");
242         Iterator<String[]> resIter = taggerResults.iterator();
243         Iterator<Annotation> tokIter = tokensInCurrentSentence.iterator();
244         while(resIter.hasNext()){
245             Annotation annot = tokIter.next();
246             addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, resIter.next()[1]);
247         }
248       }//if(currentToken != null)
249       fireProcessFinished();
250       fireStatusChanged(
251         document.getName() " tagged in " +
252         NumberFormat.getInstance().format(
253         (double)(System.currentTimeMillis() - startTime1000+
254         " seconds!");
255     }else{
256       if(failOnMissingInputAnnotations) {
257         throw new ExecutionException("No sentences or tokens to process in document "+document.getName()+"\n" +
258                                      "Please run a sentence splitter "+
259                                      "and tokeniser first!");
260       else {
261         Utils.logOnce(logger,Level.INFO,"POS tagger: no sentence or token annotations in input document - see debug log for details.");
262         logger.debug("No input annotations in document "+document.getName());
263       }
264     }
265 
266 //OLD version
267 /*
268     AnnotationSet as = inputAS.get(SENTENCE_ANNOTATION_TYPE);
269     if(as != null && as.size() > 0){
270       List sentences = new ArrayList(as);
271       Collections.sort(sentences, offsetComparator);
272       Iterator sentIter = sentences.iterator();
273       int sentIndex = 0;
274       int sentCnt = sentences.size();
275       long startTime= System.currentTimeMillis();
276       while(sentIter.hasNext()){
277 start = System.currentTimeMillis();
278         Annotation sentenceAnn = (Annotation)sentIter.next();
279         AnnotationSet rangeSet = inputAS.get(
280                                   sentenceAnn.getStartNode().getOffset(),
281                                   sentenceAnn.getEndNode().getOffset());
282         if(rangeSet == null) continue;
283         AnnotationSet tokensSet = rangeSet.get(TOKEN_ANNOTATION_TYPE);
284         if(tokensSet == null) continue;
285         List tokens = new ArrayList(tokensSet);
286         Collections.sort(tokens, offsetComparator);
287 
288 //          List tokens = (List)sentenceAnn.getFeatures().get("tokens");
289         List sentence = new ArrayList(tokens.size());
290         Iterator tokIter = tokens.iterator();
291         while(tokIter.hasNext()){
292           Annotation token = (Annotation)tokIter.next();
293           String text = (String)token.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
294           sentence.add(text);
295         }//while(tokIter.hasNext())
296 
297         //run the POSTagger over this sentence
298         List sentences4tagger = new ArrayList(1);
299         sentences4tagger.add(sentence);
300 prepTime += System.currentTimeMillis() - start;
301 start = System.currentTimeMillis();
302         List taggerResults = tagger.runTagger(sentences4tagger);
303 posTime += System.currentTimeMillis() - start;
304 start = System.currentTimeMillis();
305         //add the results to the output annotation set
306         //we only get one sentence
307         List sentenceFromTagger = (List)taggerResults.get(0);
308         if(sentenceFromTagger.size() != sentence.size()){
309           String taggerResult = "";
310           for(int i = 0; i< sentenceFromTagger.size(); i++){
311             taggerResult += ((String[])sentenceFromTagger.get(i))[1] + ", ";
312           }
313           throw new GateRuntimeException(
314             "POS Tagger malfunction: the output size (" +
315             sentenceFromTagger.size() +
316             ") is different from the input size (" +
317             sentence.size() + ")!" +
318             "\n Input: " + sentence + "\nOutput: " + taggerResult);
319         }
320         for(int i = 0; i< sentence.size(); i++){
321           String category = ((String[])sentenceFromTagger.get(i))[1];
322           Annotation token = (Annotation)tokens.get(i);
323           token.getFeatures().
324             put(TOKEN_CATEGORY_FEATURE_NAME, category);
325         }//for(i = 0; i<= sentence.size(); i++)
326 postTime += System.currentTimeMillis() - start;
327         fireProgressChanged(sentIndex++ * 100 / sentCnt);
328       }//while(sentIter.hasNext())
329 Out.prln("POS preparation time:" + prepTime);
330 Out.prln("POS execution time:" + posTime);
331 Out.prln("POS after execution time:" + postTime);
332         fireProcessFinished();
333         long endTime = System.currentTimeMillis();
334         fireStatusChanged(document.getName() + " tagged in " +
335                         NumberFormat.getInstance().format(
336                         (double)(endTime - startTime) / 1000) + " seconds!");
337     }else{
338       throw new GateRuntimeException("No sentences to process!\n" +
339                                      "Please run a sentence splitter first!");
340     }//if(as != null && as.size() > 0)
341 */
342   }
343 
344 
345   protected void addFeatures(Annotation annot, String featureName, String featureValuethrows GateRuntimeException {
346       String tempIASN = inputASName == null "" : inputASName;
347       String tempOASN = outputASName == null "" : outputASName;
348       if(outputAnnotationType.equals(baseTokenAnnotationType&& tempIASN.equals(tempOASN)) {
349           annot.getFeatures().put(featureName, featureValue);
350           return;
351       else {
352           int start = annot.getStartNode().getOffset().intValue();
353           int end = annot.getEndNode().getOffset().intValue();
354           
355           // get the annotations of type outputAnnotationType
356           AnnotationSet outputAS = (outputASName == null?
357                   document.getAnnotations() :
358                   document.getAnnotations(outputASName);
359           AnnotationSet annotations = outputAS.get(outputAnnotationType);
360           if(annotations == null || annotations.size() == 0) {
361               // add new annotation
362               FeatureMap features = Factory.newFeatureMap();
363               features.put(featureName, featureValue);
364               try {
365                   outputAS.add(new Long(start)new Long(end), outputAnnotationType, features);
366               catch(Exception e) {
367                   throw new GateRuntimeException("Invalid Offsets");
368               }
369           else {
370               // search for the annotation if there is one with the same start and end offsets
371               List<Annotation> tempList = new ArrayList<Annotation>(annotations.get());
372               boolean found = false;
373               for(int i=0;i<tempList.size();i++) {
374                   Annotation annotation = tempList.get(i);
375                   if(annotation.getStartNode().getOffset().intValue() == start && annotation.getEndNode().getOffset().intValue() == end) {
376                       // this is the one
377                       annotation.getFeatures().put(featureName, featureValue);
378                       found = true;
379                       break;
380                   }
381               }
382               
383               if(!found) {
384                   // add new annotation
385                   FeatureMap features = Factory.newFeatureMap();
386                   features.put(featureName, featureValue);
387                   try {
388                       outputAS.add(new Long(start)new Long(end), outputAnnotationType, features);
389                   catch(Exception e) {
390                       throw new GateRuntimeException("Invalid Offsets");
391                   }
392               }
393           }
394       }
395   }
396   
397   @Optional
398   @CreoleParameter(comment="The URL to the lexicon file", defaultValue="resources/heptag/lexicon")
399   public void setLexiconURL(java.net.URL newLexiconURL) {
400     lexiconURL = newLexiconURL;
401   }
402   public java.net.URL getLexiconURL() {
403     return lexiconURL;
404   }
405   
406   @Optional
407   @CreoleParameter(comment="The URL to the ruleset file", defaultValue="resources/heptag/ruleset")
408   public void setRulesURL(java.net.URL newRulesURL) {
409     rulesURL = newRulesURL;
410   }
411   
412   @Optional
413   @CreoleParameter(comment="The encoding used for reading rules and lexicons")
414   public void setEncoding(String encoding) {
415     this.encoding = encoding;
416   }
417 
418   public java.net.URL getRulesURL() {
419     return rulesURL;
420   }
421   
422   @RunTime
423   @Optional
424   @CreoleParameter(comment="The annotation set to be used as input that must contain 'Token' and 'Sentence' annotations")
425   public void setInputASName(String newInputASName) {
426     inputASName = newInputASName;
427   }
428   public String getInputASName() {
429     return inputASName;
430   }
431   public String getEncoding() {
432     return this.encoding;
433   }
434 
435   public String getBaseTokenAnnotationType() {
436       return this.baseTokenAnnotationType;
437   }
438   
439   public String getBaseSentenceAnnotationType() {
440       return this.baseSentenceAnnotationType;
441   }
442   
443   public String getOutputAnnotationType() {
444       return this.outputAnnotationType;
445   }
446   
447   @RunTime
448   @CreoleParameter(comment="The name of the base 'Token' annotation type", defaultValue="Token")
449   public void setBaseTokenAnnotationType(String baseTokenAnnotationType) {
450       this.baseTokenAnnotationType = baseTokenAnnotationType;
451   }
452   
453   @RunTime
454   @CreoleParameter(comment="The name of the base 'Sentence' annotation type", defaultValue="Sentence")
455   public void setBaseSentenceAnnotationType(String baseSentenceAnnotationtype) {
456       this.baseSentenceAnnotationType = baseSentenceAnnotationtype;
457   }
458   
459   @RunTime
460   @CreoleParameter(comment="The name of the annotation type where the new features should be added", defaultValue="Token")
461   public void setOutputAnnotationType(String outputAnnotationType) {
462       this.outputAnnotationType = outputAnnotationType;
463   }
464   
465   public String getOutputASName() {
466       return this.outputASName;
467   }
468   
469   @RunTime
470   @Optional
471   @CreoleParameter(comment="The annotation set to be used as output for POS annotations")
472   public void setOutputASName(String outputASName) {
473       this.outputASName = outputASName;
474   }
475   
476   protected hepple.postag.POSTagger tagger;
477   private java.net.URL lexiconURL;
478   private java.net.URL rulesURL;
479   private String inputASName;
480   private String encoding;
481   private String baseTokenAnnotationType;
482   private String baseSentenceAnnotationType;
483   private String outputAnnotationType;
484   private String outputASName;
485 }