1   /*
2    *  Copyright (c) 1998-2004, The University of Sheffield.
3    *
4    *  This file is part of GATE (see http://gate.ac.uk/), and is free
5    *  software, licenced under the GNU Library General Public License,
6    *  Version 2, June 1991 (in the distribution as file licence.html,
7    *  and also available at http://gate.ac.uk/gate/licence.html).
8    *
9    *  Valentin Tablan, 01 Feb 2000
10   *
11   *  $Id: POSTagger.java,v 1.22 2004/12/01 15:34:54 niraj Exp $
12   */
13  
14  package gate.creole;
15  
16  import java.text.NumberFormat;
17  import java.util.*;
18  
19  import gate.*;
20  import gate.util.GateRuntimeException;
21  import gate.util.OffsetComparator;
22  /**
23   * This class is a wrapper for HepTag, Mark Hepple's POS tagger.
24   */
25  public class POSTagger extends AbstractLanguageAnalyser {
26  
27    public static final String
28      TAG_DOCUMENT_PARAMETER_NAME = "document";
29  
30    public static final String
31      TAG_INPUT_AS_PARAMETER_NAME = "inputASName";
32  
33    public static final String
34      TAG_LEXICON_URL_PARAMETER_NAME = "lexiconURL";
35  
36    public static final String
37      TAG_RULES_URL_PARAMETER_NAME = "rulesURL";
38  
39    public static final String
40        TAG_ENCODING_PARAMETER_NAME = "encoding";
41  
42    public POSTagger() {
43    }
44  
45    public Resource init()throws ResourceInstantiationException{
46      if(lexiconURL == null){
47        throw new ResourceInstantiationException(
48          "NoURL provided for the lexicon!");
49      }
50      if(rulesURL == null){
51        throw new ResourceInstantiationException(
52          "No URL provided for the rules!");
53      }
54      try{
55        tagger = new hepple.postag.POSTagger(lexiconURL,rulesURL);
56      }catch(Exception e){
57        throw new ResourceInstantiationException(e);
58      }
59      return this;
60    }
61  
62  
63    public void execute() throws ExecutionException{
64      try{
65        //check the parameters
66        if(document == null) throw new GateRuntimeException(
67          "No document to process!");
68        if(inputASName != null && inputASName.equals("")) inputASName = null;
69        AnnotationSet inputAS = (inputASName == null) ?
70                                document.getAnnotations() :
71                                document.getAnnotations(inputASName);
72  
73  
74        AnnotationSet sentencesAS = inputAS.get(SENTENCE_ANNOTATION_TYPE);
75        AnnotationSet tokensAS = inputAS.get(TOKEN_ANNOTATION_TYPE);
76        if(sentencesAS != null && sentencesAS.size() > 0
77           && tokensAS != null && tokensAS.size() > 0){
78          long startTime = System.currentTimeMillis();
79          fireStatusChanged("POS tagging " + document.getName());
80          fireProgressChanged(0);
81          //prepare the input for HepTag
82          List sentenceForTagger = new ArrayList();
83          List sentencesForTagger = new ArrayList(1);
84          sentencesForTagger.add(sentenceForTagger);
85  
86          //define a comparator for annotations by start offset
87          Comparator offsetComparator = new OffsetComparator();
88  
89          //read all the tokens and all the sentences
90          List sentencesList = new ArrayList(sentencesAS);
91          Collections.sort(sentencesList, offsetComparator);
92          List tokensList = new ArrayList(tokensAS);
93          Collections.sort(tokensList, offsetComparator);
94  
95          Iterator sentencesIter = sentencesList.iterator();
96          ListIterator tokensIter = tokensList.listIterator();
97  
98          List tokensInCurrentSentence = new ArrayList();
99          Annotation currentToken = (Annotation)tokensIter.next();
100         int sentIndex = 0;
101         int sentCnt = sentencesAS.size();
102         while(sentencesIter.hasNext()){
103           Annotation currentSentence = (Annotation)sentencesIter.next();
104           tokensInCurrentSentence.clear();
105           sentenceForTagger.clear();
106           while(currentToken != null
107                 &&
108                 currentToken.getEndNode().getOffset().compareTo(
109                 currentSentence.getEndNode().getOffset()) <= 0){
110             tokensInCurrentSentence.add(currentToken);
111             sentenceForTagger.add(currentToken.getFeatures().
112                                   get(TOKEN_STRING_FEATURE_NAME));
113             currentToken = (Annotation)(tokensIter.hasNext() ?
114                                        tokensIter.next() : null);
115           }
116           tagger.setEncoding(this.encoding);
117           //run the POS tagger
118           List taggerList = tagger.runTagger(sentencesForTagger);
119           if(taggerList != null && taggerList.size() > 0){
120             List taggerResults = (List) taggerList.get(0);
121             //add the results
122             //make sure no malfunction occurred
123             if(taggerResults.size() != tokensInCurrentSentence.size())
124               throw new GateRuntimeException(
125                   "POS Tagger malfunction: the output size (" +
126                   taggerResults.size() +
127                   ") is different from the input size (" +
128                   tokensInCurrentSentence.size() + ")!");
129             Iterator resIter = taggerResults.iterator();
130             Iterator tokIter = tokensInCurrentSentence.iterator();
131             while(resIter.hasNext()){
132               ((Annotation)tokIter.next()).getFeatures().
133                 put(TOKEN_CATEGORY_FEATURE_NAME ,((String[])resIter.next())[1]);
134             }
135           }
136           fireProgressChanged(sentIndex++ * 100 / sentCnt);
137         }//while(sentencesIter.hasNext())
138         if(currentToken != null){
139           //we have remaining tokens after the last sentence
140           tokensInCurrentSentence.clear();
141           sentenceForTagger.clear();
142           while(currentToken != null){
143             tokensInCurrentSentence.add(currentToken);
144             sentenceForTagger.add(currentToken.getFeatures().
145                                   get(TOKEN_STRING_FEATURE_NAME));
146             currentToken = (Annotation)(tokensIter.hasNext() ?
147                                         tokensIter.next() : null);
148           }
149           //run the POS tagger
150           List taggerResults = (List)tagger.runTagger(sentencesForTagger).get(0);
151           //add the results
152           //make sure no malfunction accured
153           if(taggerResults.size() != tokensInCurrentSentence.size())
154             throw new GateRuntimeException(
155                 "POS Tagger malfunction: the output size (" +
156                 taggerResults.size() +
157                 ") is different from the input size (" +
158                 tokensInCurrentSentence.size() + ")!");
159           Iterator resIter = taggerResults.iterator();
160           Iterator tokIter = tokensInCurrentSentence.iterator();
161           while(resIter.hasNext()){
162             ((Annotation)tokIter.next()).getFeatures().
163               put(TOKEN_CATEGORY_FEATURE_NAME ,((String[])resIter.next())[1]);
164           }
165         }//if(currentToken != null)
166         fireProcessFinished();
167         fireStatusChanged(
168           document.getName() + " tagged in " +
169           NumberFormat.getInstance().format(
170           (double)(System.currentTimeMillis() - startTime) / 1000) +
171           " seconds!");
172       }else{
173         throw new GateRuntimeException("No sentences or tokens to process!\n" +
174                                        "Please run a sentence splitter "+
175                                        "and tokeniser first!");
176       }
177 
178 //OLD version
179 /*
180       AnnotationSet as = inputAS.get(SENTENCE_ANNOTATION_TYPE);
181       if(as != null && as.size() > 0){
182         List sentences = new ArrayList(as);
183         Collections.sort(sentences, offsetComparator);
184         Iterator sentIter = sentences.iterator();
185         int sentIndex = 0;
186         int sentCnt = sentences.size();
187         long startTime= System.currentTimeMillis();
188         while(sentIter.hasNext()){
189 start = System.currentTimeMillis();
190           Annotation sentenceAnn = (Annotation)sentIter.next();
191           AnnotationSet rangeSet = inputAS.get(
192                                     sentenceAnn.getStartNode().getOffset(),
193                                     sentenceAnn.getEndNode().getOffset());
194           if(rangeSet == null) continue;
195           AnnotationSet tokensSet = rangeSet.get(TOKEN_ANNOTATION_TYPE);
196           if(tokensSet == null) continue;
197           List tokens = new ArrayList(tokensSet);
198           Collections.sort(tokens, offsetComparator);
199 
200 //          List tokens = (List)sentenceAnn.getFeatures().get("tokens");
201           List sentence = new ArrayList(tokens.size());
202           Iterator tokIter = tokens.iterator();
203           while(tokIter.hasNext()){
204             Annotation token = (Annotation)tokIter.next();
205             String text = (String)token.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
206             sentence.add(text);
207           }//while(tokIter.hasNext())
208 
209           //run the POSTagger over this sentence
210           List sentences4tagger = new ArrayList(1);
211           sentences4tagger.add(sentence);
212 prepTime += System.currentTimeMillis() - start;
213 start = System.currentTimeMillis();
214           List taggerResults = tagger.runTagger(sentences4tagger);
215 posTime += System.currentTimeMillis() - start;
216 start = System.currentTimeMillis();
217           //add the results to the output annotation set
218           //we only get one sentence
219           List sentenceFromTagger = (List)taggerResults.get(0);
220           if(sentenceFromTagger.size() != sentence.size()){
221             String taggerResult = "";
222             for(int i = 0; i< sentenceFromTagger.size(); i++){
223               taggerResult += ((String[])sentenceFromTagger.get(i))[1] + ", ";
224             }
225             throw new GateRuntimeException(
226               "POS Tagger malfunction: the output size (" +
227               sentenceFromTagger.size() +
228               ") is different from the input size (" +
229               sentence.size() + ")!" +
230               "\n Input: " + sentence + "\nOutput: " + taggerResult);
231           }
232           for(int i = 0; i< sentence.size(); i++){
233             String category = ((String[])sentenceFromTagger.get(i))[1];
234             Annotation token = (Annotation)tokens.get(i);
235             token.getFeatures().
236               put(TOKEN_CATEGORY_FEATURE_NAME, category);
237           }//for(i = 0; i<= sentence.size(); i++)
238 postTime += System.currentTimeMillis() - start;
239           fireProgressChanged(sentIndex++ * 100 / sentCnt);
240         }//while(sentIter.hasNext())
241 Out.prln("POS preparation time:" + prepTime);
242 Out.prln("POS execution time:" + posTime);
243 Out.prln("POS after execution time:" + postTime);
244           fireProcessFinished();
245           long endTime = System.currentTimeMillis();
246           fireStatusChanged(document.getName() + " tagged in " +
247                           NumberFormat.getInstance().format(
248                           (double)(endTime - startTime) / 1000) + " seconds!");
249       }else{
250         throw new GateRuntimeException("No sentences to process!\n" +
251                                        "Please run a sentence splitter first!");
252       }//if(as != null && as.size() > 0)
253 */
254     }catch(Exception e){
255       throw new ExecutionException(e);
256     }
257   }
258 
259 
260   public void setLexiconURL(java.net.URL newLexiconURL) {
261     lexiconURL = newLexiconURL;
262   }
263   public java.net.URL getLexiconURL() {
264     return lexiconURL;
265   }
266   public void setRulesURL(java.net.URL newRulesURL) {
267     rulesURL = newRulesURL;
268   }
269   public void setEncoding(String encoding) {
270     this.encoding = encoding;
271   }
272 
273   public java.net.URL getRulesURL() {
274     return rulesURL;
275   }
276   public void setInputASName(String newInputASName) {
277     inputASName = newInputASName;
278   }
279   public String getInputASName() {
280     return inputASName;
281   }
282   public String getEncoding() {
283     return this.encoding;
284   }
285 
286   protected hepple.postag.POSTagger tagger;
287   private java.net.URL lexiconURL;
288   private java.net.URL rulesURL;
289   private String inputASName;
290   private String encoding;
291 }
292