1   /*
2    *  Copyright (c) 1998-2004, The University of Sheffield.
3    *
4    *  This file is part of GATE (see http://gate.ac.uk/), and is free
5    *  software, licenced under the GNU Library General Public License,
6    *  Version 2, June 1991 (in the distribution as file licence.html,
7    *  and also available at http://gate.ac.uk/gate/licence.html).
8    *
9    *  Valentin Tablan, 01 Feb 2000
10   *
11   *  $Id: POSTagger.java,v 1.20 2004/07/21 17:10:03 akshay Exp $
12   */
13  
14  package gate.creole;
15  
16  import java.text.NumberFormat;
17  import java.util.*;
18  
19  import gate.*;
20  import gate.util.GateRuntimeException;
21  import gate.util.OffsetComparator;
22  /**
23   * This class is a wrapper for HepTag, Mark Hepple's POS tagger.
24   */
25  public class POSTagger extends AbstractLanguageAnalyser {
26  
27    public static final String
28      TAG_DOCUMENT_PARAMETER_NAME = "document";
29  
30    public static final String
31      TAG_INPUT_AS_PARAMETER_NAME = "inputASName";
32  
33    public static final String
34      TAG_LEXICON_URL_PARAMETER_NAME = "lexiconURL";
35  
36    public static final String
37      TAG_RULES_URL_PARAMETER_NAME = "rulesURL";
38  
39    public POSTagger() {
40    }
41  
42    public Resource init()throws ResourceInstantiationException{
43      if(lexiconURL == null){
44        throw new ResourceInstantiationException(
45          "NoURL provided for the lexicon!");
46      }
47      if(rulesURL == null){
48        throw new ResourceInstantiationException(
49          "No URL provided for the rules!");
50      }
51      try{
52        tagger = new hepple.postag.POSTagger(lexiconURL,rulesURL);
53      }catch(Exception e){
54        throw new ResourceInstantiationException(e);
55      }
56      return this;
57    }
58  
59  
60    public void execute() throws ExecutionException{
61      try{
62        //check the parameters
63        if(document == null) throw new GateRuntimeException(
64          "No document to process!");
65        if(inputASName != null && inputASName.equals("")) inputASName = null;
66        AnnotationSet inputAS = (inputASName == null) ?
67                                document.getAnnotations() :
68                                document.getAnnotations(inputASName);
69  
70  
71        AnnotationSet sentencesAS = inputAS.get(SENTENCE_ANNOTATION_TYPE);
72        AnnotationSet tokensAS = inputAS.get(TOKEN_ANNOTATION_TYPE);
73        if(sentencesAS != null && sentencesAS.size() > 0
74           && tokensAS != null && tokensAS.size() > 0){
75          long startTime = System.currentTimeMillis();
76          fireStatusChanged("POS tagging " + document.getName());
77          fireProgressChanged(0);
78          //prepare the input for HepTag
79          List sentenceForTagger = new ArrayList();
80          List sentencesForTagger = new ArrayList(1);
81          sentencesForTagger.add(sentenceForTagger);
82  
83          //define a comparator for annotations by start offset
84          Comparator offsetComparator = new OffsetComparator();
85  
86          //read all the tokens and all the sentences
87          List sentencesList = new ArrayList(sentencesAS);
88          Collections.sort(sentencesList, offsetComparator);
89          List tokensList = new ArrayList(tokensAS);
90          Collections.sort(tokensList, offsetComparator);
91  
92          Iterator sentencesIter = sentencesList.iterator();
93          ListIterator tokensIter = tokensList.listIterator();
94  
95          List tokensInCurrentSentence = new ArrayList();
96          Annotation currentToken = (Annotation)tokensIter.next();
97          int sentIndex = 0;
98          int sentCnt = sentencesAS.size();
99          while(sentencesIter.hasNext()){
100           Annotation currentSentence = (Annotation)sentencesIter.next();
101           tokensInCurrentSentence.clear();
102           sentenceForTagger.clear();
103           while(currentToken != null
104                 &&
105                 currentToken.getEndNode().getOffset().compareTo(
106                 currentSentence.getEndNode().getOffset()) <= 0){
107             tokensInCurrentSentence.add(currentToken);
108             sentenceForTagger.add(currentToken.getFeatures().
109                                   get(TOKEN_STRING_FEATURE_NAME));
110             currentToken = (Annotation)(tokensIter.hasNext() ?
111                                        tokensIter.next() : null);
112           }
113           //run the POS tagger
114           List taggerResults = (List)tagger.runTagger(sentencesForTagger).get(0);
115           //add the results
116           //make sure no malfunction accured
117           if(taggerResults.size() != tokensInCurrentSentence.size())
118             throw new GateRuntimeException(
119                 "POS Tagger malfunction: the output size (" +
120                 taggerResults.size() +
121                 ") is different from the input size (" +
122                 tokensInCurrentSentence.size() + ")!");
123           Iterator resIter = taggerResults.iterator();
124           Iterator tokIter = tokensInCurrentSentence.iterator();
125           while(resIter.hasNext()){
126             ((Annotation)tokIter.next()).getFeatures().
127               put(TOKEN_CATEGORY_FEATURE_NAME ,((String[])resIter.next())[1]);
128           }
129           fireProgressChanged(sentIndex++ * 100 / sentCnt);
130         }//while(sentencesIter.hasNext())
131         if(currentToken != null){
132           //we have remaining tokens after the last sentence
133           tokensInCurrentSentence.clear();
134           sentenceForTagger.clear();
135           while(currentToken != null){
136             tokensInCurrentSentence.add(currentToken);
137             sentenceForTagger.add(currentToken.getFeatures().
138                                   get(TOKEN_STRING_FEATURE_NAME));
139             currentToken = (Annotation)(tokensIter.hasNext() ?
140                                         tokensIter.next() : null);
141           }
142           //run the POS tagger
143           List taggerResults = (List)tagger.runTagger(sentencesForTagger).get(0);
144           //add the results
145           //make sure no malfunction accured
146           if(taggerResults.size() != tokensInCurrentSentence.size())
147             throw new GateRuntimeException(
148                 "POS Tagger malfunction: the output size (" +
149                 taggerResults.size() +
150                 ") is different from the input size (" +
151                 tokensInCurrentSentence.size() + ")!");
152           Iterator resIter = taggerResults.iterator();
153           Iterator tokIter = tokensInCurrentSentence.iterator();
154           while(resIter.hasNext()){
155             ((Annotation)tokIter.next()).getFeatures().
156               put(TOKEN_CATEGORY_FEATURE_NAME ,((String[])resIter.next())[1]);
157           }
158         }//if(currentToken != null)
159         fireProcessFinished();
160         fireStatusChanged(
161           document.getName() + " tagged in " +
162           NumberFormat.getInstance().format(
163           (double)(System.currentTimeMillis() - startTime) / 1000) +
164           " seconds!");
165       }else{
166         throw new GateRuntimeException("No sentences or tokens to process!\n" +
167                                        "Please run a sentence splitter "+
168                                        "and tokeniser first!");
169       }
170 
171 //OLD version
172 /*
173       AnnotationSet as = inputAS.get(SENTENCE_ANNOTATION_TYPE);
174       if(as != null && as.size() > 0){
175         List sentences = new ArrayList(as);
176         Collections.sort(sentences, offsetComparator);
177         Iterator sentIter = sentences.iterator();
178         int sentIndex = 0;
179         int sentCnt = sentences.size();
180         long startTime= System.currentTimeMillis();
181         while(sentIter.hasNext()){
182 start = System.currentTimeMillis();
183           Annotation sentenceAnn = (Annotation)sentIter.next();
184           AnnotationSet rangeSet = inputAS.get(
185                                     sentenceAnn.getStartNode().getOffset(),
186                                     sentenceAnn.getEndNode().getOffset());
187           if(rangeSet == null) continue;
188           AnnotationSet tokensSet = rangeSet.get(TOKEN_ANNOTATION_TYPE);
189           if(tokensSet == null) continue;
190           List tokens = new ArrayList(tokensSet);
191           Collections.sort(tokens, offsetComparator);
192 
193 //          List tokens = (List)sentenceAnn.getFeatures().get("tokens");
194           List sentence = new ArrayList(tokens.size());
195           Iterator tokIter = tokens.iterator();
196           while(tokIter.hasNext()){
197             Annotation token = (Annotation)tokIter.next();
198             String text = (String)token.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
199             sentence.add(text);
200           }//while(tokIter.hasNext())
201 
202           //run the POSTagger over this sentence
203           List sentences4tagger = new ArrayList(1);
204           sentences4tagger.add(sentence);
205 prepTime += System.currentTimeMillis() - start;
206 start = System.currentTimeMillis();
207           List taggerResults = tagger.runTagger(sentences4tagger);
208 posTime += System.currentTimeMillis() - start;
209 start = System.currentTimeMillis();
210           //add the results to the output annotation set
211           //we only get one sentence
212           List sentenceFromTagger = (List)taggerResults.get(0);
213           if(sentenceFromTagger.size() != sentence.size()){
214             String taggerResult = "";
215             for(int i = 0; i< sentenceFromTagger.size(); i++){
216               taggerResult += ((String[])sentenceFromTagger.get(i))[1] + ", ";
217             }
218             throw new GateRuntimeException(
219               "POS Tagger malfunction: the output size (" +
220               sentenceFromTagger.size() +
221               ") is different from the input size (" +
222               sentence.size() + ")!" +
223               "\n Input: " + sentence + "\nOutput: " + taggerResult);
224           }
225           for(int i = 0; i< sentence.size(); i++){
226             String category = ((String[])sentenceFromTagger.get(i))[1];
227             Annotation token = (Annotation)tokens.get(i);
228             token.getFeatures().
229               put(TOKEN_CATEGORY_FEATURE_NAME, category);
230           }//for(i = 0; i<= sentence.size(); i++)
231 postTime += System.currentTimeMillis() - start;
232           fireProgressChanged(sentIndex++ * 100 / sentCnt);
233         }//while(sentIter.hasNext())
234 Out.prln("POS preparation time:" + prepTime);
235 Out.prln("POS execution time:" + posTime);
236 Out.prln("POS after execution time:" + postTime);
237           fireProcessFinished();
238           long endTime = System.currentTimeMillis();
239           fireStatusChanged(document.getName() + " tagged in " +
240                           NumberFormat.getInstance().format(
241                           (double)(endTime - startTime) / 1000) + " seconds!");
242       }else{
243         throw new GateRuntimeException("No sentences to process!\n" +
244                                        "Please run a sentence splitter first!");
245       }//if(as != null && as.size() > 0)
246 */
247     }catch(Exception e){
248       throw new ExecutionException(e);
249     }
250   }
251 
252 
253   public void setLexiconURL(java.net.URL newLexiconURL) {
254     lexiconURL = newLexiconURL;
255   }
256   public java.net.URL getLexiconURL() {
257     return lexiconURL;
258   }
259   public void setRulesURL(java.net.URL newRulesURL) {
260     rulesURL = newRulesURL;
261   }
262   public java.net.URL getRulesURL() {
263     return rulesURL;
264   }
265   public void setInputASName(String newInputASName) {
266     inputASName = newInputASName;
267   }
268   public String getInputASName() {
269     return inputASName;
270   }
271 
272   protected hepple.postag.POSTagger tagger;
273   private java.net.URL lexiconURL;
274   private java.net.URL rulesURL;
275   private String inputASName;
276 }