1   /*
2    *  Copyright (c) 1998-2001, The University of Sheffield.
3    *
4    *  This file is part of GATE (see http://gate.ac.uk/), and is free
5    *  software, licenced under the GNU Library General Public License,
6    *  Version 2, June 1991 (in the distribution as file licence.html,
7    *  and also available at http://gate.ac.uk/gate/licence.html).
8    *
9    *  Valentin Tablan, 01 Feb 2000
10   *
11   *  $Id: POSTagger.java,v 1.17 2002/04/30 10:12:06 valyt Exp $
12   */
13  
14  package gate.creole;
15  
16  import gate.*;
17  import gate.creole.*;
18  import gate.util.*;
19  import gate.event.*;
20  
21  import hepple.postag.*;
22  
23  import java.util.*;
24  import java.io.*;
25  import java.net.URL;
26  import java.text.NumberFormat;
27  /**
28   * This class is a wrapper for HepTag, Mark Hepple's POS tagger.
29   */
30  public class POSTagger extends AbstractLanguageAnalyser {
31  
32    public static final String
33      TAG_DOCUMENT_PARAMETER_NAME = "document";
34  
35    public static final String
36      TAG_INPUT_AS_PARAMETER_NAME = "inputASName";
37  
38    public static final String
39      TAG_LEXICON_URL_PARAMETER_NAME = "lexiconURL";
40  
41    public static final String
42      TAG_RULES_URL_PARAMETER_NAME = "rulesURL";
43  
44    public POSTagger() {
45    }
46  
47    public Resource init()throws ResourceInstantiationException{
48      if(lexiconURL == null){
49        throw new ResourceInstantiationException(
50          "NoURL provided for the lexicon!");
51      }
52      if(rulesURL == null){
53        throw new ResourceInstantiationException(
54          "No URL provided for the rules!");
55      }
56      try{
57        tagger = new hepple.postag.POSTagger(lexiconURL,rulesURL);
58      }catch(Exception e){
59        throw new ResourceInstantiationException(e);
60      }
61      return this;
62    }
63  
64  
65    public void execute() throws ExecutionException{
66      try{
67        //check the parameters
68        if(document == null) throw new GateRuntimeException(
69          "No document to process!");
70        if(inputASName != null && inputASName.equals("")) inputASName = null;
71        AnnotationSet inputAS = (inputASName == null) ?
72                                document.getAnnotations() :
73                                document.getAnnotations(inputASName);
74  
75  
76        AnnotationSet sentencesAS = inputAS.get(SENTENCE_ANNOTATION_TYPE);
77        if(sentencesAS != null && sentencesAS.size() > 0){
78          long startTime = System.currentTimeMillis();
79          fireStatusChanged("POS tagging " + document.getName());
80          fireProgressChanged(0);
81          //prepare the input for HepTag
82          List sentenceForTagger = new ArrayList();
83          List sentencesForTagger = new ArrayList(1);
84          sentencesForTagger.add(sentenceForTagger);
85  
86          //define a comparator for annotations by start offset
87          Comparator offsetComparator = new OffsetComparator();
88  
89          //read all the tokens and all the sentences
90          List sentencesList = new ArrayList(sentencesAS);
91          Collections.sort(sentencesList, offsetComparator);
92          List tokensList = new ArrayList(inputAS.get(TOKEN_ANNOTATION_TYPE));
93          Collections.sort(tokensList, offsetComparator);
94  
95          Iterator sentencesIter = sentencesList.iterator();
96          ListIterator tokensIter = tokensList.listIterator();
97  
98          List tokensInCurrentSentence = new ArrayList();
99          Annotation currentToken = (Annotation)tokensIter.next();
100         int sentIndex = 0;
101         int sentCnt = sentencesAS.size();
102         while(sentencesIter.hasNext()){
103           Annotation currentSentence = (Annotation)sentencesIter.next();
104           tokensInCurrentSentence.clear();
105           sentenceForTagger.clear();
106           while(currentToken != null
107                 &&
108                 currentToken.getEndNode().getOffset().compareTo(
109                 currentSentence.getEndNode().getOffset()) <= 0){
110             tokensInCurrentSentence.add(currentToken);
111             sentenceForTagger.add(currentToken.getFeatures().
112                                   get(TOKEN_STRING_FEATURE_NAME));
113             currentToken = (Annotation)(tokensIter.hasNext() ?
114                                        tokensIter.next() : null);
115           }
116           //run the POS tagger
117           List taggerResults = (List)tagger.runTagger(sentencesForTagger).get(0);
118           //add the results
119           //make sure no malfunction accured
120           if(taggerResults.size() != tokensInCurrentSentence.size())
121             throw new GateRuntimeException(
122                 "POS Tagger malfunction: the output size (" +
123                 taggerResults.size() +
124                 ") is different from the input size (" +
125                 tokensInCurrentSentence.size() + ")!");
126           Iterator resIter = taggerResults.iterator();
127           Iterator tokIter = tokensInCurrentSentence.iterator();
128           while(resIter.hasNext()){
129             ((Annotation)tokIter.next()).getFeatures().
130               put(TOKEN_CATEGORY_FEATURE_NAME ,((String[])resIter.next())[1]);
131           }
132           fireProgressChanged(sentIndex++ * 100 / sentCnt);
133         }//while(sentencesIter.hasNext())
134         if(currentToken != null){
135           //we have remaining tokens after the last sentence
136           tokensInCurrentSentence.clear();
137           sentenceForTagger.clear();
138           while(currentToken != null){
139             tokensInCurrentSentence.add(currentToken);
140             sentenceForTagger.add(currentToken.getFeatures().
141                                   get(TOKEN_STRING_FEATURE_NAME));
142             currentToken = (Annotation)(tokensIter.hasNext() ?
143                                         tokensIter.next() : null);
144           }
145           //run the POS tagger
146           List taggerResults = (List)tagger.runTagger(sentencesForTagger).get(0);
147           //add the results
148           //make sure no malfunction accured
149           if(taggerResults.size() != tokensInCurrentSentence.size())
150             throw new GateRuntimeException(
151                 "POS Tagger malfunction: the output size (" +
152                 taggerResults.size() +
153                 ") is different from the input size (" +
154                 tokensInCurrentSentence.size() + ")!");
155           Iterator resIter = taggerResults.iterator();
156           Iterator tokIter = tokensInCurrentSentence.iterator();
157           while(resIter.hasNext()){
158             ((Annotation)tokIter.next()).getFeatures().
159               put(TOKEN_CATEGORY_FEATURE_NAME ,((String[])resIter.next())[1]);
160           }
161         }//if(currentToken != null)
162         fireProcessFinished();
163         fireStatusChanged(
164           document.getName() + " tagged in " +
165           NumberFormat.getInstance().format(
166           (double)(System.currentTimeMillis() - startTime) / 1000) +
167           " seconds!");
168       }else{
169         throw new GateRuntimeException("No sentences to process!\n" +
170                                        "Please run a sentence splitter first!");
171       }
172 
173 //OLD version
174 /*
175       AnnotationSet as = inputAS.get(SENTENCE_ANNOTATION_TYPE);
176       if(as != null && as.size() > 0){
177         List sentences = new ArrayList(as);
178         Collections.sort(sentences, offsetComparator);
179         Iterator sentIter = sentences.iterator();
180         int sentIndex = 0;
181         int sentCnt = sentences.size();
182         long startTime= System.currentTimeMillis();
183         while(sentIter.hasNext()){
184 start = System.currentTimeMillis();
185           Annotation sentenceAnn = (Annotation)sentIter.next();
186           AnnotationSet rangeSet = inputAS.get(
187                                     sentenceAnn.getStartNode().getOffset(),
188                                     sentenceAnn.getEndNode().getOffset());
189           if(rangeSet == null) continue;
190           AnnotationSet tokensSet = rangeSet.get(TOKEN_ANNOTATION_TYPE);
191           if(tokensSet == null) continue;
192           List tokens = new ArrayList(tokensSet);
193           Collections.sort(tokens, offsetComparator);
194 
195 //          List tokens = (List)sentenceAnn.getFeatures().get("tokens");
196           List sentence = new ArrayList(tokens.size());
197           Iterator tokIter = tokens.iterator();
198           while(tokIter.hasNext()){
199             Annotation token = (Annotation)tokIter.next();
200             String text = (String)token.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
201             sentence.add(text);
202           }//while(tokIter.hasNext())
203 
204           //run the POSTagger over this sentence
205           List sentences4tagger = new ArrayList(1);
206           sentences4tagger.add(sentence);
207 prepTime += System.currentTimeMillis() - start;
208 start = System.currentTimeMillis();
209           List taggerResults = tagger.runTagger(sentences4tagger);
210 posTime += System.currentTimeMillis() - start;
211 start = System.currentTimeMillis();
212           //add the results to the output annotation set
213           //we only get one sentence
214           List sentenceFromTagger = (List)taggerResults.get(0);
215           if(sentenceFromTagger.size() != sentence.size()){
216             String taggerResult = "";
217             for(int i = 0; i< sentenceFromTagger.size(); i++){
218               taggerResult += ((String[])sentenceFromTagger.get(i))[1] + ", ";
219             }
220             throw new GateRuntimeException(
221               "POS Tagger malfunction: the output size (" +
222               sentenceFromTagger.size() +
223               ") is different from the input size (" +
224               sentence.size() + ")!" +
225               "\n Input: " + sentence + "\nOutput: " + taggerResult);
226           }
227           for(int i = 0; i< sentence.size(); i++){
228             String category = ((String[])sentenceFromTagger.get(i))[1];
229             Annotation token = (Annotation)tokens.get(i);
230             token.getFeatures().
231               put(TOKEN_CATEGORY_FEATURE_NAME, category);
232           }//for(i = 0; i<= sentence.size(); i++)
233 postTime += System.currentTimeMillis() - start;
234           fireProgressChanged(sentIndex++ * 100 / sentCnt);
235         }//while(sentIter.hasNext())
236 Out.prln("POS preparation time:" + prepTime);
237 Out.prln("POS execution time:" + posTime);
238 Out.prln("POS after execution time:" + postTime);
239           fireProcessFinished();
240           long endTime = System.currentTimeMillis();
241           fireStatusChanged(document.getName() + " tagged in " +
242                           NumberFormat.getInstance().format(
243                           (double)(endTime - startTime) / 1000) + " seconds!");
244       }else{
245         throw new GateRuntimeException("No sentences to process!\n" +
246                                        "Please run a sentence splitter first!");
247       }//if(as != null && as.size() > 0)
248 */
249     }catch(Exception e){
250       throw new ExecutionException(e);
251     }
252   }
253 
254 
255   public void setLexiconURL(java.net.URL newLexiconURL) {
256     lexiconURL = newLexiconURL;
257   }
258   public java.net.URL getLexiconURL() {
259     return lexiconURL;
260   }
261   public void setRulesURL(java.net.URL newRulesURL) {
262     rulesURL = newRulesURL;
263   }
264   public java.net.URL getRulesURL() {
265     return rulesURL;
266   }
267   public void setInputASName(String newInputASName) {
268     inputASName = newInputASName;
269   }
270   public String getInputASName() {
271     return inputASName;
272   }
273 
274   protected hepple.postag.POSTagger tagger;
275   private java.net.URL lexiconURL;
276   private java.net.URL rulesURL;
277   private String inputASName;
278 }