1
13
14 package gate.creole;
15
16 import java.text.NumberFormat;
17 import java.util.*;
18
19 import gate.*;
20 import gate.util.GateRuntimeException;
21 import gate.util.OffsetComparator;
22
25 public class POSTagger extends AbstractLanguageAnalyser {
26
27 public static final String
28 TAG_DOCUMENT_PARAMETER_NAME = "document";
29
30 public static final String
31 TAG_INPUT_AS_PARAMETER_NAME = "inputASName";
32
33 public static final String
34 TAG_LEXICON_URL_PARAMETER_NAME = "lexiconURL";
35
36 public static final String
37 TAG_RULES_URL_PARAMETER_NAME = "rulesURL";
38
39 public POSTagger() {
40 }
41
42 public Resource init()throws ResourceInstantiationException{
43 if(lexiconURL == null){
44 throw new ResourceInstantiationException(
45 "NoURL provided for the lexicon!");
46 }
47 if(rulesURL == null){
48 throw new ResourceInstantiationException(
49 "No URL provided for the rules!");
50 }
51 try{
52 tagger = new hepple.postag.POSTagger(lexiconURL,rulesURL);
53 }catch(Exception e){
54 throw new ResourceInstantiationException(e);
55 }
56 return this;
57 }
58
59
60 public void execute() throws ExecutionException{
61 try{
62 if(document == null) throw new GateRuntimeException(
64 "No document to process!");
65 if(inputASName != null && inputASName.equals("")) inputASName = null;
66 AnnotationSet inputAS = (inputASName == null) ?
67 document.getAnnotations() :
68 document.getAnnotations(inputASName);
69
70
71 AnnotationSet sentencesAS = inputAS.get(SENTENCE_ANNOTATION_TYPE);
72 AnnotationSet tokensAS = inputAS.get(TOKEN_ANNOTATION_TYPE);
73 if(sentencesAS != null && sentencesAS.size() > 0
74 && tokensAS != null && tokensAS.size() > 0){
75 long startTime = System.currentTimeMillis();
76 fireStatusChanged("POS tagging " + document.getName());
77 fireProgressChanged(0);
78 List sentenceForTagger = new ArrayList();
80 List sentencesForTagger = new ArrayList(1);
81 sentencesForTagger.add(sentenceForTagger);
82
83 Comparator offsetComparator = new OffsetComparator();
85
86 List sentencesList = new ArrayList(sentencesAS);
88 Collections.sort(sentencesList, offsetComparator);
89 List tokensList = new ArrayList(tokensAS);
90 Collections.sort(tokensList, offsetComparator);
91
92 Iterator sentencesIter = sentencesList.iterator();
93 ListIterator tokensIter = tokensList.listIterator();
94
95 List tokensInCurrentSentence = new ArrayList();
96 Annotation currentToken = (Annotation)tokensIter.next();
97 int sentIndex = 0;
98 int sentCnt = sentencesAS.size();
99 while(sentencesIter.hasNext()){
100 Annotation currentSentence = (Annotation)sentencesIter.next();
101 tokensInCurrentSentence.clear();
102 sentenceForTagger.clear();
103 while(currentToken != null
104 &&
105 currentToken.getEndNode().getOffset().compareTo(
106 currentSentence.getEndNode().getOffset()) <= 0){
107 tokensInCurrentSentence.add(currentToken);
108 sentenceForTagger.add(currentToken.getFeatures().
109 get(TOKEN_STRING_FEATURE_NAME));
110 currentToken = (Annotation)(tokensIter.hasNext() ?
111 tokensIter.next() : null);
112 }
113 List taggerResults = (List)tagger.runTagger(sentencesForTagger).get(0);
115 if(taggerResults.size() != tokensInCurrentSentence.size())
118 throw new GateRuntimeException(
119 "POS Tagger malfunction: the output size (" +
120 taggerResults.size() +
121 ") is different from the input size (" +
122 tokensInCurrentSentence.size() + ")!");
123 Iterator resIter = taggerResults.iterator();
124 Iterator tokIter = tokensInCurrentSentence.iterator();
125 while(resIter.hasNext()){
126 ((Annotation)tokIter.next()).getFeatures().
127 put(TOKEN_CATEGORY_FEATURE_NAME ,((String[])resIter.next())[1]);
128 }
129 fireProgressChanged(sentIndex++ * 100 / sentCnt);
130 } if(currentToken != null){
132 tokensInCurrentSentence.clear();
134 sentenceForTagger.clear();
135 while(currentToken != null){
136 tokensInCurrentSentence.add(currentToken);
137 sentenceForTagger.add(currentToken.getFeatures().
138 get(TOKEN_STRING_FEATURE_NAME));
139 currentToken = (Annotation)(tokensIter.hasNext() ?
140 tokensIter.next() : null);
141 }
142 List taggerResults = (List)tagger.runTagger(sentencesForTagger).get(0);
144 if(taggerResults.size() != tokensInCurrentSentence.size())
147 throw new GateRuntimeException(
148 "POS Tagger malfunction: the output size (" +
149 taggerResults.size() +
150 ") is different from the input size (" +
151 tokensInCurrentSentence.size() + ")!");
152 Iterator resIter = taggerResults.iterator();
153 Iterator tokIter = tokensInCurrentSentence.iterator();
154 while(resIter.hasNext()){
155 ((Annotation)tokIter.next()).getFeatures().
156 put(TOKEN_CATEGORY_FEATURE_NAME ,((String[])resIter.next())[1]);
157 }
158 } fireProcessFinished();
160 fireStatusChanged(
161 document.getName() + " tagged in " +
162 NumberFormat.getInstance().format(
163 (double)(System.currentTimeMillis() - startTime) / 1000) +
164 " seconds!");
165 }else{
166 throw new GateRuntimeException("No sentences or tokens to process!\n" +
167 "Please run a sentence splitter "+
168 "and tokeniser first!");
169 }
170
171
247 }catch(Exception e){
248 throw new ExecutionException(e);
249 }
250 }
251
252
253 public void setLexiconURL(java.net.URL newLexiconURL) {
254 lexiconURL = newLexiconURL;
255 }
256 public java.net.URL getLexiconURL() {
257 return lexiconURL;
258 }
259 public void setRulesURL(java.net.URL newRulesURL) {
260 rulesURL = newRulesURL;
261 }
262 public java.net.URL getRulesURL() {
263 return rulesURL;
264 }
265 public void setInputASName(String newInputASName) {
266 inputASName = newInputASName;
267 }
268 public String getInputASName() {
269 return inputASName;
270 }
271
272 protected hepple.postag.POSTagger tagger;
273 private java.net.URL lexiconURL;
274 private java.net.URL rulesURL;
275 private String inputASName;
276 }