1
13
14 package gate.creole;
15
16 import java.text.NumberFormat;
17 import java.util.*;
18
19 import gate.*;
20 import gate.util.GateRuntimeException;
21 import gate.util.OffsetComparator;
22
25 public class POSTagger extends AbstractLanguageAnalyser {
26
27 public static final String
28 TAG_DOCUMENT_PARAMETER_NAME = "document";
29
30 public static final String
31 TAG_INPUT_AS_PARAMETER_NAME = "inputASName";
32
33 public static final String
34 TAG_LEXICON_URL_PARAMETER_NAME = "lexiconURL";
35
36 public static final String
37 TAG_RULES_URL_PARAMETER_NAME = "rulesURL";
38
39 public static final String
40 TAG_ENCODING_PARAMETER_NAME = "encoding";
41
42 public POSTagger() {
43 }
44
45 public Resource init()throws ResourceInstantiationException{
46 if(lexiconURL == null){
47 throw new ResourceInstantiationException(
48 "NoURL provided for the lexicon!");
49 }
50 if(rulesURL == null){
51 throw new ResourceInstantiationException(
52 "No URL provided for the rules!");
53 }
54 try{
55 tagger = new hepple.postag.POSTagger(lexiconURL,rulesURL);
56 }catch(Exception e){
57 throw new ResourceInstantiationException(e);
58 }
59 return this;
60 }
61
62
63 public void execute() throws ExecutionException{
64 try{
65 if(document == null) throw new GateRuntimeException(
67 "No document to process!");
68 if(inputASName != null && inputASName.equals("")) inputASName = null;
69 AnnotationSet inputAS = (inputASName == null) ?
70 document.getAnnotations() :
71 document.getAnnotations(inputASName);
72
73
74 AnnotationSet sentencesAS = inputAS.get(SENTENCE_ANNOTATION_TYPE);
75 AnnotationSet tokensAS = inputAS.get(TOKEN_ANNOTATION_TYPE);
76 if(sentencesAS != null && sentencesAS.size() > 0
77 && tokensAS != null && tokensAS.size() > 0){
78 long startTime = System.currentTimeMillis();
79 fireStatusChanged("POS tagging " + document.getName());
80 fireProgressChanged(0);
81 List sentenceForTagger = new ArrayList();
83 List sentencesForTagger = new ArrayList(1);
84 sentencesForTagger.add(sentenceForTagger);
85
86 Comparator offsetComparator = new OffsetComparator();
88
89 List sentencesList = new ArrayList(sentencesAS);
91 Collections.sort(sentencesList, offsetComparator);
92 List tokensList = new ArrayList(tokensAS);
93 Collections.sort(tokensList, offsetComparator);
94
95 Iterator sentencesIter = sentencesList.iterator();
96 ListIterator tokensIter = tokensList.listIterator();
97
98 List tokensInCurrentSentence = new ArrayList();
99 Annotation currentToken = (Annotation)tokensIter.next();
100 int sentIndex = 0;
101 int sentCnt = sentencesAS.size();
102 while(sentencesIter.hasNext()){
103 Annotation currentSentence = (Annotation)sentencesIter.next();
104 tokensInCurrentSentence.clear();
105 sentenceForTagger.clear();
106 while(currentToken != null
107 &&
108 currentToken.getEndNode().getOffset().compareTo(
109 currentSentence.getEndNode().getOffset()) <= 0){
110 tokensInCurrentSentence.add(currentToken);
111 sentenceForTagger.add(currentToken.getFeatures().
112 get(TOKEN_STRING_FEATURE_NAME));
113 currentToken = (Annotation)(tokensIter.hasNext() ?
114 tokensIter.next() : null);
115 }
116 tagger.setEncoding(this.encoding);
117 List taggerList = tagger.runTagger(sentencesForTagger);
119 if(taggerList != null && taggerList.size() > 0){
120 List taggerResults = (List) taggerList.get(0);
121 if(taggerResults.size() != tokensInCurrentSentence.size())
124 throw new GateRuntimeException(
125 "POS Tagger malfunction: the output size (" +
126 taggerResults.size() +
127 ") is different from the input size (" +
128 tokensInCurrentSentence.size() + ")!");
129 Iterator resIter = taggerResults.iterator();
130 Iterator tokIter = tokensInCurrentSentence.iterator();
131 while(resIter.hasNext()){
132 ((Annotation)tokIter.next()).getFeatures().
133 put(TOKEN_CATEGORY_FEATURE_NAME ,((String[])resIter.next())[1]);
134 }
135 }
136 fireProgressChanged(sentIndex++ * 100 / sentCnt);
137 } if(currentToken != null){
139 tokensInCurrentSentence.clear();
141 sentenceForTagger.clear();
142 while(currentToken != null){
143 tokensInCurrentSentence.add(currentToken);
144 sentenceForTagger.add(currentToken.getFeatures().
145 get(TOKEN_STRING_FEATURE_NAME));
146 currentToken = (Annotation)(tokensIter.hasNext() ?
147 tokensIter.next() : null);
148 }
149 List taggerResults = (List)tagger.runTagger(sentencesForTagger).get(0);
151 if(taggerResults.size() != tokensInCurrentSentence.size())
154 throw new GateRuntimeException(
155 "POS Tagger malfunction: the output size (" +
156 taggerResults.size() +
157 ") is different from the input size (" +
158 tokensInCurrentSentence.size() + ")!");
159 Iterator resIter = taggerResults.iterator();
160 Iterator tokIter = tokensInCurrentSentence.iterator();
161 while(resIter.hasNext()){
162 ((Annotation)tokIter.next()).getFeatures().
163 put(TOKEN_CATEGORY_FEATURE_NAME ,((String[])resIter.next())[1]);
164 }
165 } fireProcessFinished();
167 fireStatusChanged(
168 document.getName() + " tagged in " +
169 NumberFormat.getInstance().format(
170 (double)(System.currentTimeMillis() - startTime) / 1000) +
171 " seconds!");
172 }else{
173 throw new GateRuntimeException("No sentences or tokens to process!\n" +
174 "Please run a sentence splitter "+
175 "and tokeniser first!");
176 }
177
178
254 }catch(Exception e){
255 throw new ExecutionException(e);
256 }
257 }
258
259
260 public void setLexiconURL(java.net.URL newLexiconURL) {
261 lexiconURL = newLexiconURL;
262 }
263 public java.net.URL getLexiconURL() {
264 return lexiconURL;
265 }
266 public void setRulesURL(java.net.URL newRulesURL) {
267 rulesURL = newRulesURL;
268 }
269 public void setEncoding(String encoding) {
270 this.encoding = encoding;
271 }
272
273 public java.net.URL getRulesURL() {
274 return rulesURL;
275 }
276 public void setInputASName(String newInputASName) {
277 inputASName = newInputASName;
278 }
279 public String getInputASName() {
280 return inputASName;
281 }
282 public String getEncoding() {
283 return this.encoding;
284 }
285
286 protected hepple.postag.POSTagger tagger;
287 private java.net.URL lexiconURL;
288 private java.net.URL rulesURL;
289 private String inputASName;
290 private String encoding;
291 }
292