|
POSTagger |
|
/*
 *  Copyright (c) 1998-2001, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Valentin Tablan, 01 Feb 2000
 *
 *  $Id: POSTagger.java,v 1.17 2002/04/30 10:12:06 valyt Exp $
 */

package gate.creole;

import gate.*;
import gate.creole.*;
import gate.util.*;
import gate.event.*;

import hepple.postag.*;

import java.util.*;
import java.io.*;
import java.net.URL;
import java.text.NumberFormat;
/**
 * This class is a wrapper for HepTag, Mark Hepple's POS tagger.
 *
 * It reads the sentence and token annotations from the input annotation set
 * of the current document, hands the token strings of each sentence to the
 * wrapped tagger and stores the value returned for each token in the token's
 * {@link #TOKEN_CATEGORY_FEATURE_NAME} feature.
 */
public class POSTagger extends AbstractLanguageAnalyser {

  /** Name of the parameter holding the document to be processed. */
  public static final String
    TAG_DOCUMENT_PARAMETER_NAME = "document";

  /** Name of the parameter holding the name of the input annotation set. */
  public static final String
    TAG_INPUT_AS_PARAMETER_NAME = "inputASName";

  /** Name of the parameter holding the URL of the lexicon. */
  public static final String
    TAG_LEXICON_URL_PARAMETER_NAME = "lexiconURL";

  /** Name of the parameter holding the URL of the rules. */
  public static final String
    TAG_RULES_URL_PARAMETER_NAME = "rulesURL";

  /** Default constructor; all the set-up work is done in {@link #init()}. */
  public POSTagger() {
  }

  /**
   * Creates the wrapped tagger from the lexicon and rules URLs.
   *
   * @return this resource
   * @throws ResourceInstantiationException if either URL has not been set or
   * if the construction of the wrapped tagger fails (the original exception
   * is kept as the cause)
   */
  public Resource init()throws ResourceInstantiationException{
    if(lexiconURL == null){
      throw new ResourceInstantiationException(
        "No URL provided for the lexicon!");
    }
    if(rulesURL == null){
      throw new ResourceInstantiationException(
        "No URL provided for the rules!");
    }
    try{
      tagger = new hepple.postag.POSTagger(lexiconURL,rulesURL);
    }catch(Exception e){
      throw new ResourceInstantiationException(e);
    }
    return this;
  }


  /**
   * Tags all the tokens of the current document.
   *
   * Sentences and tokens are both ordered with an {@link OffsetComparator}
   * and then walked in parallel: every token whose end offset is not greater
   * than the end offset of the current sentence is sent to the tagger as part
   * of that sentence. Tokens left over after the last sentence are sent to
   * the tagger as one extra, final group.
   *
   * An empty input annotation set name is treated as (and replaced by)
   * <code>null</code>, which selects the default annotation set.
   *
   * @throws ExecutionException wrapping any problem encountered: no document,
   * no sentence annotations, no token annotations, or a tagger output whose
   * size differs from that of its input
   */
  public void execute() throws ExecutionException{
    try{
      //check the parameters
      if(document == null) throw new GateRuntimeException(
        "No document to process!");
      if(inputASName != null && inputASName.equals("")) inputASName = null;
      AnnotationSet inputAS = (inputASName == null) ?
                              document.getAnnotations() :
                              document.getAnnotations(inputASName);

      AnnotationSet sentencesAS = inputAS.get(SENTENCE_ANNOTATION_TYPE);
      if(sentencesAS == null || sentencesAS.size() == 0){
        throw new GateRuntimeException("No sentences to process!\n" +
                                       "Please run a sentence splitter first!");
      }

      //without this check a document that has sentences but no tokens fails
      //further down with an uninformative NullPointerException or
      //NoSuchElementException
      AnnotationSet tokensAS = inputAS.get(TOKEN_ANNOTATION_TYPE);
      if(tokensAS == null || tokensAS.size() == 0){
        throw new GateRuntimeException("No tokens to process!\n" +
                                       "Please run a tokeniser first!");
      }

      long startTime = System.currentTimeMillis();
      fireStatusChanged("POS tagging " + document.getName());
      fireProgressChanged(0);

      //prepare the input for HepTag: a list holding one (reused) sentence
      List sentenceForTagger = new ArrayList();
      List sentencesForTagger = new ArrayList(1);
      sentencesForTagger.add(sentenceForTagger);

      //read all the tokens and all the sentences, in offset order
      Comparator offsetComparator = new OffsetComparator();
      List sentencesList = new ArrayList(sentencesAS);
      Collections.sort(sentencesList, offsetComparator);
      List tokensList = new ArrayList(tokensAS);
      Collections.sort(tokensList, offsetComparator);

      Iterator sentencesIter = sentencesList.iterator();
      Iterator tokensIter = tokensList.iterator();

      List tokensInCurrentSentence = new ArrayList();
      Annotation currentToken = (Annotation)(tokensIter.hasNext() ?
                                             tokensIter.next() : null);
      int sentIndex = 0;
      int sentCnt = sentencesAS.size();

      //one pass for every sentence, plus a last pass (with no sentence) if
      //there still are tokens after the end of the last sentence
      while(sentencesIter.hasNext() || currentToken != null){
        Annotation currentSentence = (Annotation)(sentencesIter.hasNext() ?
                                                  sentencesIter.next() : null);
        tokensInCurrentSentence.clear();
        sentenceForTagger.clear();
        //a null sentence means "all the remaining tokens"
        while(currentToken != null &&
              (currentSentence == null ||
               currentToken.getEndNode().getOffset().compareTo(
                 currentSentence.getEndNode().getOffset()) <= 0)){
          tokensInCurrentSentence.add(currentToken);
          sentenceForTagger.add(currentToken.getFeatures().
                                get(TOKEN_STRING_FEATURE_NAME));
          currentToken = (Annotation)(tokensIter.hasNext() ?
                                      tokensIter.next() : null);
        }

        //run the POS tagger
        List taggerResults = (List)tagger.runTagger(sentencesForTagger).get(0);

        //make sure no malfunction occurred
        if(taggerResults.size() != tokensInCurrentSentence.size())
          throw new GateRuntimeException(
              "POS Tagger malfunction: the output size (" +
              taggerResults.size() +
              ") is different from the input size (" +
              tokensInCurrentSentence.size() + ")!");

        //add the results: element 1 of each result array is stored as the
        //category of the corresponding token
        Iterator resIter = taggerResults.iterator();
        Iterator tokIter = tokensInCurrentSentence.iterator();
        while(resIter.hasNext()){
          ((Annotation)tokIter.next()).getFeatures().
            put(TOKEN_CATEGORY_FEATURE_NAME ,((String[])resIter.next())[1]);
        }

        //progress is only reported for real sentences
        if(currentSentence != null){
          fireProgressChanged(sentIndex++ * 100 / sentCnt);
        }
      }//while(sentencesIter.hasNext() || currentToken != null)

      fireProcessFinished();
      fireStatusChanged(
        document.getName() + " tagged in " +
        NumberFormat.getInstance().format(
        (double)(System.currentTimeMillis() - startTime) / 1000) +
        " seconds!");
    }catch(Exception e){
      throw new ExecutionException(e);
    }
  }


  /**
   * Sets the URL of the lexicon; it is read by {@link #init()}.
   * @param newLexiconURL the new lexicon URL
   */
  public void setLexiconURL(java.net.URL newLexiconURL) {
    lexiconURL = newLexiconURL;
  }

  /**
   * Gets the URL of the lexicon.
   * @return the lexicon URL, or <code>null</code> if none was set
   */
  public java.net.URL getLexiconURL() {
    return lexiconURL;
  }

  /**
   * Sets the URL of the rules; it is read by {@link #init()}.
   * @param newRulesURL the new rules URL
   */
  public void setRulesURL(java.net.URL newRulesURL) {
    rulesURL = newRulesURL;
  }

  /**
   * Gets the URL of the rules.
   * @return the rules URL, or <code>null</code> if none was set
   */
  public java.net.URL getRulesURL() {
    return rulesURL;
  }

  /**
   * Sets the name of the annotation set used for input and output.
   * @param newInputASName the annotation set name; <code>null</code> or the
   * empty string select the default annotation set
   */
  public void setInputASName(String newInputASName) {
    inputASName = newInputASName;
  }

  /**
   * Gets the name of the annotation set used for input and output.
   * @return the annotation set name, possibly <code>null</code>
   */
  public String getInputASName() {
    return inputASName;
  }

  /** The wrapped tagger, created by {@link #init()}. */
  protected hepple.postag.POSTagger tagger;
  /** URL of the lexicon passed to the wrapped tagger. */
  private java.net.URL lexiconURL;
  /** URL of the rules passed to the wrapped tagger. */
  private java.net.URL rulesURL;
  /** Name of the input annotation set; null means the default set. */
  private String inputASName;
}
|
POSTagger |
|