1
16
17 package gate.creole.gazetteer;
18
19 import java.util.*;
20 import gate.util.*;
21 import gate.*;
22 import gate.creole.*;
23
24
43
44 public class FlexibleGazetteer
45 extends AbstractLanguageAnalyser
46 implements ProcessingResource {
47
48
51 public FlexibleGazetteer() {
52 changedNodes = new ArrayList();
53 }
54
55
58 public Resource init() throws ResourceInstantiationException {
59
60
71 return this;
72 }
73
74
78 public void execute() throws ExecutionException {
79 fireProgressChanged(0);
80 fireStatusChanged("Checking Document...");
81 if (document == null) {
82 throw new ExecutionException(
83 "No document to process!"
84 );
85 }
86
87 fireStatusChanged("Creating temporary Document...");
88 StringBuffer newdocString = new StringBuffer(document.getContent().toString());
89 Document tempDoc = null;
90 boolean chineseSplit = false;
91
92 if (inputFeatureNames == null || inputFeatureNames.size() == 0) {
93 inputFeatureNames = new ArrayList();
94 }
95
96 Iterator tokenIter = getTokenIterator(document, inputAnnotationSetName);
97 long totalDeductedSpaces = 0;
98 fireStatusChanged("Replacing contents with the feature value...");
99
100 outer:while (tokenIter != null && tokenIter.hasNext()) {
101 Annotation currentToken = (Annotation) tokenIter.next();
102
103 if (currentToken.getType().equals(ANNIEConstants.
106 SPACE_TOKEN_ANNOTATION_TYPE) &&
107 ( (String) (currentToken.getFeatures().get(ANNIEConstants.
108 TOKEN_KIND_FEATURE_NAME))).equals("ChineseSplit")) {
109
110 long startOffset = currentToken.getStartNode().getOffset().
112 longValue();
113
114 long newStartOffset = startOffset - totalDeductedSpaces;
117 long newEndOffset = newStartOffset + 1;
118 NodePosition newNode = new NodePosition(startOffset, startOffset,
119 newStartOffset, newEndOffset,
120 totalDeductedSpaces);
121 chineseSplit = true;
122
123 totalDeductedSpaces--;
125 changedNodes.add(newNode);
126 newdocString = newdocString.insert( (int) newStartOffset, ' ');
127 continue outer;
128 }
129
130 inner:for (int i = 0; i < inputFeatureNames.size(); i++) {
134 String[] keyVal = ( (String) (inputFeatureNames.get(i))).split("[.]");
135
136 if (keyVal.length == 2) {
137 if (currentToken.getType().equals(keyVal[0])) {
140 FeatureMap features = currentToken.getFeatures();
141 String newTokenValue = (String) (features.get(keyVal[1]));
142
143 if (newTokenValue == null) {
145 continue;
146
147 }
148 else {
149 long startOffset = currentToken.getStartNode().getOffset().
152 longValue();
153 long endOffset = currentToken.getEndNode().getOffset().
154 longValue();
155
156 String actualString = (String) (features.get(ANNIEConstants.
158 TOKEN_STRING_FEATURE_NAME));
159
160 if (actualString.equals(newTokenValue)) {
163 break inner;
165 }
166
167 long lengthDifference = actualString.length() -
170 newTokenValue.length();
171
172 long newStartOffset = startOffset - totalDeductedSpaces;
174 long newEndOffset = newStartOffset + newTokenValue.length();
175
176 NodePosition newNode = new NodePosition(startOffset,
178 endOffset,
179 newStartOffset, newEndOffset, totalDeductedSpaces);
180 changedNodes.add(newNode);
181 totalDeductedSpaces += lengthDifference;
184
185 newdocString = newdocString.replace( (int) newStartOffset,
188 (int) newStartOffset +
189 actualString.length(),
190 newTokenValue);
191 break inner;
192 }
193 }
194 }
195 }
196 }
197
198 fireStatusChanged("New Document to be processed with Gazetteer...");
199 try {
200 FeatureMap params = Factory.newFeatureMap();
201 params.put("stringContent", newdocString.toString());
202 FeatureMap features = Factory.newFeatureMap();
203 Gate.setHiddenAttribute(features, true);
204 tempDoc = (Document) Factory.createResource("gate.corpora.DocumentImpl",
205 params, features);
206 }
207 catch (ResourceInstantiationException rie) {
208 throw new ExecutionException("Temporary document cannot be created");
209 }
210
211 FeatureMap params = Factory.newFeatureMap();
213 gazetteerInst.setDocument(tempDoc);
214 gazetteerInst.setAnnotationSetName(this.outputAnnotationSetName);
215
216 fireStatusChanged("Executing Gazetteer...");
217 gazetteerInst.execute();
218
219 fireStatusChanged("Transfering new tags to the original one...");
222 Iterator tokensIter = getTokenIterator(tempDoc, outputAnnotationSetName);
223 AnnotationSet original = (outputAnnotationSetName == null) ?
224 document.getAnnotations() :
225 document.getAnnotations(outputAnnotationSetName);
226 long totalSpaceAdded = 0;
227 long difference = 0;
228
229 int foundNode = -1;
230 while (tokensIter != null && tokensIter.hasNext()) {
231 Annotation currentToken = (Annotation) (tokensIter.next());
232 long startOffset = currentToken.getStartNode().getOffset().longValue();
233 long endOffset = currentToken.getEndNode().getOffset().longValue();
234
235 int i = foundNode + 1;
238 boolean found = false;
239 inner1:for (; i < changedNodes.size(); i++) {
240
241 NodePosition tempNode = (NodePosition) (changedNodes.get(i));
242
243 if (tempNode.getNewStartNode() > startOffset) {
247 i = i - 1;
252 break inner1;
253 }
254
255 if (tempNode.getNewStartNode() == startOffset) {
257
259 int k = i;
261 for (;
262 k >= 0 && k < changedNodes.size() &&
263 endOffset >
264 ( (NodePosition) changedNodes.get(k)).getNewStartNode(); k++)
265 ;
266 long spacesToAdd = 0;
267 if (k - 1 == i && k - 1 >= 0) {
268 spacesToAdd = (tempNode.getOldEndNode() - tempNode.getNewEndNode());
269 }
270 else if (k - 1 < 0) {
271 spacesToAdd = 0;
272 }
273 else {
274 spacesToAdd = ( (NodePosition) changedNodes.get(k - 1)).
275 getOldEndNode() -
276 ( (NodePosition) changedNodes.get(k - 1)).
277 getNewEndNode();
278 }
279
280 FeatureMap newFeatureMap = currentToken.getFeatures();
283 try {
284
285 original.add(new Long(startOffset +
286 (tempNode.getOldStartNode() -
287 tempNode.getNewStartNode())),
288 new Long(endOffset + spacesToAdd),
289 ANNIEConstants.LOOKUP_ANNOTATION_TYPE,
292 newFeatureMap);
293
294 }
295 catch (InvalidOffsetException ioe) {
296 throw new ExecutionException("Offset Error");
297 }
298 found = true;
299 foundNode = i;
300 break inner1;
301 }
302 }
303
304 if (!found) {
305 long totalStartSpaces = 0;
306 long totalEndSpaces = 0;
307
308 i = (changedNodes.size() == i) ? i - 1 : i;
311
312 int k = i;
314 for (;
315 k > 0 && k < changedNodes.size() &&
316 endOffset > ( (NodePosition) changedNodes.get(k)).getNewStartNode();
317 k++)
318 ;
319 long spacesToAdd = 0;
320 if (k - 1 == i && k - 1 >= 0) {
321 spacesToAdd = ( ( (NodePosition) changedNodes.get(i)).getOldEndNode() -
322 ( (NodePosition) changedNodes.get(i)).getNewEndNode());
323 }
324 else if (k - 1 < 0) {
325 spacesToAdd = 0;
326 }
327 else {
328 spacesToAdd = ( (NodePosition) changedNodes.get(k - 1)).
329 getOldEndNode() -
330 ( (NodePosition) changedNodes.get(k - 1)).getNewEndNode();
331 }
332
333 if (i >= 0) {
334 totalStartSpaces = ( (NodePosition) changedNodes.get(i)).
338 getOldEndNode() -
339 ( (NodePosition) changedNodes.get(i)).
340 getNewEndNode();
341 totalEndSpaces = spacesToAdd;
345 foundNode = i;
346 }
347
348 FeatureMap newFeatureMap = currentToken.getFeatures();
350 try {
351 original.add(new Long(startOffset + totalStartSpaces),
352 new Long(endOffset + totalEndSpaces),
353 ANNIEConstants.LOOKUP_ANNOTATION_TYPE,
354 newFeatureMap);
355 }
356 catch (InvalidOffsetException ioe) {
357 throw new ExecutionException("Offset Error");
358 }
359
360 }
361 }
362
363 Factory.deleteResource(tempDoc);
365 fireProcessFinished();
366 }
367
368
372 public void setDocument(gate.Document doc) {
373 this.document = doc;
374 }
375
376
380 public gate.Document getDocument() {
381 return this.document;
382 }
383
384
388 public void setOutputAnnotationSetName(String annName) {
389 this.outputAnnotationSetName = annName;
390 }
391
392
396 public String getOutputAnnotationSetName() {
397 return this.outputAnnotationSetName;
398 }
399
400
404 public void setInputAnnotationSetName(String annName) {
405 this.inputAnnotationSetName = annName;
406 }
407
408
412 public String getInputAnnotationSetName() {
413 return this.inputAnnotationSetName;
414 }
415
416
422 public void setInputFeatureNames(java.util.List inputs) {
423 this.inputFeatureNames = inputs;
424 }
425
426
431 public java.util.List getInputFeatureNames() {
432 return this.inputFeatureNames;
433 }
434
435 public Gazetteer getGazetteerInst() {
436 return this.gazetteerInst;
437 }
438
439 public void setGazetteerInst(gate.creole.gazetteer.Gazetteer gazetteerInst) {
440 this.gazetteerInst = gazetteerInst;
441 }
442
443
451 public Iterator getTokenIterator(gate.Document doc, String annotationSetName) {
452 AnnotationSet inputAs = (annotationSetName == null) ? doc.getAnnotations() :
453 doc.getAnnotations(annotationSetName);
454 AnnotationSet tempSet = inputAs.get();
455 if(tempSet == null)
456 return null;
457
458 List tokens = new ArrayList(inputAs.get());
459
460 if(tokens == null)
461 return null;
462
463 Comparator offsetComparator = new OffsetComparator();
464 Collections.sort(tokens, offsetComparator);
465 Iterator tokenIter = tokens.iterator();
466 return tokenIter;
467 }
468
469 private gate.Document document;
471 private java.lang.String outputAnnotationSetName;
472 private java.lang.String inputAnnotationSetName;
473
474 private Gazetteer gazetteerInst;
476 private java.util.List inputFeatureNames;
477
478 private ArrayList changedNodes;
480 }