1
18 package gate.creole.gazetteer;
19
20 import java.util.*;
21
22 import gate.*;
23 import gate.creole.*;
24 import gate.util.*;
25
26
49 public class DefaultGazetteer extends AbstractGazetteer {
50
51
53 private static final boolean DEBUG = false;
54
55 public static final String
56 DEF_GAZ_DOCUMENT_PARAMETER_NAME = "document";
57
58 public static final String
59 DEF_GAZ_ANNOT_SET_PARAMETER_NAME = "annotationSetName";
60
61 public static final String
62 DEF_GAZ_LISTS_URL_PARAMETER_NAME = "listsURL";
63
64 public static final String
65 DEF_GAZ_ENCODING_PARAMETER_NAME = "encoding";
66
67 public static final String
68 DEF_GAZ_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";
69
70
71
72 private Map listsByNode;
73
74
/**
 * Creates an empty, not yet initialised gazetteer. No state is built here;
 * the lookup structures are created by {@code init()}.
 */
public DefaultGazetteer() {
  super();
}
79
80
83 public Resource init()throws ResourceInstantiationException{
84 fsmStates = new HashSet();
85 initialState = new FSMState(this);
86 if(listsURL == null){
87 throw new ResourceInstantiationException (
88 "No URL provided for gazetteer creation!");
89 }
90 definition = new LinearDefinition();
91 definition.setURL(listsURL);
92 definition.load();
93 int linesCnt = definition.size();
94 listsByNode = definition.loadLists();
95 Iterator inodes = definition.iterator();
96
97 int nodeIdx = 0;
98 LinearNode node;
99 while (inodes.hasNext()) {
100 node = (LinearNode) inodes.next();
101 fireStatusChanged("Reading " + node.toString());
102 fireProgressChanged(++nodeIdx * 100 / linesCnt);
103 readList(node,true);
104 } fireProcessFinished();
106 return this;
107 }
108
109
110
118 void readList(LinearNode node, boolean add) throws ResourceInstantiationException{
119 String listName, majorType, minorType, languages;
120 if ( null == node ) {
121 throw new ResourceInstantiationException(" LinearNode node is null ");
122 }
123
124 listName = node.getList();
125 majorType = node.getMajorType();
126 minorType = node.getMinorType();
127 languages = node.getLanguage();
128 GazetteerList gazList = (GazetteerList)listsByNode.get(node);
129 if (null == gazList) {
130 throw new ResourceInstantiationException("gazetteer list not found by node");
131 }
132
133 Iterator iline = gazList.iterator();
134
135 Lookup lookup = new Lookup(listName,majorType, minorType, languages);
136 lookup.list = node.getList();
137 if ( null != mappingDefinition){
138 MappingNode mnode = mappingDefinition.getNodeByList(lookup.list);
139 if (null!=mnode){
140 lookup.oClass = mnode.getClassID();
141 lookup.ontology = mnode.getOntologyID();
142 }
143 }
145 String line;
146 while(iline.hasNext()){
147 line = iline.next().toString();
148 if(add)addLookup(line, lookup);
149 else removeLookup(line, lookup);
150 }
151 }
153
159
188 public void addLookup(String text, Lookup lookup) {
190 char currentChar;
191 FSMState currentState = initialState;
192 FSMState nextState;
193 Lookup oldLookup;
194 boolean isSpace;
195
196 for(int i = 0; i< text.length(); i++) {
197 currentChar = text.charAt(i);
198 isSpace = Character.isWhitespace(currentChar);
199 if(isSpace) currentChar = ' ';
200 else currentChar = (caseSensitive.booleanValue()) ?
201 currentChar :
202 Character.toUpperCase(currentChar) ;
203 nextState = currentState.next(currentChar);
204 if(nextState == null){
205 nextState = new FSMState(this);
206 currentState.put(currentChar, nextState);
207 if(isSpace) nextState.put(' ',nextState);
208 }
209 currentState = nextState;
210 }
212 currentState.addLookup(lookup);
213
215 }
218
223
243 public void removeLookup(String text, Lookup lookup) {
245 char currentChar;
246 FSMState currentState = initialState;
247 FSMState nextState;
248 Lookup oldLookup;
249
250 for(int i = 0; i< text.length(); i++) {
251 currentChar = text.charAt(i);
252 if(Character.isWhitespace(currentChar)) currentChar = ' ';
253 nextState = currentState.next(currentChar);
254 if(nextState == null) return; currentState = nextState;
256 } currentState.removeLookup(lookup);
258 }
261
264 public String getFSMgml() {
265 String res = "graph[ \ndirected 1\n";
266 StringBuffer nodes = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE),
268 edges = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE);
269 Iterator fsmStatesIter = fsmStates.iterator();
270 while (fsmStatesIter.hasNext()){
271 FSMState currentState = (FSMState)fsmStatesIter.next();
272 int stateIndex = currentState.getIndex();
273
276 nodes.append("node[ id ");
277 nodes.append(stateIndex);
278 nodes.append(" label \"");
279 nodes.append(stateIndex);
280
281 if(currentState.isFinal()){
282 nodes.append(",F\\n");
284 nodes.append(currentState.getLookupSet());
285 }
286 nodes.append("\" ]\n");
288 edges.append(currentState.getEdgesGML());
290 }
291 res += nodes.toString() + edges.toString() + "]\n";
292 return res;
293 }
295
296
302 public static boolean isWordInternal(char ch){
303 return Character.isLetter(ch) ||
304 Character.getType(ch) == Character.COMBINING_SPACING_MARK ||
305 Character.getType(ch) == Character.NON_SPACING_MARK;
306 }
307
308
312 public void execute() throws ExecutionException{
313 interrupted = false;
314 AnnotationSet annotationSet;
315 if(document == null) {
317 throw new ExecutionException(
318 "No document to process!"
319 );
320 }
321
322 if(annotationSetName == null ||
323 annotationSetName.equals("")) annotationSet = document.getAnnotations();
324 else annotationSet = document.getAnnotations(annotationSetName);
325
326 fireStatusChanged("Doing lookup in " +
327 document.getName() + "...");
328 String content = document.getContent().toString();
329 int length = content.length();
330
334 char currentChar;
336 FSMState currentState = initialState;
338 FSMState nextState;
339 FSMState lastMatchingState = null;
340 int matchedRegionEnd = 0;
341 int matchedRegionStart = 0;
342 int charIdx = 0;
343 int oldCharIdx = 0;
344 FeatureMap fm;
345 Lookup currentLookup;
346
347
357 while(charIdx < length) {
359 currentChar = content.charAt(charIdx);
360 if(Character.isWhitespace(currentChar)) currentChar = ' ';
361 else currentChar = caseSensitive.booleanValue() ?
362 currentChar :
363 Character.toUpperCase(currentChar);
364 nextState = currentState.next(currentChar);
366 if(nextState == null) {
367
369 if(lastMatchingState != null){
371 Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
373
374 while(lookupIter.hasNext()) {
375 currentLookup = (Lookup)lookupIter.next();
376 fm = Factory.newFeatureMap();
377 fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
378 if (null!= currentLookup.oClass && null!=currentLookup.ontology){
379 fm.put(LOOKUP_CLASS_FEATURE_NAME,currentLookup.oClass);
380 fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME,currentLookup.ontology);
381 }
382 if(null != currentLookup.minorType) {
383 fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
384 if(null != currentLookup.languages)
385 fm.put("language", currentLookup.languages);
386 }
387 try {
388 annotationSet.add(new Long(matchedRegionStart),
389 new Long(matchedRegionEnd + 1),
390 LOOKUP_ANNOTATION_TYPE,
391 fm);
392 } catch(InvalidOffsetException ioe) {
393 throw new LuckyException(ioe.toString());
394 }
395 } lastMatchingState = null;
397 }
398
399 charIdx = matchedRegionStart + 1;
401 matchedRegionStart = charIdx;
402 currentState = initialState;
403
404 } else{ currentState = nextState;
406 if(currentState.isFinal() &&
408 (
409 (!wholeWordsOnly.booleanValue())
410 ||
411 ((matchedRegionStart == 0 ||
412 !isWordInternal(content.charAt(matchedRegionStart - 1)))
413 &&
414 (charIdx + 1 >= content.length() ||
415 !isWordInternal(content.charAt(charIdx + 1)))
416 )
417 )
418 ){
419 matchedRegionEnd = charIdx;
420 lastMatchingState = currentState;
421 }
422 charIdx ++;
423 if(charIdx == content.length()){
424 if(lastMatchingState != null){
427 Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
429
430 while(lookupIter.hasNext()) {
431 currentLookup = (Lookup)lookupIter.next();
432 fm = Factory.newFeatureMap();
433 fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
434 if (null!= currentLookup.oClass && null!=currentLookup.ontology){
435 fm.put(LOOKUP_CLASS_FEATURE_NAME,currentLookup.oClass);
436 fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME,currentLookup.ontology);
437 }
438 if(null != currentLookup.minorType) {
439 fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
440 if(null != currentLookup.languages)
441 fm.put("language", currentLookup.languages);
442 }
443 try {
444 annotationSet.add(new Long(matchedRegionStart),
445 new Long(matchedRegionEnd + 1),
446 LOOKUP_ANNOTATION_TYPE,
447 fm);
448 } catch(InvalidOffsetException ioe) {
449 throw new LuckyException(ioe.toString());
450 }
451 } lastMatchingState = null;
453 }
454
455 charIdx = matchedRegionStart + 1;
457 matchedRegionStart = charIdx;
458 currentState = initialState;
459 }
460 }
461 if(charIdx - oldCharIdx > 256) {
462 fireProgressChanged((100 * charIdx )/ length );
463 oldCharIdx = charIdx;
464 if(isInterrupted()) throw new ExecutionInterruptedException(
465 "The execution of the " + getName() +
466 " gazetteer has been abruptly interrupted!");
467 }
468 }
470 if(lastMatchingState != null) {
471 Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
472 while(lookupIter.hasNext()) {
473 currentLookup = (Lookup)lookupIter.next();
474 fm = Factory.newFeatureMap();
475 fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
476 if (null!= currentLookup.oClass && null!=currentLookup.ontology){
477 fm.put(LOOKUP_CLASS_FEATURE_NAME,currentLookup.oClass);
478 fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME,currentLookup.ontology);
479 }
480
481 if(null != currentLookup.minorType)
482 fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
483 try{
484 annotationSet.add(new Long(matchedRegionStart),
485 new Long(matchedRegionEnd + 1),
486 LOOKUP_ANNOTATION_TYPE,
487 fm);
488 } catch(InvalidOffsetException ioe) {
489 throw new GateRuntimeException(ioe.toString());
490 }
491 } }
493 fireProcessFinished();
494 fireStatusChanged("Lookup complete!");
495 }
497
498
/** Start state of the lookup FSM; (re)created by init(). */
FSMState initialState;

/**
 * The states of the FSM. Replaced by an empty HashSet in init() and
 * iterated (as FSMState elements) by getFSMgml(). Where states are added
 * is not visible in this file -- presumably when an FSMState is
 * constructed with this gazetteer as owner; verify against FSMState.
 */
Set fsmStates;
505
506
509 public Set lookup(String singleItem) {
510 char currentChar;
511 Set set = new HashSet();
512 FSMState currentState = initialState;
513 FSMState nextState;
514
515 for(int i = 0; i< singleItem.length(); i++) {
516 currentChar = singleItem.charAt(i);
517 if(Character.isWhitespace(currentChar)) currentChar = ' ';
518 nextState = currentState.next(currentChar);
519 if(nextState == null) {
520 return set;
521 }
522 currentState = nextState;
523 } set = currentState.getLookupSet();
525 return set;
526 }
527
528 public boolean remove(String singleItem) {
529 char currentChar;
530 FSMState currentState = initialState;
531 FSMState nextState;
532 Lookup oldLookup;
533
534 for(int i = 0; i< singleItem.length(); i++) {
535 currentChar = singleItem.charAt(i);
536 if(Character.isWhitespace(currentChar)) currentChar = ' ';
537 nextState = currentState.next(currentChar);
538 if(nextState == null) {
539 return false;
540 } currentState = nextState;
542 } currentState.lookupSet = new HashSet();
544 return true;
545 }
546
547 public boolean add(String singleItem, Lookup lookup) {
548 addLookup(singleItem,lookup);
549 return true;
550 }
551
552
553 }
/**
 * Iterator-like interface yielding primitive {@code char} values.
 * No implementation is visible in this file; the method names suggest the
 * usual {@code java.util.Iterator} contract -- confirm against the
 * implementors.
 */
interface Iter {

  /** @return whether another character is available (presumably). */
  boolean hasNext();

  /** @return the next character (presumably advancing the iteration). */
  char next();
}
562
/**
 * A small map from {@code char} keys to objects, kept as two parallel
 * arrays sorted by key and searched with binary search. Both arrays stay
 * null until the first {@code put}.
 */
class charMap {
  /** Keys in ascending order; null while the map is empty. */
  char[] itemsKeys = null;
  /** Values; {@code itemsObjs[i]} belongs to {@code itemsKeys[i]}. */
  Object[] itemsObjs = null;

  /**
   * Grows both arrays by one slot, leaving a gap at {@code index}:
   * elements before {@code index} keep their position, elements from
   * {@code index} onwards move one place up. The gap itself holds the
   * default value ({@code '\0'} / {@code null}) until the caller fills it.
   *
   * @param index position of the free slot in the enlarged arrays
   */
  void resize(int index) {
    int oldLen = itemsKeys.length;
    char[] grownKeys = new char[oldLen + 1];
    Object[] grownObjs = new Object[oldLen + 1];

    // head: [0, index) stays in place
    System.arraycopy(itemsKeys, 0, grownKeys, 0, index);
    System.arraycopy(itemsObjs, 0, grownObjs, 0, index);
    // tail: [index, oldLen) shifts up by one
    System.arraycopy(itemsKeys, index, grownKeys, index + 1, oldLen - index);
    System.arraycopy(itemsObjs, index, grownObjs, index + 1, oldLen - index);

    itemsKeys = grownKeys;
    itemsObjs = grownObjs;
  }

  /**
   * Looks up the value stored for a key.
   *
   * @param key the key to search for
   * @return the stored value, or {@code null} if the key is absent
   */
  Object get(char key) {
    if (itemsKeys == null) {
      return null;
    }
    int pos = Arrays.binarySearch(itemsKeys, key);
    return pos >= 0 ? itemsObjs[pos] : null;
  }

  /**
   * Stores {@code value} under {@code key} if the key is not present yet.
   * An existing mapping is left untouched.
   *
   * @param key   the key
   * @param value the value to store when the key is new
   * @return the value held for {@code key} after the call: {@code value}
   *         for a new key, the previously stored value otherwise
   */
  Object put(char key, Object value) {
    if (itemsKeys == null) {
      itemsKeys = new char[] { key };
      itemsObjs = new Object[] { value };
      return value;
    }

    int pos = Arrays.binarySearch(itemsKeys, key);
    if (pos >= 0) {
      return itemsObjs[pos];
    }

    // binarySearch reports a miss as -(insertionPoint) - 1
    int insertAt = ~pos;
    resize(insertAt);
    itemsKeys[insertAt] = key;
    itemsObjs[insertAt] = value;
    return value;
  }

}