1   /*
2    * DefaultGazeteer.java
3    *
4    * Copyright (c) 1998-2004, The University of Sheffield.
5    *
6    * This file is part of GATE (see http://gate.ac.uk/), and is free
7    * software, licenced under the GNU Library General Public License,
8    * Version 2, June1991.
9    *
10   * A copy of this licence is included in the distribution in the file
11   * licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
12   *
13   * Valentin Tablan, 03/07/2000
14   * borislav popov 24/03/2002
15   *
16   * $Id: DefaultGazetteer.java,v 1.48 2004/07/21 17:10:04 akshay Exp $
17   */
18  package gate.creole.gazetteer;
19  
20  import java.util.*;
21  
22  import gate.*;
23  import gate.creole.*;
24  import gate.util.*;
25  
26  /** This component is responsible for doing lists lookup. The implementaion is
27   * based on finite state machines.
28   * The phrases to be recognised should be listed in a set of files, one for
29   * each type of occurences.
30   * The gazeteer is build with the information from a file that contains the set
31   * of lists (which are files as well) and the associated type for each list.
32   * The file defining the set of lists should have the following syntax:
33   * each list definition should be written on its own line and should contain:
34   * <ol>
35   * <li>the file name (required) </li>
36   * <li>the major type (required) </li>
37   * <li>the minor type (optional)</li>
38   * <li>the language(s) (optional) </li>
39   * </ol>
40   * The elements of each definition are separated by &quot;:&quot;.
41   * The following is an example of a valid definition: <br>
42   * <code>personmale.lst:person:male:english</code>
43   * Each list file named in the lists definition file is just a list containing
44   * one entry per line.
45   * When this gazetter will be run over some input text (a Gate document) it
46   * will generate annotations of type Lookup having the attributes specified in
47   * the definition file.
48   */
49  public class DefaultGazetteer extends AbstractGazetteer {
50  
51    /** Debug flag
52     */
53    private static final boolean DEBUG = false;
54  
55    public static final String
56      DEF_GAZ_DOCUMENT_PARAMETER_NAME = "document";
57  
58    public static final String
59      DEF_GAZ_ANNOT_SET_PARAMETER_NAME = "annotationSetName";
60  
61    public static final String
62      DEF_GAZ_LISTS_URL_PARAMETER_NAME = "listsURL";
63  
64    public static final String
65      DEF_GAZ_ENCODING_PARAMETER_NAME = "encoding";
66  
67    public static final String
68      DEF_GAZ_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";
69  
70  
71    /** a map of nodes vs gaz lists */
72    private Map listsByNode;
73  
74    /** 
75     * Build a gazetter using the default lists from the gate resources
76     */
77    public DefaultGazetteer(){
78    }
79  
80    /** Does the actual loading and parsing of the lists. This method must be
81     * called before the gazetteer can be used
82     */
83    public Resource init()throws ResourceInstantiationException{
84      fsmStates = new HashSet();
85      initialState = new FSMState(this);
86      if(listsURL == null){
87        throw new ResourceInstantiationException (
88              "No URL provided for gazetteer creation!");
89      }
90      definition = new LinearDefinition();
91      definition.setURL(listsURL);
92      definition.load();
93      int linesCnt = definition.size();
94      listsByNode = definition.loadLists();
95      Iterator inodes = definition.iterator();
96  
97      int nodeIdx = 0;
98      LinearNode node;
99      while (inodes.hasNext()) {
100       node = (LinearNode) inodes.next();
101       fireStatusChanged("Reading " + node.toString());
102       fireProgressChanged(++nodeIdx * 100 / linesCnt);
103       readList(node,true);
104     } // while iline
105     fireProcessFinished();
106     return this;
107   }
108 
109 
110   /** Reads one lists (one file) of phrases
111    *
112    * @param node the node
113    * @param add if <b>true</b> will add the phrases found in the list to the ones
114    *     recognised by this gazetter, if <b>false</b> the phrases found in the
115    *     list will be removed from the list of phrases recognised by this
116    *     gazetteer.
117    */
118   void readList(LinearNode node, boolean add) throws ResourceInstantiationException{
119     String listName, majorType, minorType, languages;
120     if ( null == node ) {
121       throw new ResourceInstantiationException(" LinearNode node is null ");
122     }
123 
124     listName = node.getList();
125     majorType = node.getMajorType();
126     minorType = node.getMinorType();
127     languages = node.getLanguage();
128     GazetteerList gazList = (GazetteerList)listsByNode.get(node);
129     if (null == gazList) {
130       throw new ResourceInstantiationException("gazetteer list not found by node");
131     }
132 
133     Iterator iline = gazList.iterator();
134 
135     Lookup lookup = new Lookup(listName,majorType, minorType, languages);
136     lookup.list = node.getList();
137     if ( null != mappingDefinition){
138       MappingNode mnode = mappingDefinition.getNodeByList(lookup.list);
139       if (null!=mnode){
140         lookup.oClass = mnode.getClassID();
141         lookup.ontology = mnode.getOntologyID();
142       }
143     }//if mapping def
144 
145     String line;
146     while(iline.hasNext()){
147       line = iline.next().toString();
148       if(add)addLookup(line, lookup);
149       else removeLookup(line, lookup);
150     }
151   } // void readList(String listDesc)
152 
153   /** Adds one phrase to the list of phrases recognised by this gazetteer
154    *
155    * @param text the phrase to be added
156    * @param lookup the description of the annotation to be added when this
157    *     phrase is recognised
158    */
159 // >>> DAM, was
160 /*
161   public void addLookup(String text, Lookup lookup) {
162     Character currentChar;
163     FSMState currentState = initialState;
164     FSMState nextState;
165     Lookup oldLookup;
166     boolean isSpace;
167 
168     for(int i = 0; i< text.length(); i++) {
169       isSpace = Character.isWhitespace(text.charAt(i));
170       if(isSpace) currentChar = new Character(' ');
171       else currentChar = (caseSensitive.booleanValue()) ?
172                           new Character(text.charAt(i)) :
173                           new Character(Character.toUpperCase(text.charAt(i))) ;
174       nextState = currentState.next(currentChar);
175       if(nextState == null){
176         nextState = new FSMState(this);
177         currentState.put(currentChar, nextState);
178         if(isSpace) nextState.put(new Character(' '),nextState);
179       }
180       currentState = nextState;
181     } //for(int i = 0; i< text.length(); i++)
182 
183     currentState.addLookup(lookup);
184     //Out.println(text + "|" + lookup.majorType + "|" + lookup.minorType);
185 
186   } // addLookup
187 */
188 // >>> DAM: TransArray optimization
189   public void addLookup(String text, Lookup lookup) {
190     char currentChar;
191     FSMState currentState = initialState;
192     FSMState nextState;
193     Lookup oldLookup;
194     boolean isSpace;
195 
196     for(int i = 0; i< text.length(); i++) {
197         currentChar = text.charAt(i);
198         isSpace = Character.isWhitespace(currentChar);
199         if(isSpace) currentChar = ' ';
200         else currentChar = (caseSensitive.booleanValue()) ?
201                           currentChar :
202                           Character.toUpperCase(currentChar) ;
203       nextState = currentState.next(currentChar);
204       if(nextState == null){
205         nextState = new FSMState(this);
206         currentState.put(currentChar, nextState);
207         if(isSpace) nextState.put(' ',nextState);
208       }
209       currentState = nextState;
210     } //for(int i = 0; i< text.length(); i++)
211 
212     currentState.addLookup(lookup);
213     //Out.println(text + "|" + lookup.majorType + "|" + lookup.minorType);
214 
215   } // addLookup
216 // >>> DAM, end
217 
218   /** Removes one phrase to the list of phrases recognised by this gazetteer
219    *
220    * @param text the phrase to be removed
221    * @param lookup the description of the annotation associated to this phrase
222    */
223 // >>> DAM, was
224 /*
225   public void removeLookup(String text, Lookup lookup) {
226     Character currentChar;
227     FSMState currentState = initialState;
228     FSMState nextState;
229     Lookup oldLookup;
230     boolean isSpace;
231 
232     for(int i = 0; i< text.length(); i++) {
233       isSpace = Character.isWhitespace(text.charAt(i));
234       if(isSpace) currentChar = new Character(' ');
235       else currentChar = new Character(text.charAt(i));
236       nextState = currentState.next(currentChar);
237       if(nextState == null) return;//nothing to remove
238       currentState = nextState;
239     } //for(int i = 0; i< text.length(); i++)
240     currentState.removeLookup(lookup);
241   } // removeLookup
242 */
243 // >>> DAM: TransArray optimization
244   public void removeLookup(String text, Lookup lookup) {
245     char currentChar;
246     FSMState currentState = initialState;
247     FSMState nextState;
248     Lookup oldLookup;
249 
250     for(int i = 0; i< text.length(); i++) {
251         currentChar = text.charAt(i);
252         if(Character.isWhitespace(currentChar)) currentChar = ' ';
253         nextState = currentState.next(currentChar);
254         if(nextState == null) return;//nothing to remove
255         currentState = nextState;
256     } //for(int i = 0; i< text.length(); i++)
257     currentState.removeLookup(lookup);
258   } // removeLookup
259 // >>> DAM, end
260 
261   /** Returns a string representation of the deterministic FSM graph using
262    * GML.
263    */
264   public String getFSMgml() {
265     String res = "graph[ \ndirected 1\n";
266     ///String nodes = "", edges = "";
267     StringBuffer nodes = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE),
268                 edges = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE);
269     Iterator fsmStatesIter = fsmStates.iterator();
270     while (fsmStatesIter.hasNext()){
271       FSMState currentState = (FSMState)fsmStatesIter.next();
272       int stateIndex = currentState.getIndex();
273       /*nodes += "node[ id " + stateIndex +
274                " label \"" + stateIndex;
275       */
276       nodes.append("node[ id ");
277       nodes.append(stateIndex);
278       nodes.append(" label \"");
279       nodes.append(stateIndex);
280 
281              if(currentState.isFinal()){
282               ///nodes += ",F\\n" + currentState.getLookupSet();
283               nodes.append(",F\\n");
284               nodes.append(currentState.getLookupSet());
285              }
286              ///nodes +=  "\"  ]\n";
287              nodes.append("\"  ]\n");
288       //edges += currentState.getEdgesGML();
289       edges.append(currentState.getEdgesGML());
290     }
291     res += nodes.toString() + edges.toString() + "]\n";
292     return res;
293   } // getFSMgml
294 
295 
296   /**
297    * Tests whether a character is internal to a word (i.e. if it's a letter or
298    * a combining mark (spacing or not)).
299    * @param ch the character to be tested
300    * @return a boolean value
301    */
302   public static boolean isWordInternal(char ch){
303     return Character.isLetter(ch) ||
304            Character.getType(ch) == Character.COMBINING_SPACING_MARK ||
305            Character.getType(ch) == Character.NON_SPACING_MARK;
306   }
307 
308   /**
309    * This method runs the gazetteer. It assumes that all the needed parameters
310    * are set. If they are not, an exception will be fired.
311    */
312   public void execute() throws ExecutionException{
313     interrupted = false;
314     AnnotationSet annotationSet;
315     //check the input
316     if(document == null) {
317       throw new ExecutionException(
318         "No document to process!"
319       );
320     }
321 
322     if(annotationSetName == null ||
323        annotationSetName.equals("")) annotationSet = document.getAnnotations();
324     else annotationSet = document.getAnnotations(annotationSetName);
325 
326     fireStatusChanged("Doing lookup in " +
327                            document.getName() + "...");
328     String content = document.getContent().toString();
329     int length = content.length();
330 // >>> DAM, was
331 /*
332     Character currentChar;
333 */
334 // >>> DAM: TransArray optimization
335     char currentChar;
336 // >>> DAM, end
337     FSMState currentState = initialState;
338     FSMState nextState;
339     FSMState lastMatchingState = null;
340     int matchedRegionEnd = 0;
341     int matchedRegionStart = 0;
342     int charIdx = 0;
343     int oldCharIdx = 0;
344     FeatureMap fm;
345     Lookup currentLookup;
346 
347 // >>> DAM, was
348 /*
349     while(charIdx < length) {
350       if(Character.isWhitespace(content.charAt(charIdx)))
351         currentChar = new Character(' ');
352       else currentChar = (caseSensitive.booleanValue()) ?
353                          new Character(content.charAt(charIdx)) :
354                          new Character(Character.toUpperCase(
355                                        content.charAt(charIdx)));
356 */
357 // >>> DAM: TransArray optimization
358     while(charIdx < length) {
359       currentChar = content.charAt(charIdx);
360       if(Character.isWhitespace(currentChar)) currentChar = ' ';
361       else currentChar = caseSensitive.booleanValue() ?
362                           currentChar :
363                           Character.toUpperCase(currentChar);
364 // >>> DAM, end
365       nextState = currentState.next(currentChar);
366       if(nextState == null) {
367         //the matching stopped
368 
369         //if we had a successful match then act on it;
370         if(lastMatchingState != null){
371           //let's add the new annotation(s)
372           Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
373 
374           while(lookupIter.hasNext()) {
375             currentLookup = (Lookup)lookupIter.next();
376             fm = Factory.newFeatureMap();
377             fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
378             if (null!= currentLookup.oClass && null!=currentLookup.ontology){
379               fm.put(LOOKUP_CLASS_FEATURE_NAME,currentLookup.oClass);
380               fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME,currentLookup.ontology);
381             }
382             if(null != currentLookup.minorType) {
383               fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
384               if(null != currentLookup.languages)
385                 fm.put("language", currentLookup.languages);
386             }
387             try {
388               annotationSet.add(new Long(matchedRegionStart),
389                               new Long(matchedRegionEnd + 1),
390                               LOOKUP_ANNOTATION_TYPE,
391                               fm);
392             } catch(InvalidOffsetException ioe) {
393               throw new LuckyException(ioe.toString());
394             }
395           }//while(lookupIter.hasNext())
396           lastMatchingState = null;
397         }
398 
399         //reset the FSM
400         charIdx = matchedRegionStart + 1;
401         matchedRegionStart = charIdx;
402         currentState = initialState;
403 
404       } else{//go on with the matching
405         currentState = nextState;
406         //if we have a successful state then store it
407         if(currentState.isFinal() &&
408            (
409             (!wholeWordsOnly.booleanValue())
410              ||
411             ((matchedRegionStart == 0 ||
412              !isWordInternal(content.charAt(matchedRegionStart - 1)))
413              &&
414              (charIdx + 1 >= content.length()   ||
415              !isWordInternal(content.charAt(charIdx + 1)))
416             )
417            )
418           ){
419           matchedRegionEnd = charIdx;
420           lastMatchingState = currentState;
421         }
422         charIdx ++;
423         if(charIdx == content.length()){
424           //we can't go on, use the last matching state and restart matching
425           //from the next char
426           if(lastMatchingState != null){
427             //let's add the new annotation(s)
428             Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
429 
430             while(lookupIter.hasNext()) {
431               currentLookup = (Lookup)lookupIter.next();
432               fm = Factory.newFeatureMap();
433               fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
434               if (null!= currentLookup.oClass && null!=currentLookup.ontology){
435                 fm.put(LOOKUP_CLASS_FEATURE_NAME,currentLookup.oClass);
436                 fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME,currentLookup.ontology);
437               }
438               if(null != currentLookup.minorType) {
439                 fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
440                 if(null != currentLookup.languages)
441                   fm.put("language", currentLookup.languages);
442               }
443               try {
444                 annotationSet.add(new Long(matchedRegionStart),
445                                 new Long(matchedRegionEnd + 1),
446                                 LOOKUP_ANNOTATION_TYPE,
447                                 fm);
448               } catch(InvalidOffsetException ioe) {
449                 throw new LuckyException(ioe.toString());
450               }
451             }//while(lookupIter.hasNext())
452             lastMatchingState = null;
453           }
454 
455           //reset the FSM
456           charIdx = matchedRegionStart + 1;
457           matchedRegionStart = charIdx;
458           currentState = initialState;
459         }
460       }
461       if(charIdx - oldCharIdx > 256) {
462         fireProgressChanged((100 * charIdx )/ length );
463         oldCharIdx = charIdx;
464         if(isInterrupted()) throw new ExecutionInterruptedException(
465             "The execution of the " + getName() +
466             " gazetteer has been abruptly interrupted!");
467       }
468     } // while(charIdx < length)
469 
470     if(lastMatchingState != null) {
471       Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
472       while(lookupIter.hasNext()) {
473         currentLookup = (Lookup)lookupIter.next();
474         fm = Factory.newFeatureMap();
475         fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
476         if (null!= currentLookup.oClass && null!=currentLookup.ontology){
477           fm.put(LOOKUP_CLASS_FEATURE_NAME,currentLookup.oClass);
478           fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME,currentLookup.ontology);
479         }
480 
481         if(null != currentLookup.minorType)
482           fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
483         try{
484           annotationSet.add(new Long(matchedRegionStart),
485                           new Long(matchedRegionEnd + 1),
486                           LOOKUP_ANNOTATION_TYPE,
487                           fm);
488         } catch(InvalidOffsetException ioe) {
489           throw new GateRuntimeException(ioe.toString());
490         }
491       }//while(lookupIter.hasNext())
492     }
493     fireProcessFinished();
494     fireStatusChanged("Lookup complete!");
495   } // execute
496 
497 
498   /** The initial state of the FSM that backs this gazetteer
499    */
500   FSMState initialState;
501 
502   /** A set containing all the states of the FSM backing the gazetteer
503    */
504   Set fsmStates;
505 
506   /**lookup <br>
507    * @param singleItem a single string to be looked up by the gazetteer
508    * @return set of the Lookups associated with the parameter*/
509   public Set lookup(String singleItem) {
510     char currentChar;
511     Set set = new HashSet();
512     FSMState currentState = initialState;
513     FSMState nextState;
514 
515     for(int i = 0; i< singleItem.length(); i++) {
516         currentChar = singleItem.charAt(i);
517         if(Character.isWhitespace(currentChar)) currentChar = ' ';
518         nextState = currentState.next(currentChar);
519         if(nextState == null) {
520           return set;
521         }
522         currentState = nextState;
523     } //for(int i = 0; i< text.length(); i++)
524     set = currentState.getLookupSet();
525     return set;
526   }
527 
528   public boolean remove(String singleItem) {
529     char currentChar;
530     FSMState currentState = initialState;
531     FSMState nextState;
532     Lookup oldLookup;
533 
534     for(int i = 0; i< singleItem.length(); i++) {
535         currentChar = singleItem.charAt(i);
536         if(Character.isWhitespace(currentChar)) currentChar = ' ';
537         nextState = currentState.next(currentChar);
538         if(nextState == null) {
539           return false;
540         }//nothing to remove
541         currentState = nextState;
542     } //for(int i = 0; i< text.length(); i++)
543     currentState.lookupSet = new HashSet();
544     return true;
545   }
546 
547   public boolean add(String singleItem, Lookup lookup) {
548     addLookup(singleItem,lookup);
549     return true;
550   }
551 
552 
553 } // DefaultGazetteer
554 
555 // >>> DAM: TransArray optimization, new charMap implementation
556 interface Iter
557 {
558     public boolean hasNext();
559     public char next();
560 } // iter class
561 
562 /**
563  * class implementing the map using binary serach by char as key
564  * to retrive the coresponding object.
565  */
566 class charMap
567 {
568     char[] itemsKeys = null;
569     Object[] itemsObjs = null;
570 
571     /**
572      * resize the containers by one leavaing empty elemant at position 'index'
573      */
574     void resize(int index)
575     {
576         int newsz = itemsKeys.length + 1;
577         char[] tempKeys = new char[newsz];
578         Object[] tempObjs = new Object[newsz];
579         int i;
580         for (i= 0; i < index; i++)
581         {
582             tempKeys[i] = itemsKeys[i];
583             tempObjs[i] = itemsObjs[i];
584         }
585         for (i= index+1; i < newsz; i++)
586         {
587             tempKeys[i] = itemsKeys[i-1];
588             tempObjs[i] = itemsObjs[i-1];
589         }
590 
591         itemsKeys = tempKeys;
592         itemsObjs = tempObjs;
593     } // resize
594 
595 /**
596  * get the object from the map using the char key
597  */
598     Object get(char key)
599     {
600         if (itemsKeys == null) return null;
601         int index = Arrays.binarySearch(itemsKeys, key);
602         if (index<0)
603             return null;
604         return itemsObjs[index];
605     }
606 /**
607  * put the object into the char map using the chat as the key
608  */
609     Object put(char key, Object value)
610     {
611         if (itemsKeys == null)
612         {
613             itemsKeys = new char[1];
614             itemsKeys[0] = key;
615             itemsObjs = new Object[1];
616             itemsObjs[0] = value;
617             return value;
618         }// if first time
619         int index = Arrays.binarySearch(itemsKeys, key);
620         if (index<0)
621         {
622             index = ~index;
623             resize(index);
624             itemsKeys[index] = key;
625             itemsObjs[index] = value;
626         }
627         return itemsObjs[index];
628     } // put
629 /**
630  * the keys itereator
631  * /
632     public Iter iter()
633     {
634         return new Iter()
635         {
636             int counter = 0;
637             public boolean hasNext() {return counter < itemsKeys.length;}
638             public char next() { return itemsKeys[counter];}
639         };
640     } // iter()
641  */
642 
643 } // class charMap
644 // >>> DAM, end, new charMap instead MAP for transition function in the FSMState