1   /*
2    * DefaultGazeteer.java
3    *
4    * Copyright (c) 2000-2001, The University of Sheffield.
5    *
6    * This file is part of GATE (see http://gate.ac.uk/), and is free
7    * software, licenced under the GNU Library General Public License,
8    * Version 2, June1991.
9    *
10   * A copy of this licence is included in the distribution in the file
11   * licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
12   *
13   * Valentin Tablan, 03/07/2000
14   * borislav popov 24/03/2002
15   *
16   * $Id: DefaultGazetteer.java,v 1.44 2003/07/16 14:36:59 valyt Exp $
17   */
18  package gate.creole.gazetteer;
19  
20  import java.io.*;
21  import java.util.*;
22  import java.net.*;
23  
24  import gate.util.*;
25  import gate.creole.*;
26  import gate.event.*;
27  import gate.*;
28  
29  /** This component is responsible for doing lists lookup. The implementaion is
30   * based on finite state machines.
31   * The phrases to be recognised should be listed in a set of files, one for
32   * each type of occurences.
33   * The gazeteer is build with the information from a file that contains the set
34   * of lists (which are files as well) and the associated type for each list.
35   * The file defining the set of lists should have the following syntax:
36   * each list definition should be written on its own line and should contain:
37   * <ol>
38   * <li>the file name (required) </li>
39   * <li>the major type (required) </li>
40   * <li>the minor type (optional)</li>
41   * <li>the language(s) (optional) </li>
42   * </ol>
43   * The elements of each definition are separated by &quot;:&quot;.
44   * The following is an example of a valid definition: <br>
45   * <code>personmale.lst:person:male:english</code>
46   * Each list file named in the lists definition file is just a list containing
47   * one entry per line.
48   * When this gazetter will be run over some input text (a Gate document) it
49   * will generate annotations of type Lookup having the attributes specified in
50   * the definition file.
51   */
52  public class DefaultGazetteer extends AbstractGazetteer {
53  
54    /** Debug flag
55     */
56    private static final boolean DEBUG = false;
57  
58    public static final String
59      DEF_GAZ_DOCUMENT_PARAMETER_NAME = "document";
60  
61    public static final String
62      DEF_GAZ_ANNOT_SET_PARAMETER_NAME = "annotationSetName";
63  
64    public static final String
65      DEF_GAZ_LISTS_URL_PARAMETER_NAME = "listsURL";
66  
67    public static final String
68      DEF_GAZ_ENCODING_PARAMETER_NAME = "encoding";
69  
70    public static final String
71      DEF_GAZ_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";
72  
73  
74    /** a map of nodes vs gaz lists */
75    private Map listsByNode;
76  
77    /** Build a gazetter using the default lists from the agte resources
78     * {@see init()}
79     */
80    public DefaultGazetteer(){
81    }
82  
83    /** Does the actual loading and parsing of the lists. This method must be
84     * called before the gazetteer can be used
85     */
86    public Resource init()throws ResourceInstantiationException{
87      fsmStates = new HashSet();
88      initialState = new FSMState(this);
89      if(listsURL == null){
90        throw new ResourceInstantiationException (
91              "No URL provided for gazetteer creation!");
92      }
93      definition = new LinearDefinition();
94      definition.setURL(listsURL);
95      definition.load();
96      int linesCnt = definition.size();
97      listsByNode = definition.loadLists();
98      Iterator inodes = definition.iterator();
99  
100     String line;
101     int nodeIdx = 0;
102     LinearNode node;
103     while (inodes.hasNext()) {
104       node = (LinearNode) inodes.next();
105       fireStatusChanged("Reading " + node.toString());
106       fireProgressChanged(++nodeIdx * 100 / linesCnt);
107       readList(node,true);
108     } // while iline
109     fireProcessFinished();
110     return this;
111   }
112 
113 
114   /** Reads one lists (one file) of phrases
115    *
116    * @param listDesc the line from the definition file
117    * @param add
118    * @add if <b>true</b> will add the phrases found in the list to the ones
119    *     recognised by this gazetter, if <b>false</b> the phrases found in the
120    *     list will be removed from the list of phrases recognised by this
121    *     gazetteer.
122    */
123   void readList(LinearNode node, boolean add) throws ResourceInstantiationException{
124     String listName, majorType, minorType, languages;
125     if ( null == node ) {
126       throw new ResourceInstantiationException(" LinearNode node is null ");
127     }
128 
129     listName = node.getList();
130     majorType = node.getMajorType();
131     minorType = node.getMinorType();
132     languages = node.getLanguage();
133     GazetteerList gazList = (GazetteerList)listsByNode.get(node);
134     if (null == gazList) {
135       throw new ResourceInstantiationException("gazetteer list not found by node");
136     }
137 
138     Iterator iline = gazList.iterator();
139 
140     Lookup lookup = new Lookup(listName,majorType, minorType, languages);
141     lookup.list = node.getList();
142     if ( null != mappingDefinition){
143       MappingNode mnode = mappingDefinition.getNodeByList(lookup.list);
144       if (null!=mnode){
145         lookup.oClass = mnode.getClassID();
146         lookup.ontology = mnode.getOntologyID();
147       }
148     }//if mapping def
149 
150     String line;
151     while(iline.hasNext()){
152       line = iline.next().toString();
153       if(add)addLookup(line, lookup);
154       else removeLookup(line, lookup);
155     }
156   } // void readList(String listDesc)
157 
158   /** Adds one phrase to the list of phrases recognised by this gazetteer
159    *
160    * @param text the phrase to be added
161    * @param lookup the description of the annotation to be added when this
162    *     phrase is recognised
163    */
164 // >>> DAM, was
165 /*
166   public void addLookup(String text, Lookup lookup) {
167     Character currentChar;
168     FSMState currentState = initialState;
169     FSMState nextState;
170     Lookup oldLookup;
171     boolean isSpace;
172 
173     for(int i = 0; i< text.length(); i++) {
174       isSpace = Character.isWhitespace(text.charAt(i));
175       if(isSpace) currentChar = new Character(' ');
176       else currentChar = (caseSensitive.booleanValue()) ?
177                           new Character(text.charAt(i)) :
178                           new Character(Character.toUpperCase(text.charAt(i))) ;
179       nextState = currentState.next(currentChar);
180       if(nextState == null){
181         nextState = new FSMState(this);
182         currentState.put(currentChar, nextState);
183         if(isSpace) nextState.put(new Character(' '),nextState);
184       }
185       currentState = nextState;
186     } //for(int i = 0; i< text.length(); i++)
187 
188     currentState.addLookup(lookup);
189     //Out.println(text + "|" + lookup.majorType + "|" + lookup.minorType);
190 
191   } // addLookup
192 */
193 // >>> DAM: TransArray optimization
194   public void addLookup(String text, Lookup lookup) {
195     char currentChar;
196     FSMState currentState = initialState;
197     FSMState nextState;
198     Lookup oldLookup;
199     boolean isSpace;
200 
201     for(int i = 0; i< text.length(); i++) {
202         currentChar = text.charAt(i);
203         isSpace = Character.isWhitespace(currentChar);
204         if(isSpace) currentChar = ' ';
205         else currentChar = (caseSensitive.booleanValue()) ?
206                           currentChar :
207                           Character.toUpperCase(currentChar) ;
208       nextState = currentState.next(currentChar);
209       if(nextState == null){
210         nextState = new FSMState(this);
211         currentState.put(currentChar, nextState);
212         if(isSpace) nextState.put(' ',nextState);
213       }
214       currentState = nextState;
215     } //for(int i = 0; i< text.length(); i++)
216 
217     currentState.addLookup(lookup);
218     //Out.println(text + "|" + lookup.majorType + "|" + lookup.minorType);
219 
220   } // addLookup
221 // >>> DAM, end
222 
223   /** Removes one phrase to the list of phrases recognised by this gazetteer
224    *
225    * @param text the phrase to be removed
226    * @param lookup the description of the annotation associated to this phrase
227    */
228 // >>> DAM, was
229 /*
230   public void removeLookup(String text, Lookup lookup) {
231     Character currentChar;
232     FSMState currentState = initialState;
233     FSMState nextState;
234     Lookup oldLookup;
235     boolean isSpace;
236 
237     for(int i = 0; i< text.length(); i++) {
238       isSpace = Character.isWhitespace(text.charAt(i));
239       if(isSpace) currentChar = new Character(' ');
240       else currentChar = new Character(text.charAt(i));
241       nextState = currentState.next(currentChar);
242       if(nextState == null) return;//nothing to remove
243       currentState = nextState;
244     } //for(int i = 0; i< text.length(); i++)
245     currentState.removeLookup(lookup);
246   } // removeLookup
247 */
248 // >>> DAM: TransArray optimization
249   public void removeLookup(String text, Lookup lookup) {
250     char currentChar;
251     FSMState currentState = initialState;
252     FSMState nextState;
253     Lookup oldLookup;
254 
255     for(int i = 0; i< text.length(); i++) {
256         currentChar = text.charAt(i);
257         if(Character.isWhitespace(currentChar)) currentChar = ' ';
258         nextState = currentState.next(currentChar);
259         if(nextState == null) return;//nothing to remove
260         currentState = nextState;
261     } //for(int i = 0; i< text.length(); i++)
262     currentState.removeLookup(lookup);
263   } // removeLookup
264 // >>> DAM, end
265 
266   /** Returns a string representation of the deterministic FSM graph using
267    * GML.
268    */
269   public String getFSMgml() {
270     String res = "graph[ \ndirected 1\n";
271     ///String nodes = "", edges = "";
272     StringBuffer nodes = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE),
273                 edges = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE);
274     Iterator fsmStatesIter = fsmStates.iterator();
275     while (fsmStatesIter.hasNext()){
276       FSMState currentState = (FSMState)fsmStatesIter.next();
277       int stateIndex = currentState.getIndex();
278       /*nodes += "node[ id " + stateIndex +
279                " label \"" + stateIndex;
280       */
281       nodes.append("node[ id ");
282       nodes.append(stateIndex);
283       nodes.append(" label \"");
284       nodes.append(stateIndex);
285 
286              if(currentState.isFinal()){
287               ///nodes += ",F\\n" + currentState.getLookupSet();
288               nodes.append(",F\\n");
289               nodes.append(currentState.getLookupSet());
290              }
291              ///nodes +=  "\"  ]\n";
292              nodes.append("\"  ]\n");
293       //edges += currentState.getEdgesGML();
294       edges.append(currentState.getEdgesGML());
295     }
296     res += nodes.toString() + edges.toString() + "]\n";
297     return res;
298   } // getFSMgml
299 
300 
301   /**
302    * Tests whether a character is internal to a word (i.e. if it's a letter or
303    * a combining mark (spacing or not)).
304    * @param ch the character to be tested
305    * @return a boolean value
306    */
307   public static boolean isWordInternal(char ch){
308     return Character.isLetter(ch) ||
309            Character.getType(ch) == Character.COMBINING_SPACING_MARK ||
310            Character.getType(ch) == Character.NON_SPACING_MARK;
311   }
312 
313   /**
314    * This method runs the gazetteer. It assumes that all the needed parameters
315    * are set. If they are not, an exception will be fired.
316    */
317   public void execute() throws ExecutionException{
318     interrupted = false;
319     AnnotationSet annotationSet;
320     //check the input
321     if(document == null) {
322       throw new ExecutionException(
323         "No document to process!"
324       );
325     }
326 
327     if(annotationSetName == null ||
328        annotationSetName.equals("")) annotationSet = document.getAnnotations();
329     else annotationSet = document.getAnnotations(annotationSetName);
330 
331     fireStatusChanged("Doing lookup in " +
332                            document.getName() + "...");
333     String content = document.getContent().toString();
334     int length = content.length();
335 // >>> DAM, was
336 /*
337     Character currentChar;
338 */
339 // >>> DAM: TransArray optimization
340     char currentChar;
341 // >>> DAM, end
342     FSMState currentState = initialState;
343     FSMState nextState;
344     FSMState lastMatchingState = null;
345     int matchedRegionEnd = 0;
346     int matchedRegionStart = 0;
347     int charIdx = 0;
348     int oldCharIdx = 0;
349     FeatureMap fm;
350     Lookup currentLookup;
351 
352 // >>> DAM, was
353 /*
354     while(charIdx < length) {
355       if(Character.isWhitespace(content.charAt(charIdx)))
356         currentChar = new Character(' ');
357       else currentChar = (caseSensitive.booleanValue()) ?
358                          new Character(content.charAt(charIdx)) :
359                          new Character(Character.toUpperCase(
360                                        content.charAt(charIdx)));
361 */
362 // >>> DAM: TransArray optimization
363     while(charIdx < length) {
364       currentChar = content.charAt(charIdx);
365       if(Character.isWhitespace(currentChar)) currentChar = ' ';
366       else currentChar = caseSensitive.booleanValue() ?
367                           currentChar :
368                           Character.toUpperCase(currentChar);
369 // >>> DAM, end
370       nextState = currentState.next(currentChar);
371       if(nextState == null) {
372         //the matching stopped
373 
374         //if we had a successful match then act on it;
375         if(lastMatchingState != null){
376           //let's add the new annotation(s)
377           Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
378 
379           while(lookupIter.hasNext()) {
380             currentLookup = (Lookup)lookupIter.next();
381             fm = Factory.newFeatureMap();
382             fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
383             if (null!= currentLookup.oClass && null!=currentLookup.ontology){
384               fm.put(LOOKUP_CLASS_FEATURE_NAME,currentLookup.oClass);
385               fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME,currentLookup.ontology);
386             }
387             if(null != currentLookup.minorType) {
388               fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
389               if(null != currentLookup.languages)
390                 fm.put("language", currentLookup.languages);
391             }
392             try {
393               annotationSet.add(new Long(matchedRegionStart),
394                               new Long(matchedRegionEnd + 1),
395                               LOOKUP_ANNOTATION_TYPE,
396                               fm);
397             } catch(InvalidOffsetException ioe) {
398               throw new LuckyException(ioe.toString());
399             }
400           }//while(lookupIter.hasNext())
401           lastMatchingState = null;
402         }
403 
404         //reset the FSM
405         charIdx = matchedRegionStart + 1;
406         matchedRegionStart = charIdx;
407         currentState = initialState;
408 
409       } else{//go on with the matching
410         currentState = nextState;
411         //if we have a successful state then store it
412         if(currentState.isFinal() &&
413            (
414             (!wholeWordsOnly.booleanValue())
415              ||
416             ((matchedRegionStart == 0 ||
417              !isWordInternal(content.charAt(matchedRegionStart - 1)))
418              &&
419              (charIdx + 1 >= content.length()   ||
420              !isWordInternal(content.charAt(charIdx + 1)))
421             )
422            )
423           ){
424           matchedRegionEnd = charIdx;
425           lastMatchingState = currentState;
426         }
427         charIdx ++;
428         if(charIdx == content.length()){
429           //we can't go on, use the last matching state and restart matching
430           //from the next char
431           if(lastMatchingState != null){
432             //let's add the new annotation(s)
433             Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
434 
435             while(lookupIter.hasNext()) {
436               currentLookup = (Lookup)lookupIter.next();
437               fm = Factory.newFeatureMap();
438               fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
439               if (null!= currentLookup.oClass && null!=currentLookup.ontology){
440                 fm.put(LOOKUP_CLASS_FEATURE_NAME,currentLookup.oClass);
441                 fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME,currentLookup.ontology);
442               }
443               if(null != currentLookup.minorType) {
444                 fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
445                 if(null != currentLookup.languages)
446                   fm.put("language", currentLookup.languages);
447               }
448               try {
449                 annotationSet.add(new Long(matchedRegionStart),
450                                 new Long(matchedRegionEnd + 1),
451                                 LOOKUP_ANNOTATION_TYPE,
452                                 fm);
453               } catch(InvalidOffsetException ioe) {
454                 throw new LuckyException(ioe.toString());
455               }
456             }//while(lookupIter.hasNext())
457             lastMatchingState = null;
458           }
459 
460           //reset the FSM
461           charIdx = matchedRegionStart + 1;
462           matchedRegionStart = charIdx;
463           currentState = initialState;
464         }
465       }
466       if(charIdx - oldCharIdx > 256) {
467         fireProgressChanged((100 * charIdx )/ length );
468         oldCharIdx = charIdx;
469         if(isInterrupted()) throw new ExecutionInterruptedException(
470             "The execution of the " + getName() +
471             " gazetteer has been abruptly interrupted!");
472       }
473     } // while(charIdx < length)
474 
475     if(lastMatchingState != null) {
476       Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
477       while(lookupIter.hasNext()) {
478         currentLookup = (Lookup)lookupIter.next();
479         fm = Factory.newFeatureMap();
480         fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
481         if (null!= currentLookup.oClass && null!=currentLookup.ontology){
482           fm.put(LOOKUP_CLASS_FEATURE_NAME,currentLookup.oClass);
483           fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME,currentLookup.ontology);
484         }
485 
486         if(null != currentLookup.minorType)
487           fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
488         try{
489           annotationSet.add(new Long(matchedRegionStart),
490                           new Long(matchedRegionEnd + 1),
491                           LOOKUP_ANNOTATION_TYPE,
492                           fm);
493         } catch(InvalidOffsetException ioe) {
494           throw new GateRuntimeException(ioe.toString());
495         }
496       }//while(lookupIter.hasNext())
497     }
498     fireProcessFinished();
499     fireStatusChanged("Lookup complete!");
500   } // execute
501 
502 
503   /** The initial state of the FSM that backs this gazetteer
504    */
505   FSMState initialState;
506 
507   /** A set containing all the states of the FSM backing the gazetteer
508    */
509   Set fsmStates;
510 
511   /**lookup <br>
512    * @param singleItem a single string to be looked up by the gazetteer
513    * @return set of the Lookups associated with the parameter*/
514   public Set lookup(String singleItem) {
515     char currentChar;
516     Set set = new HashSet();
517     FSMState currentState = initialState;
518     FSMState nextState;
519 
520     for(int i = 0; i< singleItem.length(); i++) {
521         currentChar = singleItem.charAt(i);
522         if(Character.isWhitespace(currentChar)) currentChar = ' ';
523         nextState = currentState.next(currentChar);
524         if(nextState == null) {
525           return set;
526         }
527         currentState = nextState;
528     } //for(int i = 0; i< text.length(); i++)
529     set = currentState.getLookupSet();
530     return set;
531   }
532 
533   public boolean remove(String singleItem) {
534     char currentChar;
535     FSMState currentState = initialState;
536     FSMState nextState;
537     Lookup oldLookup;
538 
539     for(int i = 0; i< singleItem.length(); i++) {
540         currentChar = singleItem.charAt(i);
541         if(Character.isWhitespace(currentChar)) currentChar = ' ';
542         nextState = currentState.next(currentChar);
543         if(nextState == null) {
544           return false;
545         }//nothing to remove
546         currentState = nextState;
547     } //for(int i = 0; i< text.length(); i++)
548     currentState.lookupSet = new HashSet();
549     return true;
550   }
551 
552   public boolean add(String singleItem, Lookup lookup) {
553     addLookup(singleItem,lookup);
554     return true;
555   }
556 
557 
558 } // DefaultGazetteer
559 
560 // >>> DAM: TransArray optimization, new charMap implementation
/**
 * A minimal forward iterator over primitive <code>char</code> values, so that
 * keys can be traversed without being boxed into <code>Character</code>
 * objects. The exact traversal contract is left to the implementation.
 */
interface Iter
{
    /** @return whether a further call to {@link #next()} can be made */
    boolean hasNext();

    /** @return the next char of the iteration */
    char next();
} // iter class
566 
/**
 * A compact map from <code>char</code> keys to objects. Keys and values are
 * held in two parallel arrays sorted by key, so that the value for a key is
 * retrieved with a binary search. The arrays are always exactly as long as
 * the number of entries.
 */
class charMap
{
    /** the keys, in ascending order; <code>null</code> while the map is empty */
    char[] itemsKeys = null;

    /** the values; <code>itemsObjs[i]</code> belongs to <code>itemsKeys[i]</code> */
    Object[] itemsObjs = null;

    /**
     * Grows both arrays by one, leaving an empty element at position 'index'
     * and shifting the elements from that position onwards one slot up.
     *
     * @param index the position of the slot to be opened
     */
    void resize(int index)
    {
        int oldsz = itemsKeys.length;
        char[] tempKeys = new char[oldsz + 1];
        Object[] tempObjs = new Object[oldsz + 1];

        // elements before the gap keep their position
        System.arraycopy(itemsKeys, 0, tempKeys, 0, index);
        System.arraycopy(itemsObjs, 0, tempObjs, 0, index);
        // elements from the gap onwards move one slot up
        System.arraycopy(itemsKeys, index, tempKeys, index + 1, oldsz - index);
        System.arraycopy(itemsObjs, index, tempObjs, index + 1, oldsz - index);

        itemsKeys = tempKeys;
        itemsObjs = tempObjs;
    } // resize

    /**
     * Gets the object from the map using the char key.
     *
     * @param key the key to look for
     * @return the object stored for the key, or <code>null</code> if the key
     *     is not present
     */
    Object get(char key)
    {
        if (itemsKeys == null) return null;
        int index = Arrays.binarySearch(itemsKeys, key);
        if (index<0)
            return null;
        return itemsObjs[index];
    }

    /**
     * Puts the object into the char map using the char as the key. If the key
     * is already present the map is left unchanged: the existing value is
     * kept and returned.
     *
     * @param key the key
     * @param value the object to associate with the key
     * @return the object associated with the key after the call
     */
    Object put(char key, Object value)
    {
        if (itemsKeys == null)
        {
            itemsKeys = new char[1];
            itemsKeys[0] = key;
            itemsObjs = new Object[1];
            itemsObjs[0] = value;
            return value;
        }// if first time
        int index = Arrays.binarySearch(itemsKeys, key);
        if (index<0)
        {
            // a negative result encodes the insertion point as its complement
            index = ~index;
            resize(index);
            itemsKeys[index] = key;
            itemsObjs[index] = value;
        }
        return itemsObjs[index];
    } // put

} // class charMap
649 // >>> DAM, end, new charMap instead MAP for transition function in the FSMState