1   /*
2    *  OrthoMatcher.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 24/August/2001
12   *
13   *  $Id: OrthoMatcher.java,v 1.45 2002/10/30 14:18:46 valyt Exp $
14   */
15  
16  
17  package gate.creole.orthomatcher;
18  
19  import gate.*;
20  import gate.util.*;
21  import gate.creole.*;
22  import gate.corpora.*;
23  import gate.annotation.*;
24  import java.util.*;
25  import java.io.*;
26  import java.net.*;
27  import gnu.regexp.*;
28  
29  public class OrthoMatcher extends AbstractLanguageAnalyser
30                            implements ANNIEConstants{
31  
32    public static final String
33      OM_DOCUMENT_PARAMETER_NAME = "document";
34  
35    public static final String
36      OM_ANN_SET_PARAMETER_NAME = "annotationSetName";
37  
38    public static final String
39      OM_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";
40  
41    public static final String
42      OM_ANN_TYPES_PARAMETER_NAME = "annotationTypes";
43  
44    public static final String
45      OM_ORG_TYPE_PARAMETER_NAME = "organizationType";
46  
47    public static final String
48      OM_PERSON_TYPE_PARAMETER_NAME = "personType";
49  
50    public static final String
51      OM_EXT_LISTS_PARAMETER_NAME = "extLists";
52  
53    protected static final String CDGLISTNAME = "cdg";
54    protected static final String ALIASLISTNAME = "alias";
55    protected static final String ARTLISTNAME = "def_art";
56    protected static final String PREPLISTNAME = "prepos";
57    protected static final String CONNECTORLISTNAME = "connector";
58    protected static final String SPURLISTNAME = "spur_match";
59  
60    protected static final String PUNCTUATION_VALUE = "punctuation";
61    protected static final String THE_VALUE = "The";
62  
63  
64    /**the name of the annotation set*/
65    protected String annotationSetName;
66  
67    /** the types of the annotation */
68    protected List annotationTypes = new ArrayList(10);
69  
70    /** the organization type*/
71    protected String organizationType = ORGANIZATION_ANNOTATION_TYPE;
72  
73    /** the person type*/
74    protected String personType = PERSON_ANNOTATION_TYPE;
75  
76    protected String unknownType = "Unknown";
77  
78    /** internal or external list */
79    protected boolean extLists = true;
80  
81    /** matching unknowns or not*/
82    protected boolean matchingUnknowns = true;
83  
84    /** This is an internal variable to indicate whether
85     *  we matched using a rule that requires that
86     *  the newly matched annotation matches all the others
87     *  This is needed, because organizations can share
88     *  first/last tokens like News and be different
89     */
90    private   boolean allMatchingNeeded = false;
91  
92    //** Orthomatching is not case-sensitive by default*/
93    protected boolean caseSensitive = false;
94  
95    protected FeatureMap queryFM = Factory.newFeatureMap();
96  
97  //  protected ExecutionException executionException;
98  
99    // name lookup tables (used for namematch)
100   //gave them bigger default size, coz rehash is expensive
101   protected HashMap alias = new HashMap(100);
102   protected HashSet cdg = new HashSet(50);
103   protected HashMap spur_match = new HashMap(100);
104   protected HashMap def_art = new HashMap(20);
105   protected HashMap connector = new HashMap(20);
106   protected HashMap prepos = new HashMap(30);
107 
108 
109   protected AnnotationSet nameAllAnnots = null;
110   protected HashMap processedAnnots = new HashMap(150);
111   protected HashMap annots2Remove = new HashMap(75);
112   protected List matchesDocFeature = new ArrayList();
113   //maps annotation ids to array lists of tokens
114   protected HashMap tokensMap = new HashMap(150);
115 
116   protected Annotation shortAnnot, longAnnot;
117 
118   protected ArrayList tokensLongAnnot, tokensShortAnnot;
119 
120   /** a feature map to be used when retrieving annotations
121    *  declared here so can be reused for efficiency
122    *  clear() before each use
123    */
124   protected FeatureMap tempMap = Factory.newFeatureMap();
125 
126   /** the size of the buffer */
127   private final static int BUFF_SIZE = 65000;
128 
129   /**
130    * URL to the file containing the definition for this orthomatcher
131    */
132   private java.net.URL definitionFileURL;
133 
134   /** The encoding used for the definition file and associated lists.*/
135   private String encoding;
136 
137   /** @link dependency */
138   /*#OrthoMatcher lnkOrthoMatcher;*/
139 
140   public OrthoMatcher () {
141     annotationTypes.add(organizationType);
142     annotationTypes.add(personType);
143     annotationTypes.add("Location");
144     annotationTypes.add("Date");
145   }
146 
147   /** Initialise this resource, and return it. */
148   public Resource init() throws ResourceInstantiationException {
149     //initialise the list of annotations which we will match
150     if(definitionFileURL == null){
151       throw new ResourceInstantiationException(
152                 "No URL provided for the definition file!");
153     }
154 
155     //at this point we have the definition file
156     try{
157       BufferedReader reader = new BufferedReader(
158                       new InputStreamReader(definitionFileURL.openStream(),
159                                             encoding));
160       String lineRead = null;
161       while ((lineRead = reader.readLine()) != null){
162         int index = lineRead.indexOf(":");
163         if (index != -1){
164           String nameFile = lineRead.substring(0,index);
165           String nameList = lineRead.substring(index+1,lineRead.length());
166           createAnnotList(nameFile,nameList);
167         }// if
168       }//while
169       reader.close();
170     }catch(IOException ioe){
171       throw new ResourceInstantiationException(ioe);
172     }
173 
174     return this;
175   } // init()
176 
177   /**  Run the resource. It doesn't make sense not to override
178     *  this in subclasses so the default implementation signals an
179     *  exception.
180     */
181   public void execute() throws ExecutionException{
182 
183     //check the input
184     if(document == null) {
185       throw new ExecutionException(
186         "No document for namematch!"
187       );
188     }
189 
190     // get the annotations from document
191     if ((annotationSetName == null)|| (annotationSetName.equals("")))
192       nameAllAnnots = document.getAnnotations();
193     else
194       nameAllAnnots = document.getAnnotations(annotationSetName);
195 
196     //if none found, print warning and exit
197     if ((nameAllAnnots == null) || nameAllAnnots.isEmpty()) {
198       Out.prln("OrthoMatcher Warning: No annotations found for processing");
199       return;
200     }
201 
202     //check if we've been run on this document before
203     //and clean the doc if needed
204     docCleanup();
205     Map matchesMap = (Map)document.getFeatures().
206                      get(DOCUMENT_COREF_FEATURE_NAME);
207 
208     // creates the cdg list from the document
209     //no need to create otherwise, coz already done in init()
210     if (!extLists)
211       buildTables(nameAllAnnots);
212 
213     //first match all name annotations
214     matchNameAnnotations();
215 
216     //then match the unknown ones to all name ones
217     if (matchingUnknowns)
218       matchUnknown();
219 
220     // set the matches of the document
221 //    determineMatchesDocument();
222     if (! matchesDocFeature.isEmpty()) {
223       if(matchesMap == null){
224         matchesMap = new HashMap();
225       }
226       matchesMap.put(nameAllAnnots.getName(), matchesDocFeature);
227       //we need to put it even if it was already present in order to triger
228       //the update events
229       document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, matchesMap);
230 
231       //cannot do clear() as this has already been put on the document
232       //so I need a new one for the next run of matcher
233       matchesDocFeature = new ArrayList();
234     }
235 
236 //    Out.prln("Processed strings" + processedAnnots.values());
237     //clean-up the internal data structures for next run
238     nameAllAnnots = null;
239     processedAnnots.clear();
240     annots2Remove.clear();
241     tokensMap.clear();
242     matchesDocFeature = new ArrayList();
243     longAnnot = null;
244     shortAnnot = null;
245     tokensLongAnnot = null;
246     tokensShortAnnot = null;
247 
248   } // run()
249 
250   protected void matchNameAnnotations() throws ExecutionException{
251     // go through all the annotation types
252     Iterator iterAnnotationTypes = annotationTypes.iterator();
253     while (iterAnnotationTypes.hasNext()) {
254       String annotationType = (String)iterAnnotationTypes.next();
255 
256       AnnotationSet nameAnnots = nameAllAnnots.get(annotationType);
257 
258       // continue if no such annotations exist
259       if ((nameAnnots == null) || nameAnnots.isEmpty())
260         continue;
261 
262       Iterator iterNames = nameAnnots.iterator();
263       while (iterNames.hasNext()) {
264         Annotation nameAnnot = (Annotation) iterNames.next();
265         Integer id = nameAnnot.getId();
266 
267         // get string and value
268         String annotString = null;
269         try {
270             annotString = document.getContent().getContent(
271             nameAnnot.getStartNode().getOffset(),
272             nameAnnot.getEndNode().getOffset()
273             ).toString();
274           // now do the reg. exp. substitutions
275           annotString = regularExpressions(annotString," ", "\\s+");
276 
277         } catch (InvalidOffsetException ioe) {
278             throw new ExecutionException
279                                    ("Invalid offset of the annotation");
280         }
281         //convert to lower case if we are not doing a case sensitive match
282         if (!caseSensitive)
283           annotString = annotString.toLowerCase();
284 
285         //get the tokens
286         List tokens = new ArrayList((Set)
287                         nameAllAnnots.get(TOKEN_ANNOTATION_TYPE,
288                           nameAnnot.getStartNode().getOffset(),
289                           nameAnnot.getEndNode().getOffset()
290                         ));
291         //if no tokens to match, do nothing
292         if (tokens.isEmpty())
293           continue;
294         Collections.sort(tokens, new gate.util.OffsetComparator());
295         //check if these actually do not end after the name
296         //needed coz new tokeniser conflates
297         //strings with dashes. So British Gas-style is two tokens
298         //instead of three. So cannot match properly British Gas
299 //        tokens = checkTokens(tokens);
300         tokensMap.put(nameAnnot.getId(), tokens);
301 
302 //        Out.prln("Matching annot " + nameAnnot + ": string " + annotString);
303 
304         //first check whether we have not matched such a string already
305         //if so, just consider it matched, don't bother calling the rules
306         if (processedAnnots.containsValue(annotString)) {
307 //          Out.prln("Contained string found " + annotString);
308           updateMatches(nameAnnot, annotString);
309           processedAnnots.put(nameAnnot.getId(), annotString);
310           continue;
311         } else if (processedAnnots.isEmpty()) {
312           processedAnnots.put(nameAnnot.getId(), annotString);
313           continue;
314         }
315 
316         //if a person, then remove their title before matching
317         if (nameAnnot.getType().equals(personType))
318           annotString = containTitle(annotString, nameAnnot);
319         else if (nameAnnot.getType().equals(organizationType))
320           annotString = stripCDG(annotString, nameAnnot);
321 
322         if(null == annotString || "".equals(annotString))
323           continue;
324 
325         //otherwise try matching with previous annotations
326         matchWithPrevious(nameAnnot, annotString);
327 
328 //        Out.prln("Putting in previous " + nameAnnot + ": string " + annotString);
329         //finally add the current annotations to the processed map
330         processedAnnots.put(nameAnnot.getId(), annotString);
331       }//while through name annotations
332 
333     }//while through annotation types
334 
335   }
336 
337   protected void matchUnknown() throws ExecutionException {
338     //get all Unknown annotations
339     AnnotationSet unknownAnnots = nameAllAnnots.get(unknownType);
340 
341     if ((unknownAnnots == null) || unknownAnnots.isEmpty())
342       return;
343 
344     Iterator iter = unknownAnnots.iterator();
345     //loop through the unknown annots
346     while (iter.hasNext()) {
347       Annotation unknown = (Annotation) iter.next();
348 
349       // get string and value
350       String unknownString = null;
351       try {
352           unknownString = document.getContent().getContent(
353             unknown.getStartNode().getOffset(),
354             unknown.getEndNode().getOffset()
355             ).toString();
356         // now do the reg. exp. substitutions
357         unknownString = regularExpressions(unknownString," ", "\\s+");
358       } catch (InvalidOffsetException ioe) {
359           throw new ExecutionException
360                                  ("Invalid offset of the annotation");
361       }
362       //convert to lower case if we are not doing a case sensitive match
363       if (!caseSensitive)
364         unknownString = unknownString.toLowerCase();
365 
366       //get the tokens
367       List tokens = new ArrayList((Set)
368                       nameAllAnnots.get(TOKEN_ANNOTATION_TYPE,
369                         unknown.getStartNode().getOffset(),
370                         unknown.getEndNode().getOffset()
371                       ));
372       if (tokens.isEmpty())
373         continue;
374       Collections.sort(tokens, new gate.util.OffsetComparator());
375       tokensMap.put(unknown.getId(), tokens);
376 
377 
378       //first check whether we have not matched such a string already
379       //if so, just consider it matched, don't bother calling the rules
380       if (processedAnnots.containsValue(unknownString)) {
381         Annotation matchedAnnot = updateMatches(unknown, unknownString);
382 //        Out.prln("Matched " + unknown + "with string " + unknownString);
383 //        Out.prln("That's same as " + matchedAnnot);
384         if (matchedAnnot.getType().equals(unknownType)) {
385           annots2Remove.put(unknown.getId(),
386                             annots2Remove.get(matchedAnnot.getId()));
387         }
388         else
389           annots2Remove.put(unknown.getId(), matchedAnnot.getType());
390         processedAnnots.put(unknown.getId(), unknownString);
391         unknown.getFeatures().put("NMRule", unknownType);
392         continue;
393       }
394 
395       //check if we should do sub-string matching in case it's hyphenated
396       //for example US-led
397       if (tokens.size() == 1
398           && "hyphen".equals(unknown.getFeatures().get(TOKEN_KIND_FEATURE_NAME))) {
399         if (matchHyphenatedUnknowns(unknown, unknownString, iter))
400           continue;
401       }//if
402 
403       matchWithPrevious(unknown, unknownString);
404 
405     } //while though unknowns
406 
407     if (! annots2Remove.isEmpty()) {
408       Iterator unknownIter = annots2Remove.keySet().iterator();
409       while (unknownIter.hasNext()) {
410         Integer unknId = (Integer) unknownIter.next();
411         Annotation unknown = nameAllAnnots.get(unknId);
412         Integer newID = nameAllAnnots.add(
413           unknown.getStartNode(),
414           unknown.getEndNode(),
415           (String) annots2Remove.get(unknId),
416           unknown.getFeatures()
417         );
418         nameAllAnnots.remove(unknown);
419 
420         //change the id in the matches list
421         List mList = (List)unknown.getFeatures().
422                      get(ANNOTATION_COREF_FEATURE_NAME);
423         mList.remove(unknId);
424         mList.add(newID);
425       }//while
426     }//if
427   }
428 
429   private boolean matchHyphenatedUnknowns(Annotation unknown, String unknownString,
430                                        Iterator iter){
431     boolean matched = false;
432 
433     //only take the substring before the hyphen
434     int stringEnd = unknownString.indexOf("-");
435     unknownString = unknownString.substring(0, stringEnd);
436     //check if we've already matched this string
437     //because only exact match of the substring are considered
438     if (processedAnnots.containsValue(unknownString)) {
439       matched = true;
440       Annotation matchedAnnot = updateMatches(unknown, unknownString);
441       //only do the matching if not a person, because we do not match
442       //those on sub-strings
443       iter.remove();
444       String newType;
445       if (matchedAnnot.getType().equals(unknownType))
446         newType = (String)annots2Remove.get(matchedAnnot.getId());
447       else
448         newType = matchedAnnot.getType();
449 
450       Integer newID = new Integer(-1);
451       try {
452         newID = nameAllAnnots.add(
453           unknown.getStartNode().getOffset(),
454           new Long(unknown.getStartNode().getOffset().longValue()
455                   + stringEnd),
456           newType,
457           unknown.getFeatures()
458         );
459       } catch (InvalidOffsetException ex) {
460         throw new GateRuntimeException(ex.getMessage());
461       }
462       nameAllAnnots.remove(unknown);
463 
464       //change the id in the matches list
465       List mList = (List)unknown.getFeatures().
466                    get(ANNOTATION_COREF_FEATURE_NAME);
467       mList.remove(unknown.getId());
468       mList.add(newID);
469 
470     }
471     return matched;
472   }
473 
474   protected void matchWithPrevious(Annotation nameAnnot, String annotString) {
475     boolean matchedUnknown = false;
476 
477     Iterator prevIter = processedAnnots.keySet().iterator();
478     while (prevIter.hasNext()) {
479       Integer prevId = (Integer) prevIter.next();
480       Annotation prevAnnot = nameAllAnnots.get(prevId);
481 
482       //check if the two are from the same type or the new one is unknown
483       if (prevAnnot == null || (! prevAnnot.getType().equals(nameAnnot.getType())
484           && ! nameAnnot.getType().equals(unknownType))
485          )
486         continue;
487       //do not compare two unknown annotations either
488       //they are only matched to those of known types
489       if (  nameAnnot.getType().equals(unknownType)
490             && prevAnnot.getType().equals(unknownType))
491       continue;
492 
493       //check if we have already matched this annotation to the new one
494       if (matchedAlready(nameAnnot, prevAnnot) )
495         continue;
496 
497       //now changed to a rule, here we just match by gender
498       if (prevAnnot.getType().equals(personType)) {
499         String prevGender =
500           (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
501         String nameGender =
502           (String) nameAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
503         if (   prevGender != null
504             && nameGender != null
505             && ( (nameGender.equalsIgnoreCase("female")
506                   &&
507                   prevGender.equalsIgnoreCase("male")
508                   )
509                ||
510                   (prevGender.equalsIgnoreCase("female")
511                    && nameGender.equalsIgnoreCase("male")
512                   )
513                 )
514             ) //if condition
515           continue; //we don't have a match if the two genders are different
516 
517       }//if
518 
519       //if the two annotations match
520       if (matchAnnotations(nameAnnot, annotString,  prevAnnot)) {
521 //        Out.prln("Matched " + shortName + "and " + longName);
522         updateMatches(nameAnnot, prevAnnot);
523         //if unknown annotation, we need to change to the new type
524         if (nameAnnot.getType().equals(unknownType)) {
525           matchedUnknown = true;
526           if (prevAnnot.getType().equals(unknownType))
527             annots2Remove.put(nameAnnot.getId(),
528                               annots2Remove.get(prevAnnot.getId()));
529           else
530             annots2Remove.put(nameAnnot.getId(), prevAnnot.getType());
531          //also put an attribute to indicate that
532           nameAnnot.getFeatures().put("NMRule", unknownType);
533         }//if unknown
534         break; //no need to match further
535       }//if annotations matched
536 
537     }//while through previous annotations
538 
539     if (matchedUnknown)
540       processedAnnots.put(nameAnnot.getId(), annotString);
541 
542 
543   }//matchWithPrevious
544 
545   protected boolean matchAnnotations(Annotation newAnnot, String annotString,
546                                      Annotation prevAnnot) {
547     //do not match two annotations that overlap
548     if (newAnnot.overlaps(prevAnnot))
549       return false;
550 
551     // find which annotation string of the two is longer
552     //  this is useful for some of the matching rules
553     String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId());
554 
555     String longName = prevAnnotString;
556     String shortName = annotString;
557     longAnnot = prevAnnot;
558     shortAnnot = newAnnot;
559 
560     if (shortName.length()>longName.length()) {
561       String temp = longName;
562       longName = shortName;
563       shortName = temp;
564       Annotation tempAnn = longAnnot;
565       longAnnot = shortAnnot;
566       shortAnnot = tempAnn;
567     }//if
568 
569     tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId());
570     tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId());
571 
572     List matchesList = (List) prevAnnot.getFeatures().
573                               get(ANNOTATION_COREF_FEATURE_NAME);
574     if (matchesList == null || matchesList.isEmpty())
575       return apply_rules_namematch(prevAnnot.getType(), shortName,longName);
576 
577     //if these two match, then let's see if all the other matching one will too
578     //that's needed, because sometimes names can share a token (e.g., first or
579     //last but not be the same
580     if (apply_rules_namematch(prevAnnot.getType(), shortName,longName)) {
581       /**
582        * Check whether we need to ensure that there is a match with the rest
583        * of the matching annotations, because the rule requires that
584        * transtivity is not assummed.
585        */
586       if (allMatchingNeeded) {
587         allMatchingNeeded = false;
588 
589         List toMatchList = new ArrayList(matchesList);
590   //      if (newAnnot.getType().equals(unknownType))
591   //        Out.prln("Matching new " + annotString + " with annots " + toMatchList);
592         toMatchList.remove(prevAnnot.getId());
593 
594         return matchOtherAnnots(toMatchList, newAnnot, annotString);
595       } else
596         return true;
597     }
598     return false;
599   }
600 
601   /** This method checkes whether the new annotation matches
602    *  all annotations given in the toMatchList (it contains ids)
603    *  The idea is that the new annotation needs to match all those,
604    *  because assuming transitivity does not always work, when
605    *  two different entities share a common token: e.g., BT Cellnet
606    *  and BT and British Telecom.
607   */
608   protected boolean matchOtherAnnots( List toMatchList, Annotation newAnnot,
609                                       String annotString) {
610 
611     //if the list is empty, then we're matching all right :-)
612     if (toMatchList.isEmpty())
613       return true;
614 
615     boolean matchedAll = true;
616     int i = 0;
617 
618     while (matchedAll && i < toMatchList.size()) {
619       Annotation prevAnnot = nameAllAnnots.get((Integer) toMatchList.get(i));
620 
621       // find which annotation string of the two is longer
622       //  this is useful for some of the matching rules
623       String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId());
624       if (prevAnnotString == null)
625         try {
626           prevAnnotString = document.getContent().getContent(
627             prevAnnot.getStartNode().getOffset(),
628             prevAnnot.getEndNode().getOffset()
629             ).toString();
630         } catch (InvalidOffsetException ioe) {
631           return false;
632         }//try
633 
634 
635       String longName = prevAnnotString;
636       String shortName = annotString;
637       longAnnot = prevAnnot;
638       shortAnnot = newAnnot;
639 
640       if (shortName.length()>=longName.length()) {
641         String temp = longName;
642         longName = shortName;
643         shortName = temp;
644         Annotation tempAnn = longAnnot;
645         longAnnot = shortAnnot;
646         shortAnnot = tempAnn;
647       }//if
648 
649       tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId());
650       tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId());
651 
652       matchedAll = apply_rules_namematch(prevAnnot.getType(), shortName,longName);
653 //      if (newAnnot.getType().equals(unknownType))
654 //        Out.prln("Loop: " + shortName + " and " + longName + ": result: " + matchedAll);
655 
656       i++;
657     }//while
658     return matchedAll;
659   }
660 
661 
662   protected boolean matchedAlready(Annotation annot1, Annotation annot2) {
663     //the two annotations are already matched if the matches list of the first
664     //contains the id of the second
665     List matchesList = (List) annot1.getFeatures().
666                        get(ANNOTATION_COREF_FEATURE_NAME);
667     if ((matchesList == null) || matchesList.isEmpty())
668       return false;
669     else if (matchesList.contains(annot2.getId()))
670       return true;
671     return false;
672   }
673 
674   protected Annotation updateMatches(Annotation newAnnot, String annotString) {
675     Annotation matchedAnnot = null;
676     Integer id;
677 
678     //first find a processed annotation with the same string
679     Iterator iter = processedAnnots.keySet().iterator();
680     while (iter.hasNext()) {
681       id = (Integer) iter.next();
682       String oldString = (String) processedAnnots.get(id);
683       if (annotString.equals(oldString)) {
684         matchedAnnot = nameAllAnnots.get(id);
685         break;
686       }//if
687     }//while
688 
689     if (matchedAnnot == null) return null;
690     //if the two matching annotations are of different type which is not
691     //unknown, do not match them
692     if (! matchedAnnot.getType().equals(newAnnot.getType())
693         && !newAnnot.getType().equals(unknownType) )
694       return matchedAnnot;
695 
696     List matchesList = (List) matchedAnnot.getFeatures().
697                        get(ANNOTATION_COREF_FEATURE_NAME);
698     if ((matchesList == null) || matchesList.isEmpty()) {
699       //no previous matches, so need to add
700       if (matchesList == null) {
701         matchesList = new ArrayList();
702         matchedAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME,
703                                        matchesList);
704         matchesDocFeature.add(matchesList);
705       }//if
706       matchesList.add(matchedAnnot.getId());
707       matchesList.add(newAnnot.getId());
708     } else {
709       //just add the new annotation
710       matchesList.add(newAnnot.getId());
711     }//if
712     //add the matches list to the new annotation
713     newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
714     return matchedAnnot;
715   }
716 
717   protected void updateMatches(Annotation newAnnot, Annotation prevAnnot) {
718 
719     List matchesList = (List) prevAnnot.getFeatures().
720                               get(ANNOTATION_COREF_FEATURE_NAME);
721     if ((matchesList == null) || matchesList.isEmpty()) {
722       //no previous matches, so need to add
723       if (matchesList == null) {
724         matchesList = new ArrayList();
725         prevAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
726         matchesDocFeature.add(matchesList);
727       }//if
728       matchesList.add(prevAnnot.getId());
729       matchesList.add(newAnnot.getId());
730     } else {
731       //just add the new annotation
732       matchesList.add(newAnnot.getId());
733     }//if
734     //add the matches list to the new annotation
735     newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
736     //propagate the gender if two persons are matched
737     if (prevAnnot.getType().equals(personType)) {
738       String prevGender =
739         (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
740       String newGender =
741         (String) newAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
742       boolean unknownPrevGender = isUnknownGender(prevGender);
743       boolean unknownNewGender = isUnknownGender(newGender);
744       if (unknownPrevGender && !unknownNewGender)
745         prevAnnot.getFeatures().put(PERSON_GENDER_FEATURE_NAME, newGender);
746       else if (unknownNewGender && !unknownPrevGender)
747         newAnnot.getFeatures().put(PERSON_GENDER_FEATURE_NAME, prevGender);
748     }//if
749   }
750 
751 
752   protected void docCleanup() {
753     Object matchesValue = document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME);
754     if (matchesValue != null && (matchesValue instanceof Map))
755       ((Map)matchesValue).remove(nameAllAnnots.getName());
756     else if (matchesValue != null) {
757       document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, new HashMap());
758     }
759 
760     //get all annotations that have a matches feature
761     HashSet fNames = new HashSet();
762     fNames.add(ANNOTATION_COREF_FEATURE_NAME);
763     AnnotationSet annots =
764                   nameAllAnnots.get(null, fNames);
765 
766 //    Out.prln("Annots to cleanup" + annots);
767 
768     if (annots == null || annots.isEmpty())
769       return;
770 
771     Iterator iter = annots.iterator();
772     while (iter.hasNext()) {
773       while (iter.hasNext())
774         ((Annotation) iter.next()).getFeatures().
775                                    remove(ANNOTATION_COREF_FEATURE_NAME);
776     } //while
777   }//cleanup
778 
779   /** return a person name without title */
780   protected String containTitle (String annotString, Annotation annot)
781                       throws ExecutionException {
782     // get the offsets
783     Long startAnnot = annot.getStartNode().getOffset();
784     Long endAnnot = annot.getEndNode().getOffset();
785 
786     // determine "Lookup" annotation set
787     queryFM.clear();
788     queryFM.put("majorType", "title");
789     AnnotationSet as1 = nameAllAnnots.get(startAnnot,endAnnot);
790     if (as1 == null || as1.isEmpty())
791       return annotString;
792     AnnotationSet as =
793       as1.get("Lookup", queryFM);
794     if (as !=null && ! as.isEmpty()) {
795       List titles = new ArrayList((Set)as);
796       Collections.sort(titles, new gate.util.OffsetComparator());
797 
798       Iterator iter = titles.iterator();
799       while (iter.hasNext()) {
800         Annotation titleAnn = (Annotation)(iter.next());
801 
802         //we've not found a title at the start offset,
803         //there's no point in looking further
804         //coz titles come first
805         if (titleAnn.getStartNode().getOffset().compareTo(startAnnot) != 0)
806           return annotString;
807 
808         try {
809           // the title from the current annotation
810           String annotTitle =
811             document.getContent().getContent(
812               titleAnn.getStartNode().getOffset(),
813               titleAnn.getEndNode().getOffset()
814             ).toString();
815 
816           // eliminate the title from annotation string and return the result
817           if (annotTitle.length()<annotString.length()) {
818             //remove from the array of tokens, so then we can compare properly
819             //the remaining tokens
820 //            Out.prln("Removing title from: " + annot + " with string " + annotString);
821 //            Out.prln("Tokens are" + tokensMap.get(annot.getId()));
822 //            Out.prln("Title is" + annotTitle);
823             ((ArrayList) tokensMap.get(annot.getId())).remove(0);
824             return annotString.substring(
825                                  annotTitle.length()+1,annotString.length());
826           }
827         } catch (InvalidOffsetException ioe) {
828             throw new ExecutionException
829                                ("Invalid offset of the annotation");
830         }//try
831       }// while
832     }//if
833     return annotString;
834 
835   }
836 
837   /** return an organization  without a designator and starting The*/
838   protected String stripCDG (String annotString, Annotation annot){
839 
840     ArrayList tokens = (ArrayList) tokensMap.get(annot.getId());
841 
842     //strip starting The first
843     if ( ((String) ((Annotation) tokens.get(0)
844           ).getFeatures().get(TOKEN_STRING_FEATURE_NAME))
845           .equalsIgnoreCase(THE_VALUE))
846       tokens.remove(0);
847 
848     //no need to check for cdg if there is only 1 token or less
849     if (tokens.size()>1 && cdg.contains(((Annotation) tokens.get(tokens.size()-1)
850           ).getFeatures().get(TOKEN_STRING_FEATURE_NAME)) )
851       tokens.remove(tokens.size()-1);
852 
853     StringBuffer newString = new StringBuffer(50);
854     for (int i = 0; i < tokens.size(); i++){
855       newString.append((String) ((Annotation) tokens.get(i)
856           ).getFeatures().get(TOKEN_STRING_FEATURE_NAME) );
857       if (i != tokens.size()-1)
858         newString.append(" ");
859     }
860 //    Out.prln("Strip CDG returned: " + newString + "for string " + annotString);
861 
862     if (caseSensitive)
863       return newString.toString();
864 
865     return newString.toString().toLowerCase();
866   }
867 
868 /*
869   public void check() throws ExecutionException {
870     if (executionException != null) {
871       ExecutionException e = executionException;
872       executionException = null;
873       throw e;
874     }
875   } // check()
876 */
877 
  /** Reads the list definition file and, for every "file:list" line in it,
    *  creates the corresponding lookup table. Currently disabled: the
    *  implementation below is commented out.
    */
881 //  protected void createLists() throws IOException {
882 //
883 //    InputStream inputStream = Files.getGateResourceAsStream(
884 //                                              "creole/namematcher/listsNM.def");
885 //    InputStreamReader inputStreamReader = new InputStreamReader (
886 //                                                    inputStream);
887 //    BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
888 //
889 //    String lineRead = null;
890 //    while ((lineRead = bufferedReader.readLine()) != null){
891 //      int index = lineRead.indexOf(":");
892 //      if (index != -1){
893 //        String nameFile = lineRead.substring(0,index);
894 //        String nameList = lineRead.substring(index+1,lineRead.length());
895 //        createAnnotList(nameFile,nameList);
896 //      }// if
897 //    }//while
898 //    bufferedReader.close();
899 //    inputStreamReader.close();
900 //    inputStream.close();
901 //  }// createLists()
902 
903   /** creates the lookup tables */
904   protected void createAnnotList(String nameFile,String nameList)
905                                                           throws IOException{
906 
907 //    InputStream inputStream = Files.getGateResourceAsStream(
908 //                                              "creole/namematcher/"+nameFile);
909 //    InputStreamReader inputStreamReader = new InputStreamReader (
910 //                                                    inputStream);
911 //    BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
912 
913     //create the relative URL
914     URL fileURL = new URL(definitionFileURL, nameFile);
915     BufferedReader bufferedReader =
916       new BufferedReader(new InputStreamReader(fileURL.openStream(),
917                          encoding));
918 
919     String lineRead = null;
920     while ((lineRead = bufferedReader.readLine()) != null){
921       if (nameList.compareTo(CDGLISTNAME)==0){
922         if (caseSensitive)
923           cdg.add(lineRead);
924         else
925           cdg.add(lineRead.toLowerCase());
926       }// if
927       else {
928         int index = lineRead.indexOf("ý");
929         if (index != -1){
930           String  expr = lineRead.substring(0,index);
931           //if not case-sensitive, we need to downcase all strings
932           if (!caseSensitive)
933             expr = expr.toLowerCase();
934           String code = lineRead.substring(index+1,lineRead.length());
935           if (nameList.equals(ALIASLISTNAME))
936                             alias.put(expr, code);
937           else
938           if (nameList.equals(ARTLISTNAME))
939                             def_art.put(expr, code);
940           else
941           if (nameList.equals(PREPLISTNAME))
942                             prepos.put(expr, code);
943           else
944           if (nameList.equals(CONNECTORLISTNAME))
945                             connector.put(expr, code);
946           else
947           if (nameList.equals(SPURLISTNAME))
948                             spur_match.put(expr, code);
949 
950         }//if
951       }// else
952 
953     }//while
954   }//createAnnotList
955 
956 
957   /** apply_rules_namematch: apply rules similarly to lasie1.5's namematch */
958   private boolean apply_rules_namematch(String annotationType, String shortName,
959                                         String longName) {
960     // first apply rule for spurius matches i.e. rule0
961     if (matchRule0(longName, shortName))
962       return false;
963     if (
964          (// rules for all annotations
965           //no longer use rule1, coz I do the check for same string via the
966           //hash table
967             matchRule2(longName, shortName)
968          ||
969             matchRule3(longName, shortName)
970          ) // rules for all annotations
971          ||
972          (// rules for organisation annotations
973              ( annotationType.equals(organizationType)
974                //ACE addition
975                || annotationType.equals("Facility"))
976              &&
977              (    matchRule4(longName, shortName)
978                ||
979                   matchRule5(longName, shortName)
980                ||
981                   matchRule6(longName, shortName)
982                ||
983                   matchRule7(longName, shortName)
984                ||
985 //                  matchRule8(longName, shortName)
986 //               ||
987                   matchRule9(longName, shortName)
988                ||
989                   matchRule10(longName, shortName)
990                ||
991                   matchRule11(longName, shortName)
992                ||
993                   matchRule12(longName, shortName)
994                ||
995                   matchRule13(shortName, longName)
996               )
997            )// rules for organisation annotations
998          ||
999          (// rules for person annotations
1000             (    annotationType.equals(personType))
1001               &&
1002             (    matchRule4(longName, shortName)
1003               ||
1004                  matchRule5(longName, shortName)
1005               ||
1006                  matchRule14(longName, shortName)
1007               || //kalina: added this, so it matches names when contain more
1008                  //than one first and one last name
1009                  matchRule15(longName, shortName)
1010              )
1011          )// rules for person annotations
1012         ) //if
1013      return true;
1014    return false;
1015  }//apply_rules
1016
1017
1018  /** set the extLists flag */
1019  public void setExtLists(Boolean newExtLists) {
1020    extLists = newExtLists.booleanValue();
1021  }//setextLists
1022
1023  /** set the caseSensitive flag */
1024  public void setCaseSensitive(Boolean newCase) {
1025    caseSensitive = newCase.booleanValue();
1026  }//setextLists
1027
1028  /** set the annotation set name*/
1029  public void setAnnotationSetName(String newAnnotationSetName) {
1030    annotationSetName = newAnnotationSetName;
1031  }//setAnnotationSetName
1032
1033  /** set the types of the annotations*/
1034  public void setAnnotationTypes(List newType) {
1035    annotationTypes = newType;
1036  }//setAnnotationTypes
1037
1038  /** set whether to process the Unknown annotations*/
1039  public void setProcessUnknown(Boolean processOrNot) {
1040    this.matchingUnknowns = processOrNot.booleanValue();
1041  }//setAnnotationTypes
1042
1043  public void setOrganizationType(String newOrganizationType) {
1044    organizationType = newOrganizationType;
1045  }//setOrganizationType
1046
1047  public void setPersonType(String newPersonType) {
1048    personType = newPersonType;
1049  }//setPersonType
1050
1051  /**get the name of the annotation set*/
1052  public String getAnnotationSetName() {
1053    return annotationSetName;
1054  }//getAnnotationSetName
1055
1056  /** get the types of the annotation*/
1057  public List getAnnotationTypes() {
1058    return annotationTypes;
1059  }//getAnnotationTypes
1060
1061  public String getOrganizationType() {
1062    return organizationType;
1063  }
1064
1065  public String getPersonType() {
1066    return personType;
1067  }
1068
1069  public Boolean getExtLists() {
1070    return new Boolean(extLists);
1071  }
1072
1073  /** Are we running in a case-sensitive mode?*/
1074  public Boolean getCaseSensitive() {
1075    return new Boolean(caseSensitive);
1076  }
1077
1078  /** Return whether or not we're processing the Unknown annots*/
1079  public Boolean getProcessUnknown() {
1080    return new Boolean(matchingUnknowns);
1081  }
1082
1083/*
1084  public List getMatchesDocument() {
1085    return matchesDocument;
1086  }
1087*/
1088
1089  protected boolean isUnknownGender(String gender) {
1090    if (gender == null)
1091      return true;
1092    if (gender.equalsIgnoreCase("male") || gender.equalsIgnoreCase("female"))
1093      return false;
1094    return true;
1095
1096  } //isUnknownGender
1097
1098  /** RULE #0: If the two names are listed in table of
1099    * spurius matches then they do NOT match
1100    * Condition(s): -
1101    * Applied to: all name annotations
1102    */
1103  public boolean matchRule0(String s1,
1104           String s2) {
1105    if (spur_match.containsKey(s1)
1106        && spur_match.containsKey(s2) )
1107      return
1108        spur_match.get(s1).toString().equals(spur_match.get(s2).toString());
1109
1110    return false;
1111  }//matchRule0
1112
1113  /** RULE #1: If the two names are identical then they are the same
1114    * no longer used, because I do the check for same string via the
1115    * hash table of previous annotations
1116    * Condition(s): depend on case
1117    * Applied to: all name annotations
1118    */
1119  public boolean matchRule1(String s1,
1120           String s2,
1121           boolean matchCase) {
1122//    Out.prln("Rule1: Matching " + s1 + "and " + s2);
1123
1124    boolean matched = false;
1125    if (!matchCase)
1126        matched = s1.equalsIgnoreCase(s2);
1127    else matched =  s1.equals(s2) ;
1128//kalina: do not remove, nice for debug
1129//    if (matched && (s2.startsWith("Kenneth") || s1.startsWith("Kenneth")))
1130//        Out.prln("Rule1: Matched " + s1 + "and " + s2);
1131    return matched;
1132  }//matchRule1
1133
1134
1135  /**
1136    * RULE #2: if the two names are listed as equivalent in the
1137    * lookup table (alias) then they match
1138    * Condition(s): -
1139    * Applied to: all name annotations
1140    */
1141  public boolean matchRule2(String s1,
1142           String s2) {
1143
1144    if (alias.containsKey(s1) && alias.containsKey(s2))
1145      return (alias.get(s1).toString().equals(alias.get(s2).toString()));
1146
1147    return false;
1148  }//matchRule2
1149
1150  /**
1151    * RULE #3: adding a possessive at the end
1152    * of one name causes a match
1153    * e.g. "Standard and Poor" == "Standard and Poor's"
1154    * and also "Standard and Poor" == "Standard's"
1155    * Condition(s): case-insensitive match
1156    * Applied to: all name annotations
1157    */
1158  public boolean matchRule3(String s1, //long string
1159                             String s2) { //short string
1160
1161    if (s2.endsWith("'s") || s2.endsWith("'")
1162        ||(s1.endsWith("'s")|| s1.endsWith("'"))) {
1163
1164
1165      String s2_poss = null;
1166
1167      if (!s2.endsWith("'s")) s2_poss = s2.concat("'s");
1168      else s2_poss = s2.concat("'");
1169
1170      if (s2_poss != null && matchRule1(s1, s2_poss,caseSensitive)) return true;
1171
1172      // now check the second case i.e. "Standard and Poor" == "Standard's"
1173      String token = (String)
1174        ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1175
1176      if (!token.endsWith("'s")) s2_poss = token.concat("'s");
1177      else s2_poss = token.concat("'");
1178
1179      if (s2_poss != null && matchRule1(s2_poss,s2,caseSensitive)) return true;
1180
1181    } // if (s2.endsWith("'s")
1182    return false;
1183  }//matchRule3
1184
1185  /**
1186    * RULE #4: Do all tokens other than the punctuation marks
1187    * , and . match?
1188    * e.g. "Smith, Jones" == "Smith Jones"
1189    * Condition(s): case-insensitive match
1190    * Applied to: organisation and person annotations
1191    */
1192  public boolean matchRule4(String s1,
1193           String s2) {
1194
1195    boolean allTokensMatch = true;
1196
1197    Iterator tokensLongAnnotIter = tokensLongAnnot.iterator();
1198    Iterator tokensShortAnnotIter = tokensShortAnnot.iterator();
1199    while (tokensLongAnnotIter.hasNext() && tokensShortAnnotIter.hasNext()) {
1200      Annotation token = (Annotation) tokensLongAnnotIter.next();
1201      if (((String)token.getFeatures().get(TOKEN_KIND_FEATURE_NAME)).equals(PUNCTUATION_VALUE))
1202        continue;
1203//      Out.prln("Matching" + tokensLongAnnot + " with " + tokensShortAnnot);
1204      if (! token.getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals(
1205             ((Annotation) tokensShortAnnotIter.next()).getFeatures().get(TOKEN_STRING_FEATURE_NAME))) {
1206        allTokensMatch = false;
1207        break;
1208      } // if (!tokensLongAnnot.nextToken()
1209    } // while
1210//    if (allTokensMatch)
1211//      Out.prln("rule4 fired. result is: " + allTokensMatch);
1212    return allTokensMatch;
1213  }//matchRule4
1214
1215  /**
1216    * RULE #5: if the 1st token of one name
1217    * matches the second name
1218    * e.g. "Pepsi Cola" == "Pepsi"
1219    * Condition(s): case-insensitive match
1220    * Applied to: all name annotations
1221    */
1222  public boolean matchRule5(String s1,
1223           String s2) {
1224
1225    //do not match numbers by this rule
1226    if (tokensLongAnnot.size()> 1 &&
1227        ((Annotation) tokensLongAnnot.get(0)).getFeatures().get("kind").equals("number"))
1228      return false;
1229
1230//    if (s1.startsWith("Patrick") || s2.startsWith("Patrick")) {
1231//      Out.prln("Rule 5: " + s1 + "and " + s2);
1232//    }
1233
1234    //require that when matching person names, the shorter one to be of length 1
1235    //for the rule to apply. In other words, avoid matching Peter Smith and
1236    //Peter Kline, because they share a Peter token.
1237    if ( (shortAnnot.getType().equals(personType)
1238         || longAnnot.getType().equals(personType)
1239         )
1240       &&
1241         tokensShortAnnot.size()>1
1242       )
1243       return false;
1244
1245    if (tokensLongAnnot.size()<=1)
1246      return false;
1247    boolean result = matchRule1((String)
1248                      ((Annotation) tokensLongAnnot.get(0)
1249                        ).getFeatures().get(TOKEN_STRING_FEATURE_NAME),
1250                      s2,
1251                      caseSensitive);
1252
1253//    if (s1.startsWith("Patrick") || s2.startsWith("Patrick"))
1254//      Out.prln("rule 5 result: " + result);
1255    return result;
1256
1257  }//matchRule5
1258
1259  /**
1260    * RULE #6: if one name is the acronym of the other
1261    * e.g. "Imperial Chemical Industries" == "ICI"
1262    * Applied to: organisation annotations only
1263    */
1264  public boolean matchRule6(String s1,
1265           String s2) {
1266
1267    int i = 0;
1268
1269    //check and if the shorted string has a space in it, then it's not
1270    //an acronym
1271    if (s2.indexOf(" ") > 0)
1272      return false;
1273
1274    //Out.prln("Acronym: Matching " + s1 + "and " + s2);
1275    StringBuffer acronym_s1 = new StringBuffer("");
1276    StringBuffer acronymDot_s1 = new StringBuffer("");
1277
1278    for ( ;i < tokensLongAnnot.size(); i++ ) {
1279      String toAppend = ( (String) ((Annotation) tokensLongAnnot.get(i)
1280                         ).getFeatures().get(TOKEN_STRING_FEATURE_NAME)).substring(0,1);
1281      acronym_s1.append(toAppend);
1282      acronymDot_s1.append(toAppend);
1283      acronymDot_s1.append(".");
1284    }
1285
1286    //Out.prln("Acronym dot: To Match " + acronymDot_s1 + "and " + s2);
1287    //Out.prln("Result: " + matchRule1(acronymDot_s1.toString(),s2,caseSensitive));
1288
1289    if (matchRule1(acronym_s1.toString(),s2,caseSensitive) ||
1290        matchRule1(acronymDot_s1.toString(),s2,caseSensitive) )
1291      return true;
1292
1293    return false;
1294  }//matchRule6
1295
1296  /**
1297    * RULE #7: if one of the tokens in one of the
1298    * names is in the list of separators eg. "&"
1299    * then check if the token before the separator
1300    * matches the other name
1301    * e.g. "R.H. Macy & Co." == "Macy"
1302    * Condition(s): case-sensitive match
1303    * Applied to: organisation annotations only
1304    */
1305  public boolean matchRule7(String s1,
1306           String s2) {
1307
1308    //don't try it unless the second string is just one token
1309    if (tokensShortAnnot.size() != 1)
1310      return false;
1311
1312    String previous_token = null;
1313
1314    for (int i = 0;  i < tokensLongAnnot.size(); i++ ) {
1315      if (connector.containsKey( ((Annotation) tokensLongAnnot.get(i)
1316          ).getFeatures().get(TOKEN_STRING_FEATURE_NAME) )) {
1317        previous_token = (String) ((Annotation) tokensLongAnnot.get(i-1)
1318                                    ).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1319
1320        break;
1321      }
1322    }
1323
1324    //now match previous_token with other name
1325    if (previous_token != null) {
1326//      if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin"))
1327//        Out.prln("Rule7");
1328      return matchRule1(previous_token,s2,caseSensitive);
1329
1330    }
1331    return false;
1332  }//matchRule7
1333
1334  /**
1335   * This rule is now obsolete, as The and the trailing CDG
1336   * are stripped before matching.
1337   * DO NOT CALL!!!
1338   *
1339    * RULE #8: if the names match, ignoring The and
1340    * and trailing company designator (which have already been stripped)
1341    * e.g. "The Magic Tricks Co." == "Magic Tricks"
1342    * Condition(s): case-sensitive match
1343    * Applied to: organisation annotations only
1344    */
1345  public boolean matchRule8(String s1,
1346           String s2) {
1347    Out.prln("OrthoMatcher warning: This rule has been discontinued!");
1348/*
1349    if (s1.startsWith("The ")) s1 = s1.substring(4);
1350    if (s2.startsWith("The ")) s2 = s2.substring(4);
1351
1352    // check that cdg is not empty
1353    if (!cdg.isEmpty()) {
1354      String stringToTokenize1 = s1;
1355      StringTokenizer tokensLongAnnot = new StringTokenizer(stringToTokenize1," ");
1356
1357      String stringToTokenize2 = s2;
1358      StringTokenizer tokensShortAnnot = new StringTokenizer(stringToTokenize2," ");
1359      String token = null;
1360      String cdg1 = null;
1361      String cdg2 = null;
1362
1363      s1 = "";
1364      s2 = "";
1365
1366      //check last token of s1
1367      while (tokensLongAnnot.hasMoreTokens()) {
1368        token = tokensLongAnnot.nextToken();
1369        if (!tokensLongAnnot.hasMoreTokens()
1370            && cdg.contains(token)) cdg1=token;
1371        else s1 = s1+token;
1372      }
1373
1374      // do the same for s2
1375      while (tokensShortAnnot.hasMoreTokens()) {
1376        token = tokensShortAnnot.nextToken();
1377        if (!tokensShortAnnot.hasMoreTokens()
1378          && cdg.contains(token)) cdg2=token;
1379        else s2 = s2+token;
1380      }
1381
1382      // if the company designators are different
1383      // then they are NOT the same organisations
1384      if ((cdg1!=null && cdg2!=null)
1385    && !cdg1.equalsIgnoreCase(cdg2)) return false;
1386    }
1387    if (!s1.equals("") && !s2.equals("")) return matchRule1(s1,s2,caseSensitive);
1388*/
1389    return false;
1390
1391  }//matchRule8
1392
1393  /**
1394    * RULE #9: does one of the names match the token
1395    * just before a trailing company designator
1396    * in the other name?
1397    * The company designator has already been chopped off,
1398    * so the token before it, is in fact the last token
1399    * e.g. "R.H. Macy Co." == "Macy"
1400    * Applied to: organisation annotations only
1401    */
1402  public boolean matchRule9(String s1,
1403           String s2) {
1404
1405//    if (s1.equalsIgnoreCase("news") || s2.equalsIgnoreCase("news"))
1406//      Out.prln("Rule 9 " + s1 + " and " + s2);
1407    String s1_short = (String)
1408                      ((Annotation) tokensLongAnnot.get(
1409                          tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1410//    Out.prln("Converted to " + s1_short);
1411    if (tokensLongAnnot.size()>1) {
1412      boolean matched = matchRule1(s1_short, s2, caseSensitive);
1413      //we need to make sure all names match, instead of assuming transitivity,
1414      //to avoid matching BBC News with News then News with ITV News, which
1415      //by transitivity leads to BBC News matching ITV News which is not what
1416      //we want
1417      if (matched)
1418        allMatchingNeeded = true;
1419      return matched;
1420    } //if
1421
1422    return false;
1423  }//matchRule9
1424
1425  /**
1426    * RULE #10: is one name the reverse of the other
1427    * reversing around prepositions only?
1428    * e.g. "Department of Defence" == "Defence Department"
1429    * Condition(s): case-sensitive match
1430    * Applied to: organisation annotations only
1431    */
1432  public boolean matchRule10(String s1,
1433            String s2) {
1434
1435    String token = null;
1436    String previous_token = null;
1437    String next_token = null;
1438    boolean invoke_rule=false;
1439
1440    if (tokensLongAnnot.size() >= 3
1441        && tokensShortAnnot.size() >= 2) {
1442
1443      // first get the tokens before and after the preposition
1444      int i = 0;
1445      for (; i< tokensLongAnnot.size(); i++) {
1446        token = (String)
1447                  ((Annotation) tokensLongAnnot.get(i)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1448        if (prepos.containsKey(token)) {
1449          invoke_rule=true;
1450          break;
1451        }//if
1452        previous_token = token;
1453      }//while
1454
1455      if (! invoke_rule)
1456        return false;
1457
1458      if (i < tokensLongAnnot.size()
1459          && previous_token != null)
1460        next_token= (String)
1461                    ((Annotation) tokensLongAnnot.get(i++)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1462      else return false;
1463
1464      String s21 = (String)
1465                    ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1466      String s22 = (String)
1467                    ((Annotation) tokensShortAnnot.get(1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1468      // then compare (in reverse) with the first two tokens of s2
1469      if (matchRule1(next_token,(String) s21,caseSensitive)
1470          && matchRule1(previous_token, s22,caseSensitive))
1471        return true ;
1472    }//if (tokensLongAnnot.countTokens() >= 3
1473    return false;
1474  }//matchRule10
1475
1476  /**
1477    * RULE #11: does one name consist of contractions
1478    * of the first two tokens of the other name?
1479    * e.g. "Communications Satellite" == "ComSat"
1480    * and "Pan American" == "Pan Am"
1481    * Condition(s): case-sensitive match
1482    * Applied to: organisation annotations only
1483    */
1484  public boolean matchRule11(String s1,
1485            String s2) {
1486
1487
1488    // first do the easy case e.g. "Pan American" == "Pan Am"
1489
1490    String token11 = null;
1491    String token12 = null;
1492    String token21 = null;
1493    String token22 = null;
1494
1495    if (tokensLongAnnot.size() < 2)
1496      return false;
1497
1498    // 1st get the first two tokens of s1
1499    token11 = (String)
1500                ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1501    token12 = (String)
1502                ((Annotation) tokensLongAnnot.get(1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1503
1504    // now check for the first case i.e. "Pan American" == "Pan Am"
1505    if (tokensShortAnnot.size() == 2)  {
1506
1507      token21 = (String)
1508                  ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1509      token22 = (String)
1510                  ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1511
1512      if (token11.startsWith(token21)
1513          && token12.startsWith(token22))
1514        return true;
1515
1516    } // if (tokensShortAnnot.countTokens() == 2)
1517
1518    // now the second case e.g.  "Communications Satellite" == "ComSat"
1519    else if (tokensShortAnnot.size()==1 && s2.length()>=3) {
1520
1521      // split the token into possible contractions
1522      // ignore case for matching
1523      for (int i=2;i<s2.length();i++) {
1524        token21=s2.substring(0,i+1);
1525        token22=s2.substring(i+1);
1526
1527        if (token11.startsWith(token21)
1528            && token12.startsWith(token22))
1529          return true;
1530      }// for
1531    } // else if
1532
1533    return false;
1534  }//matchRule11
1535
1536  /**
1537    * RULE #12: do the first and last tokens of one name
1538    * match the first and last tokens of the other?
1539    * Condition(s): case-sensitive match
1540    * Applied to: organisation annotations only
1541    */
1542  public boolean matchRule12(String s1,
1543            String s2) {
1544
1545    // first do the easy case e.g. "Pan American" == "Pan Am"
1546
1547    if (tokensLongAnnot.size()>1 && tokensShortAnnot.size()>1) {
1548//     Out.prln("Rule 12");
1549
1550      // get first and last tokens of s1 & s2
1551      String s1_first = (String)
1552                     ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1553      String s2_first = (String)
1554                     ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1555
1556      if (!matchRule1(s1_first,s2_first,caseSensitive))
1557        return false;
1558
1559      String s1_last = (String)
1560         ((Annotation) tokensLongAnnot.get(tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1561      String s2_last = (String)
1562         ((Annotation) tokensShortAnnot.get(tokensShortAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1563
1564      return matchRule1(s1_last,s2_last,caseSensitive);
1565    } // if (tokensLongAnnot.countTokens()>1
1566    return false;
1567  }//matchRule12
1568
1569  /**
1570    * RULE #13: do multi-word names match except for
1571    * one token e.g.
1572    * "Second Force Recon Company" == "Force Recon Company"
1573    * Note that this rule has NOT been used in LaSIE's 1.5
1574    * namematcher
1575    * Restrictions: - remove cdg first
1576    *               - shortest name should be 2 words or more
1577    *               - if N is the number of tokens of the longest
1578    *                 name, then N-1 tokens should be matched
1579    * Condition(s): case-sensitive match
1580    * Applied to: organisation or person annotations only
1581    */
1582  public boolean matchRule13(String s1,
1583            String s2) {
1584
1585
1586    String token1 = null;
1587    String token2 = null;
1588
1589    int matched_tokens = 0, mismatches = 0;;
1590
1591    // if names < 2 words then rule is invalid
1592    if (tokensLongAnnot.size() < 3 || tokensShortAnnot.size() < 2) return false;
1593
1594//    if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) {
1595//      Out.prln("Rule 13: Matching tokens" + tokensLongAnnot);
1596//      Out.prln("with tokens " + tokensShortAnnot);
1597//    }
1598
1599    // now do the matching
1600    for (int i=0,j= 0; i < tokensShortAnnot.size() && mismatches < 2; i++) {
1601
1602//      Out.prln("i = " + i);
1603//      Out.prln("j = " + j);
1604      if ( ((Annotation) tokensLongAnnot.get(j)).getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals(
1605           ((Annotation) tokensShortAnnot.get(i)).getFeatures().get(TOKEN_STRING_FEATURE_NAME)) ) {
1606        matched_tokens++;
1607        j++;
1608      } else
1609        mismatches++;
1610    } // for
1611
1612    if (matched_tokens >= tokensLongAnnot.size()-1)
1613      return true;
1614
1615    return false;
1616  }//matchRule13
1617
1618  /**
1619    * RULE #14: if the last token of one name
1620    * matches the second name
1621    * e.g. "Hamish Cunningham" == "Cunningham"
1622    * Condition(s): case-insensitive match
1623    * Applied to: all person annotations
1624    */
1625  public boolean matchRule14(String s1,
1626           String s2) {
1627
1628//    if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin"))
1629//      Out.prln("Rule 14 " + s1 + " and " + s2);
1630    String s1_short = (String)
1631                      ((Annotation) tokensLongAnnot.get(
1632                          tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1633//    Out.prln("Converted to " + s1_short);
1634    if (tokensLongAnnot.size()>1)
1635      return matchRule1(s1_short,
1636                      s2,
1637                      caseSensitive);
1638
1639    return false;
1640
1641  }//matchRule14
1642
1643  /**
1644    * RULE #15: does one token from a Person name appear as the other token
1645    * Note that this rule has NOT been used in LaSIE's 1.5
1646    * namematcher; added for ACE by Di's request
1647    */
1648  public boolean matchRule15(String s1,
1649            String s2) {
1650
1651    int matched_tokens = 0;
1652
1653    // if names < 2 words then rule is invalid
1654
1655//    if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) {
1656//      Out.prln("Rule 15:" );
1657//      Out.prln("with tokens " + tokensShortAnnot);
1658//    }
1659
1660    // now do the matching
1661    Annotation token1, token2;
1662    for (int i=0; i < tokensShortAnnot.size() && matched_tokens == 0; i++) {
1663      token1 = (Annotation) tokensShortAnnot.get(i);
1664      //first check if not punctuation, because we need to skip it
1665      if (token1.getFeatures().get(TOKEN_KIND_FEATURE_NAME).equals(PUNCTUATION_VALUE))
1666        continue;
1667
1668      for (int j=0; j<tokensLongAnnot.size() && matched_tokens ==0; j++) {
1669//      Out.prln("i = " + i);
1670        token2 = (Annotation) tokensLongAnnot.get(j);
1671        if (token2.getFeatures().get(TOKEN_KIND_FEATURE_NAME).equals(PUNCTUATION_VALUE))
1672          continue;
1673        if ( token1.getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals(
1674             token2.getFeatures().get(TOKEN_STRING_FEATURE_NAME)) )
1675          matched_tokens++;
1676      }//for
1677    } // for
1678
1679    //19 February 2002: kalina
1680    //was originally > 0 (i.e., any match is good)
1681    //ensure that we've matched all the tokens in the short annotation
1682    //the reason for that is, because otherwise we match
1683    //Patrick Viera and Patrick Somebody - not good!
1684    if (matched_tokens == tokensShortAnnot.size())
1685      return true;
1686
1687    return false;
1688  }//matchRule15
1689
1690
1691  /** Tables for namematch info
1692    * (used by the namematch rules)
1693    */
1694  private void buildTables(AnnotationSet nameAllAnnots) {
1695
1696    //reset the tables first
1697    cdg.clear();
1698
1699    if (! extLists) {
1700    // i.e. get cdg from Lookup annotations
1701      // get all Lookup annotations
1702      tempMap.clear();
1703      tempMap.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, "cdg");
1704      //now get all lookup annotations which are cdg
1705      AnnotationSet nameAnnots =
1706        nameAllAnnots.get(LOOKUP_ANNOTATION_TYPE, tempMap);
1707
1708      if ((nameAnnots ==null) || nameAnnots.isEmpty())
1709        return;
1710
1711      Iterator iter = nameAnnots.iterator();
1712      while (iter.hasNext()) {
1713         Annotation annot = (Annotation)iter.next();
1714         // get the actual string
1715         Long offsetStartAnnot = annot.getStartNode().getOffset();
1716         Long offsetEndAnnot = annot.getEndNode().getOffset();
1717         try {
1718           gate.Document doc = nameAllAnnots.getDocument();
1719           String annotString =
1720                            doc.getContent().getContent(
1721                            offsetStartAnnot,offsetEndAnnot
1722                            ).toString();
1723                cdg.add(annotString);
1724         } catch (InvalidOffsetException ioe) {
1725             ioe.printStackTrace(Err.getPrintWriter());
1726         }
1727      }// while
1728    }//if
1729  }//buildTables
1730
1731  /** substitute all multiple spaces, tabes and newlines
1732    * with a single space
1733    */
1734  public String regularExpressions ( String text, String replacement,
1735                                      String regEx) {
1736    String result = text;
1737    try {
1738      RE re = new RE(regEx);
1739      result = re.substituteAll( text,replacement);
1740    } catch (REException ree) {ree.printStackTrace();}
1741    return result;
1742  }
1743
  /**
    * Stores the given URL in the definitionFileURL field.
    *
    * @param definitionFileURL the new value of the field
    */
  public void setDefinitionFileURL(java.net.URL definitionFileURL) {
    this.definitionFileURL = definitionFileURL;
  }
1747
  /**
    * Returns the current value of the definitionFileURL field.
    *
    * @return the URL held in definitionFileURL
    */
  public java.net.URL getDefinitionFileURL() {
    return definitionFileURL;
  }
  /**
    * Stores the given value in the encoding field.
    *
    * @param encoding the new value of the field
    */
  public void setEncoding(String encoding) {
    this.encoding = encoding;
  }
  /**
    * Returns the current value of the encoding field.
    *
    * @return the string held in encoding
    */
  public String getEncoding() {
    return encoding;
  }//getEncoding
1757
1758
  // Empty nested class: it declares no fields, constructors or methods.
  // NOTE(review): no use of it is visible in this part of the file -
  // confirm it is unreferenced before removing it.
  private static class Class1 {
  }
1761} // public class OrthoMatcher
1762
1763