BasicAnnotationOrthography.java
001 package gate.creole.orthomatcher;
002 
003 import static gate.creole.ANNIEConstants.ANNOTATION_COREF_FEATURE_NAME;
004 import static gate.creole.ANNIEConstants.LOOKUP_ANNOTATION_TYPE;
005 import static gate.creole.orthomatcher.OrthoMatcherHelper.getStringForSpan;
006 import static gate.creole.orthomatcher.OrthoMatcherHelper.round2Places;
007 import gate.Annotation;
008 import gate.AnnotationSet;
009 import gate.Document;
010 import gate.Factory;
011 import gate.FeatureMap;
012 import gate.creole.ExecutionException;
013 import gate.util.BomStrippingInputStreamReader;
014 import gate.util.Err;
015 import gate.util.InvalidOffsetException;
016 
017 import java.io.BufferedReader;
018 import java.io.IOException;
019 import java.net.URL;
020 import java.util.ArrayList;
021 import java.util.Arrays;
022 import java.util.Collections;
023 import java.util.HashMap;
024 import java.util.HashSet;
025 import java.util.Iterator;
026 import java.util.List;
027 import java.util.Map;
028 import java.util.Set;
029 import java.util.regex.Pattern;
030 
031 import org.apache.commons.io.IOUtils;
032 import org.apache.log4j.Logger;
033 
034 /*
035  * This class defines an orthography which defines the primary behaviour of the
036  * Orthomatcher processing resource in GATE.
037  */
038 public class BasicAnnotationOrthography implements AnnotationOrthography {
039   private final boolean extLists;
040 
041   private final String personType;
042 
043   private final String unknownType;
044 
045   private Map<String, Set<String>> nicknameMap =
046       new HashMap<String, Set<String>>();
047 
048   private final Double minimumNicknameLikelihood;
049 
050   public BasicAnnotationOrthography(String personType, boolean extLists,
051       String unknownType, URL nicknameFile, Double minimumNicknameLikelihood,
052       String encoding) {
053     this.personType = personType;
054     this.extLists = extLists;
055     this.unknownType = unknownType;
056     this.minimumNicknameLikelihood = minimumNicknameLikelihood;
057     try {
058       if(nicknameFile != nullthis.initNicknames(encoding, nicknameFile);
059     catch(IOException e) {
060       log.warn("Could not load nickname map.", e);
061     }
062   }
063 
064   protected static final Logger log = Logger
065       .getLogger(BasicAnnotationOrthography.class);
066 
067   @Override
068   public String getStringForAnnotation(Annotation a, gate.Document d)
069       throws ExecutionException {
070     String annotString =
071         getStringForSpan(a.getStartNode().getOffset(), a.getEndNode()
072             .getOffset(), d);
073     // now do the reg. exp. substitutions
074     annotString = annotString.replaceAll("\\s+"" ");
075     return annotString;
076   }
077 
078   @Override
079   public boolean fuzzyMatch(String s1, String s2) {
080     String s1Lower = s1.toLowerCase();
081     String s2Lower = s2.toLowerCase();
082     if(s1Lower.equals(s2Lower)) { return true}
083     // System.out.println("Now comparing " + s1 + " | " + s2) ;
084     Set<String> formalNameSet = nicknameMap.get(s1Lower);
085     if(formalNameSet != null) {
086       if(formalNameSet.contains(s2Lower)) { return true}
087     }
088     formalNameSet = nicknameMap.get(s2Lower);
089     if(formalNameSet != null) {
090       if(formalNameSet.contains(s1Lower)) { return true}
091     }
092     return false;
093   }
094 
095   /**
096    @return true if all of the tokens in firstName are either found in second
097    *         name or are stop words
098    */
099   @Override
100   public boolean allNonStopTokensInOtherAnnot(List<Annotation> firstName,
101       List<Annotation> secondName, String TOKEN_STRING_FEATURE_NAME,
102       boolean caseSensitive) {
103     for(Annotation a : firstName) {
104       if(!a.getFeatures().containsKey("ortho_stop")) {
105         String aString = (String)a.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
106         boolean foundAMatchInSecond = false;
107         for(Annotation b : secondName) {
108           if(OrthoMatcherHelper.straightCompare(aString, (String)b
109               .getFeatures().get(TOKEN_STRING_FEATURE_NAME), caseSensitive)) {
110             foundAMatchInSecond = true;
111             break;
112           }
113         }
114         if(!foundAMatchInSecond) { return false}
115       }
116     }
117     return true;
118   }
119 
120   /**
121    * Return a person name without a title. Also remove title from global
122    * variable tokensMap
123    */
124   @Override
125   public String stripPersonTitle(String annotString, Annotation annot,
126       Document doc, Map<Integer, List<Annotation>> tokensMap,
127       Map<Integer,List<Annotation>> normalizedTokensMap, AnnotationSet nameAllAnnots)
128       throws ExecutionException {
129     FeatureMap queryFM = Factory.newFeatureMap();
130     // get the offsets
131     Long startAnnot = annot.getStartNode().getOffset();
132     Long endAnnot = annot.getEndNode().getOffset();
133     // determine "Lookup" annotation set
134     queryFM.clear();
135     queryFM.put("majorType""title");
136     AnnotationSet as1 = nameAllAnnots.getContained(startAnnot, endAnnot);
137     if(as1 == null || as1.isEmpty()) return annotString;
138     AnnotationSet as = as1.get("Lookup", queryFM);
139     if(as != null && !as.isEmpty()) {
140       List<Annotation> titles = new ArrayList<Annotation>(as);
141       Collections.sort(titles, new gate.util.OffsetComparator());
142       Iterator<Annotation> iter = titles.iterator();
143       while(iter.hasNext()) {
144         Annotation titleAnn = iter.next();
145         // we've not found a title at the start offset,
146         // there's no point in looking further
147         // coz titles come first
148         if(titleAnn.getStartNode().getOffset().compareTo(startAnnot!= 0)
149           return annotString;
150         try {
151           // the title from the current annotation
152           String annotTitle =
153               doc.getContent()
154                   .getContent(titleAnn.getStartNode().getOffset(),
155                       titleAnn.getEndNode().getOffset()).toString();
156           // eliminate the title from annotation string and return the result
157           if(annotTitle.length() < annotString.length()) {
158             // remove from the array of tokens, so then we can compare properly
159             // the remaining tokens
160             // log.debug("Removing title from: " + annot + " with string " +
161             // annotString);
162             // log.debug("Tokens are " + tokensMap.get(annot.getId()));
163             // log.debug("Title is " + annotTitle);
164             tokensMap.get(annot.getId()).remove(0);
165             normalizedTokensMap.get(annot.getId()).remove(0);
166             return annotString.substring(annotTitle.length() 1,
167                 annotString.length());
168           }
169         catch(InvalidOffsetException ioe) {
170           throw new ExecutionException("Invalid offset of the annotation");
171         }// try
172       }// while
173     }// if
174     return annotString;
175   }
176 
177   @Override
178   public boolean matchedAlready(Annotation annot1, Annotation annot2,
179       List<List<Integer>> matchesDocFeature, AnnotationSet nameAllAnnots) {
180     // the two annotations are already matched if the matches list of the first
181     // contains the id of the second
182     @SuppressWarnings("unchecked")
183     List<Integer> matchesList =
184         (List<Integer>)annot1.getFeatures().get(ANNOTATION_COREF_FEATURE_NAME);
185     if((matchesList == null|| matchesList.isEmpty())
186       return false;
187     else if(matchesList.contains(annot2.getId())) return true;
188     return false;
189   }
190 
191   @Override
192   public Annotation updateMatches(Annotation newAnnot, String annotString,
193       Map<Integer, String> processedAnnots, AnnotationSet nameAllAnnots,
194       List<List<Integer>> matchesDocFeature) {
195     Annotation matchedAnnot = null;
196     Integer id;
197     // first find a processed annotation with the same string
198     // TODO: Andrew Borthwick 7/26/08: The below is very inefficient. We should
199     // be doing a lookup into a hash
200     // which is indexed on string rather than testing every id. Need to have the
201     // index be String + Type
202     // for safety
203     Iterator<Integer> iter = processedAnnots.keySet().iterator();
204     // System.out.println("ID's examined: ");
205     while(iter.hasNext()) {
206       id = iter.next();
207       String oldString = processedAnnots.get(id);
208       // System.out.print(id + " ");
209       if(annotString.equals(oldString)) {
210         Annotation tempAnnot = nameAllAnnots.get(id);
211         if(tempAnnot == null) {
212           log.debug("Orthomatcher: TempAnnot is null when looking at "
213               + annotString + " | " + oldString + " | old id: " + id);
214           return null;
215         }
216         // Below is a new Spock addition to prevent unpredictable behavior when
217         // the same string is given more than one type. We want to return null
218         // if there is no match on name + type (other than Unknown)
219         if(newAnnot.getType().equals(unknownType)
220             || tempAnnot.getType().equals(newAnnot.getType())) {
221           matchedAnnot = tempAnnot;
222           break;
223         }
224       }
225     }// while
226      // System.out.println();
227     if(matchedAnnot == nullreturn null;
228     @SuppressWarnings("unchecked")
229     List<Integer> matchesList =
230         (List<Integer>)matchedAnnot.getFeatures().get(ANNOTATION_COREF_FEATURE_NAME);
231     if((matchesList == null|| matchesList.isEmpty()) {
232       // no previous matches, so need to add
233       if(matchesList == null) {
234         matchesList = new ArrayList<Integer>();
235         matchedAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME,
236             matchesList);
237         matchesDocFeature.add(matchesList);
238       }// if
239       matchesList.add(matchedAnnot.getId());
240       matchesList.add(newAnnot.getId());
241     else {
242       // just add the new annotation
243       matchesList.add(newAnnot.getId());
244     }// if
245      // add the matches list to the new annotation
246     newAnnot.getFeatures().put(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME,
247         matchesList);
248     return matchedAnnot;
249   }
250 
251   @Override
252   public void updateMatches(Annotation newAnnot, Annotation prevAnnot,
253       List<List<Integer>> matchesDocFeature, AnnotationSet nameAllAnnots) {
254     @SuppressWarnings("unchecked")
255     List<Integer> matchesList =
256         (List<Integer>)prevAnnot.getFeatures().get(
257             OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME);
258     if((matchesList == null|| matchesList.isEmpty()) {
259       // no previous matches, so need to add
260       if(matchesList == null) {
261         matchesList = new ArrayList<Integer>();
262         prevAnnot.getFeatures().put(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME,
263             matchesList);
264         matchesDocFeature.add(matchesList);
265       }// if
266       matchesList.add(prevAnnot.getId());
267       matchesList.add(newAnnot.getId());
268     else {
269       // just add the new annotation
270       matchesList.add(newAnnot.getId());
271     }// if
272      // add the matches list to the new annotation
273     newAnnot.getFeatures().put(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME,
274         matchesList);
275     // propagate the gender if two persons are matched
276     if(prevAnnot.getType().equals(this.personType)) {
277       String prevGender =
278           (String)prevAnnot.getFeatures().get(
279               OrthoMatcher.PERSON_GENDER_FEATURE_NAME);
280       String newGender =
281           (String)newAnnot.getFeatures().get(
282               OrthoMatcher.PERSON_GENDER_FEATURE_NAME);
283       boolean unknownPrevGender = isUnknownGender(prevGender);
284       boolean unknownNewGender = isUnknownGender(newGender);
285       if(unknownPrevGender && !unknownNewGender)
286         prevAnnot.getFeatures().put(OrthoMatcher.PERSON_GENDER_FEATURE_NAME,
287             newGender);
288       else if(unknownNewGender && !unknownPrevGender)
289         newAnnot.getFeatures().put(OrthoMatcher.PERSON_GENDER_FEATURE_NAME,
290             prevGender);
291     }// if
292   }
293 
294   /**
295    * Tables for namematch info (used by the namematch rules)
296    */
297   @Override
298   public Set<String> buildTables(AnnotationSet nameAllAnnots) {
299     FeatureMap tempMap = Factory.newFeatureMap();
300     // reset the tables first
301     Set<String> cdg = new HashSet<String>();
302     if(!extLists) {
303       // i.e. get cdg from Lookup annotations
304       // get all Lookup annotations
305       tempMap.clear();
306       tempMap.put(gate.creole.ANNIEConstants.LOOKUP_MAJOR_TYPE_FEATURE_NAME,
307           "cdg");
308       // now get all lookup annotations which are cdg
309       AnnotationSet nameAnnots =
310           nameAllAnnots.get(LOOKUP_ANNOTATION_TYPE, tempMap);
311       if((nameAnnots == null|| nameAnnots.isEmpty()) return cdg;
312       Iterator<Annotation> iter = nameAnnots.iterator();
313       while(iter.hasNext()) {
314         Annotation annot = iter.next();
315         // get the actual string
316         Long offsetStartAnnot = annot.getStartNode().getOffset();
317         Long offsetEndAnnot = annot.getEndNode().getOffset();
318         try {
319           gate.Document doc = nameAllAnnots.getDocument();
320           String annotString =
321               doc.getContent().getContent(offsetStartAnnot, offsetEndAnnot)
322                   .toString();
323           cdg.add(annotString);
324         catch(InvalidOffsetException ioe) {
325           ioe.printStackTrace(Err.getPrintWriter());
326         }
327       }// while
328     }// if
329     return cdg;
330   }// buildTables
331 
332   @Override
333   public boolean isUnknownGender(String gender) {
334     if(gender == nullreturn true;
335     if(gender.equalsIgnoreCase("male"|| gender.equalsIgnoreCase("female"))
336       return false;
337     return true;
338   // isUnknownGender
339 
340   protected Map<String, Set<String>> initNicknames(String nicknameFileEncoding,
341           java.net.URL fileURLthrows IOException {
342     Pattern spacePat = Pattern.compile("(\\s+)");
343     nicknameMap = new HashMap<String, Set<String>>();
344     // create the relative URL
345     BufferedReader reader = null;
346     try {
347       reader = new BomStrippingInputStreamReader(fileURL.openStream(),
348               nicknameFileEncoding);
349       String lineRead = null;
350 
351       while((lineRead = reader.readLine()) != null) {
352         if(lineRead.length() == || lineRead.charAt(0== '#') {
353           continue;
354         }
355         List<String> nickNameLine =
356                 Arrays.asList(spacePat.split(lineRead
357                         .toLowerCase().trim()));
358         if(nickNameLine.size() != 3
359                 && (nickNameLine.size() != && ((nickNameLine.get(3!= "M"|| nickNameLine
360                         .get(3!= "F"))) {
361           continue;
362         }
363         if(round2Places(Double.valueOf(nickNameLine.get(2))) < OrthoMatcherHelper
364                 .round2Places(minimumNicknameLikelihood)) {
365           continue;
366         }
367         if(nicknameMap.containsKey(nickNameLine.get(0))) {
368           /*
369            * System.out.println("Adding to existing nickname of " +
370            * nickNameLine.get(0) + " " + nickNameLine.get(1));
371            */
372           nicknameMap.get(nickNameLine.get(0)).add(nickNameLine.get(1));
373         else {
374           /*
375            * System.out.println("Adding new nickname of " +
376            * nickNameLine.get(0) + " " + nickNameLine.get(1));
377            */
378           nicknameMap.put(
379                   nickNameLine.get(0),
380                   new HashSet<String>(
381                           Collections.singleton(nickNameLine.get(1))));
382         }
383       }
384     finally {
385       IOUtils.closeQuietly(reader);
386     }
387     return nicknameMap;
388   }
389 }