1
15
16
17 package gate.creole.orthomatcher;
18
19 import java.io.*;
20 import java.net.URL;
21 import java.util.*;
22
23 import gate.*;
24 import gate.creole.*;
25 import gate.util.*;
26
27 import gnu.regexp.RE;
28 import gnu.regexp.REException;
29
30 public class OrthoMatcher extends AbstractLanguageAnalyser
31 implements ANNIEConstants{
32
/** Name of the "document" parameter. */
public static final String
OM_DOCUMENT_PARAMETER_NAME = "document";

/** Name of the "annotationSetName" parameter. */
public static final String
OM_ANN_SET_PARAMETER_NAME = "annotationSetName";

/** Name of the "caseSensitive" parameter. */
public static final String
OM_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";

/** Name of the "annotationTypes" parameter. */
public static final String
OM_ANN_TYPES_PARAMETER_NAME = "annotationTypes";

/** Name of the "organizationType" parameter. */
public static final String
OM_ORG_TYPE_PARAMETER_NAME = "organizationType";

/** Name of the "personType" parameter. */
public static final String
OM_PERSON_TYPE_PARAMETER_NAME = "personType";

/** Name of the "extLists" parameter. */
public static final String
OM_EXT_LISTS_PARAMETER_NAME = "extLists";

// List names as they appear after the ':' in the definition file; they
// select which table createAnnotList() fills.
protected static final String CDGLISTNAME = "cdg";
protected static final String ALIASLISTNAME = "alias";
protected static final String ARTLISTNAME = "def_art";
protected static final String PREPLISTNAME = "prepos";
protected static final String CONNECTORLISTNAME = "connector";
protected static final String SPURLISTNAME = "spur_match";

/** Token "kind" value that matchRule4 skips over. */
protected static final String PUNCTUATION_VALUE = "punctuation";
/** Leading token that stripCDG removes (compared ignoring case). */
protected static final String THE_VALUE = "The";



/** Name of the annotation set to process; null or "" selects the default set. */
protected String annotationSetName;


/** Annotation types handled by matchNameAnnotations(). */
protected List annotationTypes = new ArrayList(10);


/** Type treated as "organization" (gets stripCDG and the organization rules). */
protected String organizationType = ORGANIZATION_ANNOTATION_TYPE;


/** Type treated as "person" (gets containTitle and the person rules). */
protected String personType = PERSON_ANNOTATION_TYPE;

/** Type of the annotations handled by matchUnknown(). */
protected String unknownType = "Unknown";


/** When false, execute() calls buildTables() before matching. */
protected boolean extLists = true;


/** When true, execute() also runs matchUnknown(). */
protected boolean matchingUnknowns = true;


/**
 * Raised by matchRule9 when it matches; matchAnnotations() reads it to decide
 * whether the new annotation must also match the rest of the match chain.
 */
private boolean allMatchingNeeded = false;

/** When false, annotation strings and list entries are lower-cased. */
protected boolean caseSensitive = false;

/** Reusable feature map; containTitle() uses it as the Lookup query. */
protected FeatureMap queryFM = Factory.newFeatureMap();


// Lookup tables filled by createAnnotList().
/** expression -> code; consulted by matchRule2. */
protected HashMap alias = new HashMap(100);
/** Entries of the "cdg" list; consulted by stripCDG. */
protected HashSet cdg = new HashSet(50);
/** expression -> code; consulted by matchRule0. */
protected HashMap spur_match = new HashMap(100);
/** expression -> code; loaded but not read in the code visible here. */
protected HashMap def_art = new HashMap(20);
/** expression -> code; keys consulted by matchRule7. */
protected HashMap connector = new HashMap(20);
/** expression -> code; keys consulted by matchRule10. */
protected HashMap prepos = new HashMap(30);


// Per-execution state; reset at the end of execute().
/** The annotation set being processed. */
protected AnnotationSet nameAllAnnots = null;
/** annotation id -> normalised annotation string, for annotations seen so far. */
protected HashMap processedAnnots = new HashMap(150);
/** id of an "Unknown" annotation -> type it is to be re-created with. */
protected HashMap annots2Remove = new HashMap(75);
/** All match lists created in this run; stored as the document coref feature. */
protected List matchesDocFeature = new ArrayList();
/** annotation id -> offset-sorted list of the Token annotations it covers. */
protected HashMap tokensMap = new HashMap(150);

/** The pair of annotations currently being compared by the rules. */
protected Annotation shortAnnot, longAnnot;

/** Token lists of longAnnot / shortAnnot, read by the matchRule methods. */
protected ArrayList tokensLongAnnot, tokensShortAnnot;


/** Scratch feature map; not used in the code visible here. */
protected FeatureMap tempMap = Factory.newFeatureMap();


/** Not used in the code visible here. */
private final static int BUFF_SIZE = 65000;


/** URL of the definition file read by init(); list files resolve against it. */
private java.net.URL definitionFileURL;


/** Character encoding used to read the definition file and the list files. */
private String encoding;
137
138
139
140
/**
 * Creates a matcher whose default annotation types are the organization
 * type, the person type, "Location" and "Date", in that order.
 */
public OrthoMatcher () {
  String[] defaultTypes = { organizationType, personType, "Location", "Date" };
  for (int i = 0; i < defaultTypes.length; i++) {
    annotationTypes.add(defaultTypes[i]);
  }
}
147
148
149 public Resource init() throws ResourceInstantiationException {
150 if(definitionFileURL == null){
152 throw new ResourceInstantiationException(
153 "No URL provided for the definition file!");
154 }
155
156 try{
158 BufferedReader reader = new BufferedReader(
159 new InputStreamReader(definitionFileURL.openStream(),
160 encoding));
161 String lineRead = null;
162 while ((lineRead = reader.readLine()) != null){
163 int index = lineRead.indexOf(":");
164 if (index != -1){
165 String nameFile = lineRead.substring(0,index);
166 String nameList = lineRead.substring(index+1,lineRead.length());
167 createAnnotList(nameFile,nameList);
168 } } reader.close();
171 }catch(IOException ioe){
172 throw new ResourceInstantiationException(ioe);
173 }
174
175 return this;
176 }
178
182 public void execute() throws ExecutionException{
183
184 if(document == null) {
186 throw new ExecutionException(
187 "No document for namematch!"
188 );
189 }
190
191 if ((annotationSetName == null)|| (annotationSetName.equals("")))
193 nameAllAnnots = document.getAnnotations();
194 else
195 nameAllAnnots = document.getAnnotations(annotationSetName);
196
197 if ((nameAllAnnots == null) || nameAllAnnots.isEmpty()) {
199 Out.prln("OrthoMatcher Warning: No annotations found for processing");
200 return;
201 }
202
203 docCleanup();
206 Map matchesMap = (Map)document.getFeatures().
207 get(DOCUMENT_COREF_FEATURE_NAME);
208
209 if (!extLists)
212 buildTables(nameAllAnnots);
213
214 matchNameAnnotations();
216
217 if (matchingUnknowns)
219 matchUnknown();
220
221 if (! matchesDocFeature.isEmpty()) {
224 if(matchesMap == null){
225 matchesMap = new HashMap();
226 }
227 matchesMap.put(nameAllAnnots.getName(), matchesDocFeature);
228 document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, matchesMap);
231
232 matchesDocFeature = new ArrayList();
235 }
236
237 nameAllAnnots = null;
240 processedAnnots.clear();
241 annots2Remove.clear();
242 tokensMap.clear();
243 matchesDocFeature = new ArrayList();
244 longAnnot = null;
245 shortAnnot = null;
246 tokensLongAnnot = null;
247 tokensShortAnnot = null;
248
249 }
251 protected void matchNameAnnotations() throws ExecutionException{
252 Iterator iterAnnotationTypes = annotationTypes.iterator();
254 while (iterAnnotationTypes.hasNext()) {
255 String annotationType = (String)iterAnnotationTypes.next();
256
257 AnnotationSet nameAnnots = nameAllAnnots.get(annotationType);
258
259 if ((nameAnnots == null) || nameAnnots.isEmpty())
261 continue;
262
263 Iterator iterNames = nameAnnots.iterator();
264 while (iterNames.hasNext()) {
265 Annotation nameAnnot = (Annotation) iterNames.next();
266 Integer id = nameAnnot.getId();
267
268 String annotString = null;
270 try {
271 annotString = document.getContent().getContent(
272 nameAnnot.getStartNode().getOffset(),
273 nameAnnot.getEndNode().getOffset()
274 ).toString();
275 annotString = regularExpressions(annotString," ", "\\s+");
277
278 } catch (InvalidOffsetException ioe) {
279 throw new ExecutionException
280 ("Invalid offset of the annotation");
281 }
282 if (!caseSensitive)
284 annotString = annotString.toLowerCase();
285
286 List tokens = new ArrayList((Set)
288 nameAllAnnots.get(TOKEN_ANNOTATION_TYPE,
289 nameAnnot.getStartNode().getOffset(),
290 nameAnnot.getEndNode().getOffset()
291 ));
292 if (tokens.isEmpty())
294 continue;
295 Collections.sort(tokens, new gate.util.OffsetComparator());
296 tokensMap.put(nameAnnot.getId(), tokens);
302
303
305 if (processedAnnots.containsValue(annotString)) {
308 updateMatches(nameAnnot, annotString);
310 processedAnnots.put(nameAnnot.getId(), annotString);
311 continue;
312 } else if (processedAnnots.isEmpty()) {
313 processedAnnots.put(nameAnnot.getId(), annotString);
314 continue;
315 }
316
317 if (nameAnnot.getType().equals(personType))
319 annotString = containTitle(annotString, nameAnnot);
320 else if (nameAnnot.getType().equals(organizationType))
321 annotString = stripCDG(annotString, nameAnnot);
322
323 if(null == annotString || "".equals(annotString))
324 continue;
325
326 matchWithPrevious(nameAnnot, annotString);
328
329 processedAnnots.put(nameAnnot.getId(), annotString);
332 }
334 }
336 }
337
338 protected void matchUnknown() throws ExecutionException {
339 AnnotationSet unknownAnnots = nameAllAnnots.get(unknownType);
341
342 if ((unknownAnnots == null) || unknownAnnots.isEmpty())
343 return;
344
345 Iterator iter = unknownAnnots.iterator();
346 while (iter.hasNext()) {
348 Annotation unknown = (Annotation) iter.next();
349
350 String unknownString = null;
352 try {
353 unknownString = document.getContent().getContent(
354 unknown.getStartNode().getOffset(),
355 unknown.getEndNode().getOffset()
356 ).toString();
357 unknownString = regularExpressions(unknownString," ", "\\s+");
359 } catch (InvalidOffsetException ioe) {
360 throw new ExecutionException
361 ("Invalid offset of the annotation");
362 }
363 if (!caseSensitive)
365 unknownString = unknownString.toLowerCase();
366
367 List tokens = new ArrayList((Set)
369 nameAllAnnots.get(TOKEN_ANNOTATION_TYPE,
370 unknown.getStartNode().getOffset(),
371 unknown.getEndNode().getOffset()
372 ));
373 if (tokens.isEmpty())
374 continue;
375 Collections.sort(tokens, new gate.util.OffsetComparator());
376 tokensMap.put(unknown.getId(), tokens);
377
378
379 if (processedAnnots.containsValue(unknownString)) {
382 Annotation matchedAnnot = updateMatches(unknown, unknownString);
383 if (matchedAnnot.getType().equals(unknownType)) {
386 annots2Remove.put(unknown.getId(),
387 annots2Remove.get(matchedAnnot.getId()));
388 }
389 else
390 annots2Remove.put(unknown.getId(), matchedAnnot.getType());
391 processedAnnots.put(unknown.getId(), unknownString);
392 unknown.getFeatures().put("NMRule", unknownType);
393 continue;
394 }
395
396 if (tokens.size() == 1
399 && "hyphen".equals(unknown.getFeatures().get(TOKEN_KIND_FEATURE_NAME))) {
400 if (matchHyphenatedUnknowns(unknown, unknownString, iter))
401 continue;
402 }
404 matchWithPrevious(unknown, unknownString);
405
406 }
408 if (! annots2Remove.isEmpty()) {
409 Iterator unknownIter = annots2Remove.keySet().iterator();
410 while (unknownIter.hasNext()) {
411 Integer unknId = (Integer) unknownIter.next();
412 Annotation unknown = nameAllAnnots.get(unknId);
413 Integer newID = nameAllAnnots.add(
414 unknown.getStartNode(),
415 unknown.getEndNode(),
416 (String) annots2Remove.get(unknId),
417 unknown.getFeatures()
418 );
419 nameAllAnnots.remove(unknown);
420
421 List mList = (List)unknown.getFeatures().
423 get(ANNOTATION_COREF_FEATURE_NAME);
424 mList.remove(unknId);
425 mList.add(newID);
426 } } }
429
430 private boolean matchHyphenatedUnknowns(Annotation unknown, String unknownString,
431 Iterator iter){
432 boolean matched = false;
433
434 int stringEnd = unknownString.indexOf("-");
436 unknownString = unknownString.substring(0, stringEnd);
437 if (processedAnnots.containsValue(unknownString)) {
440 matched = true;
441 Annotation matchedAnnot = updateMatches(unknown, unknownString);
442 iter.remove();
445 String newType;
446 if (matchedAnnot.getType().equals(unknownType))
447 newType = (String)annots2Remove.get(matchedAnnot.getId());
448 else
449 newType = matchedAnnot.getType();
450
451 Integer newID = new Integer(-1);
452 try {
453 newID = nameAllAnnots.add(
454 unknown.getStartNode().getOffset(),
455 new Long(unknown.getStartNode().getOffset().longValue()
456 + stringEnd),
457 newType,
458 unknown.getFeatures()
459 );
460 } catch (InvalidOffsetException ex) {
461 throw new GateRuntimeException(ex.getMessage());
462 }
463 nameAllAnnots.remove(unknown);
464
465 List mList = (List)unknown.getFeatures().
467 get(ANNOTATION_COREF_FEATURE_NAME);
468 mList.remove(unknown.getId());
469 mList.add(newID);
470
471 }
472 return matched;
473 }
474
475 protected void matchWithPrevious(Annotation nameAnnot, String annotString) {
476 boolean matchedUnknown = false;
477
478 Iterator prevIter = processedAnnots.keySet().iterator();
479 while (prevIter.hasNext()) {
480 Integer prevId = (Integer) prevIter.next();
481 Annotation prevAnnot = nameAllAnnots.get(prevId);
482
483 if (prevAnnot == null || (! prevAnnot.getType().equals(nameAnnot.getType())
485 && ! nameAnnot.getType().equals(unknownType))
486 )
487 continue;
488 if ( nameAnnot.getType().equals(unknownType)
491 && prevAnnot.getType().equals(unknownType))
492 continue;
493
494 if (matchedAlready(nameAnnot, prevAnnot) )
496 continue;
497
498 if (prevAnnot.getType().equals(personType)) {
500 String prevGender =
501 (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
502 String nameGender =
503 (String) nameAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
504 if ( prevGender != null
505 && nameGender != null
506 && ( (nameGender.equalsIgnoreCase("female")
507 &&
508 prevGender.equalsIgnoreCase("male")
509 )
510 ||
511 (prevGender.equalsIgnoreCase("female")
512 && nameGender.equalsIgnoreCase("male")
513 )
514 )
515 ) continue;
518 }
520 if (matchAnnotations(nameAnnot, annotString, prevAnnot)) {
522 updateMatches(nameAnnot, prevAnnot);
524 if (nameAnnot.getType().equals(unknownType)) {
526 matchedUnknown = true;
527 if (prevAnnot.getType().equals(unknownType))
528 annots2Remove.put(nameAnnot.getId(),
529 annots2Remove.get(prevAnnot.getId()));
530 else
531 annots2Remove.put(nameAnnot.getId(), prevAnnot.getType());
532 nameAnnot.getFeatures().put("NMRule", unknownType);
534 } break; }
538 }
540 if (matchedUnknown)
541 processedAnnots.put(nameAnnot.getId(), annotString);
542
543
544 }
546 protected boolean matchAnnotations(Annotation newAnnot, String annotString,
547 Annotation prevAnnot) {
548 if (newAnnot.overlaps(prevAnnot))
550 return false;
551
552 String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId());
555
556 String longName = prevAnnotString;
557 String shortName = annotString;
558 longAnnot = prevAnnot;
559 shortAnnot = newAnnot;
560
561 if (shortName.length()>longName.length()) {
562 String temp = longName;
563 longName = shortName;
564 shortName = temp;
565 Annotation tempAnn = longAnnot;
566 longAnnot = shortAnnot;
567 shortAnnot = tempAnn;
568 }
570 tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId());
571 tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId());
572
573 List matchesList = (List) prevAnnot.getFeatures().
574 get(ANNOTATION_COREF_FEATURE_NAME);
575 if (matchesList == null || matchesList.isEmpty())
576 return apply_rules_namematch(prevAnnot.getType(), shortName,longName);
577
578 if (apply_rules_namematch(prevAnnot.getType(), shortName,longName)) {
582
587 if (allMatchingNeeded) {
588 allMatchingNeeded = false;
589
590 List toMatchList = new ArrayList(matchesList);
591 toMatchList.remove(prevAnnot.getId());
594
595 return matchOtherAnnots(toMatchList, newAnnot, annotString);
596 } else
597 return true;
598 }
599 return false;
600 }
601
602
609 protected boolean matchOtherAnnots( List toMatchList, Annotation newAnnot,
610 String annotString) {
611
612 if (toMatchList.isEmpty())
614 return true;
615
616 boolean matchedAll = true;
617 int i = 0;
618
619 while (matchedAll && i < toMatchList.size()) {
620 Annotation prevAnnot = nameAllAnnots.get((Integer) toMatchList.get(i));
621
622 String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId());
625 if (prevAnnotString == null)
626 try {
627 prevAnnotString = document.getContent().getContent(
628 prevAnnot.getStartNode().getOffset(),
629 prevAnnot.getEndNode().getOffset()
630 ).toString();
631 } catch (InvalidOffsetException ioe) {
632 return false;
633 }
635
636 String longName = prevAnnotString;
637 String shortName = annotString;
638 longAnnot = prevAnnot;
639 shortAnnot = newAnnot;
640
641 if (shortName.length()>=longName.length()) {
642 String temp = longName;
643 longName = shortName;
644 shortName = temp;
645 Annotation tempAnn = longAnnot;
646 longAnnot = shortAnnot;
647 shortAnnot = tempAnn;
648 }
650 tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId());
651 tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId());
652
653 matchedAll = apply_rules_namematch(prevAnnot.getType(), shortName,longName);
654
657 i++;
658 } return matchedAll;
660 }
661
662
663 protected boolean matchedAlready(Annotation annot1, Annotation annot2) {
664 List matchesList = (List) annot1.getFeatures().
667 get(ANNOTATION_COREF_FEATURE_NAME);
668 if ((matchesList == null) || matchesList.isEmpty())
669 return false;
670 else if (matchesList.contains(annot2.getId()))
671 return true;
672 return false;
673 }
674
675 protected Annotation updateMatches(Annotation newAnnot, String annotString) {
676 Annotation matchedAnnot = null;
677 Integer id;
678
679 Iterator iter = processedAnnots.keySet().iterator();
681 while (iter.hasNext()) {
682 id = (Integer) iter.next();
683 String oldString = (String) processedAnnots.get(id);
684 if (annotString.equals(oldString)) {
685 matchedAnnot = nameAllAnnots.get(id);
686 break;
687 } }
690 if (matchedAnnot == null) return null;
691 if (! matchedAnnot.getType().equals(newAnnot.getType())
694 && !newAnnot.getType().equals(unknownType) )
695 return matchedAnnot;
696
697 List matchesList = (List) matchedAnnot.getFeatures().
698 get(ANNOTATION_COREF_FEATURE_NAME);
699 if ((matchesList == null) || matchesList.isEmpty()) {
700 if (matchesList == null) {
702 matchesList = new ArrayList();
703 matchedAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME,
704 matchesList);
705 matchesDocFeature.add(matchesList);
706 } matchesList.add(matchedAnnot.getId());
708 matchesList.add(newAnnot.getId());
709 } else {
710 matchesList.add(newAnnot.getId());
712 } newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
715 return matchedAnnot;
716 }
717
718 protected void updateMatches(Annotation newAnnot, Annotation prevAnnot) {
719
720 List matchesList = (List) prevAnnot.getFeatures().
721 get(ANNOTATION_COREF_FEATURE_NAME);
722 if ((matchesList == null) || matchesList.isEmpty()) {
723 if (matchesList == null) {
725 matchesList = new ArrayList();
726 prevAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
727 matchesDocFeature.add(matchesList);
728 } matchesList.add(prevAnnot.getId());
730 matchesList.add(newAnnot.getId());
731 } else {
732 matchesList.add(newAnnot.getId());
734 } newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
737 if (prevAnnot.getType().equals(personType)) {
739 String prevGender =
740 (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
741 String newGender =
742 (String) newAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
743 boolean unknownPrevGender = isUnknownGender(prevGender);
744 boolean unknownNewGender = isUnknownGender(newGender);
745 if (unknownPrevGender && !unknownNewGender)
746 prevAnnot.getFeatures().put(PERSON_GENDER_FEATURE_NAME, newGender);
747 else if (unknownNewGender && !unknownPrevGender)
748 newAnnot.getFeatures().put(PERSON_GENDER_FEATURE_NAME, prevGender);
749 } }
751
752
753 protected void docCleanup() {
754 Object matchesValue = document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME);
755 if (matchesValue != null && (matchesValue instanceof Map))
756 ((Map)matchesValue).remove(nameAllAnnots.getName());
757 else if (matchesValue != null) {
758 document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, new HashMap());
759 }
760
761 HashSet fNames = new HashSet();
763 fNames.add(ANNOTATION_COREF_FEATURE_NAME);
764 AnnotationSet annots =
765 nameAllAnnots.get(null, fNames);
766
767
769 if (annots == null || annots.isEmpty())
770 return;
771
772 Iterator iter = annots.iterator();
773 while (iter.hasNext()) {
774 while (iter.hasNext())
775 ((Annotation) iter.next()).getFeatures().
776 remove(ANNOTATION_COREF_FEATURE_NAME);
777 } }
780
781 protected String containTitle (String annotString, Annotation annot)
782 throws ExecutionException {
783 Long startAnnot = annot.getStartNode().getOffset();
785 Long endAnnot = annot.getEndNode().getOffset();
786
787 queryFM.clear();
789 queryFM.put("majorType", "title");
790 AnnotationSet as1 = nameAllAnnots.get(startAnnot,endAnnot);
791 if (as1 == null || as1.isEmpty())
792 return annotString;
793 AnnotationSet as =
794 as1.get("Lookup", queryFM);
795 if (as !=null && ! as.isEmpty()) {
796 List titles = new ArrayList((Set)as);
797 Collections.sort(titles, new gate.util.OffsetComparator());
798
799 Iterator iter = titles.iterator();
800 while (iter.hasNext()) {
801 Annotation titleAnn = (Annotation)(iter.next());
802
803 if (titleAnn.getStartNode().getOffset().compareTo(startAnnot) != 0)
807 return annotString;
808
809 try {
810 String annotTitle =
812 document.getContent().getContent(
813 titleAnn.getStartNode().getOffset(),
814 titleAnn.getEndNode().getOffset()
815 ).toString();
816
817 if (annotTitle.length()<annotString.length()) {
819 ((ArrayList) tokensMap.get(annot.getId())).remove(0);
825 return annotString.substring(
826 annotTitle.length()+1,annotString.length());
827 }
828 } catch (InvalidOffsetException ioe) {
829 throw new ExecutionException
830 ("Invalid offset of the annotation");
831 } } } return annotString;
835
836 }
837
838
839 protected String stripCDG (String annotString, Annotation annot){
840
841 ArrayList tokens = (ArrayList) tokensMap.get(annot.getId());
842
843 if ( ((String) ((Annotation) tokens.get(0)
845 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME))
846 .equalsIgnoreCase(THE_VALUE))
847 tokens.remove(0);
848
849 if (tokens.size()>1 && cdg.contains(((Annotation) tokens.get(tokens.size()-1)
851 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME)) )
852 tokens.remove(tokens.size()-1);
853
854 StringBuffer newString = new StringBuffer(50);
855 for (int i = 0; i < tokens.size(); i++){
856 newString.append((String) ((Annotation) tokens.get(i)
857 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME) );
858 if (i != tokens.size()-1)
859 newString.append(" ");
860 }
861
863 if (caseSensitive)
864 return newString.toString();
865
866 return newString.toString().toLowerCase();
867 }
868
869
878
879
882
904
905 protected void createAnnotList(String nameFile,String nameList)
906 throws IOException{
907
908
914 URL fileURL = new URL(definitionFileURL, nameFile);
916 BufferedReader bufferedReader =
917 new BufferedReader(new InputStreamReader(fileURL.openStream(),
918 encoding));
919
920 String lineRead = null;
921 while ((lineRead = bufferedReader.readLine()) != null){
922 if (nameList.compareTo(CDGLISTNAME)==0){
923 if (caseSensitive)
924 cdg.add(lineRead);
925 else
926 cdg.add(lineRead.toLowerCase());
927 } else {
929 int index = lineRead.indexOf("£");
930 if (index != -1){
931 String expr = lineRead.substring(0,index);
932 if (!caseSensitive)
934 expr = expr.toLowerCase();
935 String code = lineRead.substring(index+1,lineRead.length());
936 if (nameList.equals(ALIASLISTNAME))
937 alias.put(expr, code);
938 else
939 if (nameList.equals(ARTLISTNAME))
940 def_art.put(expr, code);
941 else
942 if (nameList.equals(PREPLISTNAME))
943 prepos.put(expr, code);
944 else
945 if (nameList.equals(CONNECTORLISTNAME))
946 connector.put(expr, code);
947 else
948 if (nameList.equals(SPURLISTNAME))
949 spur_match.put(expr, code);
950
951 } }
954 } }
957
958
959 private boolean apply_rules_namematch(String annotationType, String shortName,
960 String longName) {
961 if (matchRule0(longName, shortName))
963 return false;
964 if (
965 ( matchRule2(longName, shortName)
969 ||
970 matchRule3(longName, shortName)
971 ) ||
973 ( ( annotationType.equals(organizationType)
975 || annotationType.equals("Facility"))
977 &&
978 ( matchRule4(longName, shortName)
979 ||
980 matchRule5(longName, shortName)
981 ||
982 matchRule6(longName, shortName)
983 ||
984 matchRule7(longName, shortName)
985 ||
986 matchRule9(longName, shortName)
989 ||
990 matchRule10(longName, shortName)
991 ||
992 matchRule11(longName, shortName)
993 ||
994 matchRule12(longName, shortName)
995 ||
996 matchRule13(shortName, longName)
997 )
998 ) ||
1000 ( ( annotationType.equals(personType))
1002 &&
1003 ( matchRule4(longName, shortName)
1004 ||
1005 matchRule5(longName, shortName)
1006 ||
1007 matchRule14(longName, shortName)
1008 || matchRule15(longName, shortName)
1011 )
1012 ) ) return true;
1015 return false;
1016 }
1018
1019
1020 public void setExtLists(Boolean newExtLists) {
1021 extLists = newExtLists.booleanValue();
1022 }
1024
1025 public void setCaseSensitive(Boolean newCase) {
1026 caseSensitive = newCase.booleanValue();
1027 }
1029
1030 public void setAnnotationSetName(String newAnnotationSetName) {
1031 annotationSetName = newAnnotationSetName;
1032 }
1034
1035 public void setAnnotationTypes(List newType) {
1036 annotationTypes = newType;
1037 }
1039
1040 public void setProcessUnknown(Boolean processOrNot) {
1041 this.matchingUnknowns = processOrNot.booleanValue();
1042 }
1044 public void setOrganizationType(String newOrganizationType) {
1045 organizationType = newOrganizationType;
1046 }
1048 public void setPersonType(String newPersonType) {
1049 personType = newPersonType;
1050 }
1052
1053 public String getAnnotationSetName() {
1054 return annotationSetName;
1055 }
1057
1058 public List getAnnotationTypes() {
1059 return annotationTypes;
1060 }
1062 public String getOrganizationType() {
1063 return organizationType;
1064 }
1065
1066 public String getPersonType() {
1067 return personType;
1068 }
1069
1070 public Boolean getExtLists() {
1071 return new Boolean(extLists);
1072 }
1073
1074
1075 public Boolean getCaseSensitive() {
1076 return new Boolean(caseSensitive);
1077 }
1078
1079
1080 public Boolean getProcessUnknown() {
1081 return new Boolean(matchingUnknowns);
1082 }
1083
1084
1089
1090 protected boolean isUnknownGender(String gender) {
1091 if (gender == null)
1092 return true;
1093 if (gender.equalsIgnoreCase("male") || gender.equalsIgnoreCase("female"))
1094 return false;
1095 return true;
1096
1097 }
1099
1104 public boolean matchRule0(String s1,
1105 String s2) {
1106 if (spur_match.containsKey(s1)
1107 && spur_match.containsKey(s2) )
1108 return
1109 spur_match.get(s1).toString().equals(spur_match.get(s2).toString());
1110
1111 return false;
1112 }
1114
1120 public boolean matchRule1(String s1,
1121 String s2,
1122 boolean matchCase) {
1123
1125 boolean matched = false;
1126 if (!matchCase)
1127 matched = s1.equalsIgnoreCase(s2);
1128 else matched = s1.equals(s2) ;
1129 return matched;
1133 }
1135
1136
1142 public boolean matchRule2(String s1,
1143 String s2) {
1144
1145 if (alias.containsKey(s1) && alias.containsKey(s2))
1146 return (alias.get(s1).toString().equals(alias.get(s2).toString()));
1147
1148 return false;
1149 }
1151
1159 public boolean matchRule3(String s1, String s2) {
1162 if (s2.endsWith("'s") || s2.endsWith("'")
1163 ||(s1.endsWith("'s")|| s1.endsWith("'"))) {
1164
1165
1166 String s2_poss = null;
1167
1168 if (!s2.endsWith("'s")) s2_poss = s2.concat("'s");
1169 else s2_poss = s2.concat("'");
1170
1171 if (s2_poss != null && matchRule1(s1, s2_poss,caseSensitive)) return true;
1172
1173 String token = (String)
1175 ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1176
1177 if (!token.endsWith("'s")) s2_poss = token.concat("'s");
1178 else s2_poss = token.concat("'");
1179
1180 if (s2_poss != null && matchRule1(s2_poss,s2,caseSensitive)) return true;
1181
1182 } return false;
1184 }
1186
1193 public boolean matchRule4(String s1,
1194 String s2) {
1195
1196 boolean allTokensMatch = true;
1197
1198 Iterator tokensLongAnnotIter = tokensLongAnnot.iterator();
1199 Iterator tokensShortAnnotIter = tokensShortAnnot.iterator();
1200 while (tokensLongAnnotIter.hasNext() && tokensShortAnnotIter.hasNext()) {
1201 Annotation token = (Annotation) tokensLongAnnotIter.next();
1202 if (((String)token.getFeatures().get(TOKEN_KIND_FEATURE_NAME)).equals(PUNCTUATION_VALUE))
1203 continue;
1204 if (! token.getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals(
1206 ((Annotation) tokensShortAnnotIter.next()).getFeatures().get(TOKEN_STRING_FEATURE_NAME))) {
1207 allTokensMatch = false;
1208 break;
1209 } } return allTokensMatch;
1214 }
1216
1223 public boolean matchRule5(String s1,
1224 String s2) {
1225
1226 if (tokensLongAnnot.size()> 1 &&
1228 ((Annotation) tokensLongAnnot.get(0)).getFeatures().get("kind").equals("number"))
1229 return false;
1230
1231
1235 if ( (shortAnnot.getType().equals(personType)
1239 || longAnnot.getType().equals(personType)
1240 )
1241 &&
1242 tokensShortAnnot.size()>1
1243 )
1244 return false;
1245
1246 if (tokensLongAnnot.size()<=1)
1247 return false;
1248 boolean result = matchRule1((String)
1249 ((Annotation) tokensLongAnnot.get(0)
1250 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME),
1251 s2,
1252 caseSensitive);
1253
1254 return result;
1257
1258 }
1260
1265 public boolean matchRule6(String s1,
1266 String s2) {
1267
1268 int i = 0;
1269
1270 if (s2.indexOf(" ") > 0)
1273 return false;
1274
1275 StringBuffer acronym_s1 = new StringBuffer("");
1277 StringBuffer acronymDot_s1 = new StringBuffer("");
1278
1279 for ( ;i < tokensLongAnnot.size(); i++ ) {
1280 String toAppend = ( (String) ((Annotation) tokensLongAnnot.get(i)
1281 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME)).substring(0,1);
1282 acronym_s1.append(toAppend);
1283 acronymDot_s1.append(toAppend);
1284 acronymDot_s1.append(".");
1285 }
1286
1287
1290 if (matchRule1(acronym_s1.toString(),s2,caseSensitive) ||
1291 matchRule1(acronymDot_s1.toString(),s2,caseSensitive) )
1292 return true;
1293
1294 return false;
1295 }
1297
1306 public boolean matchRule7(String s1,
1307 String s2) {
1308
1309 if (tokensShortAnnot.size() != 1)
1311 return false;
1312
1313 String previous_token = null;
1314
1315 for (int i = 0; i < tokensLongAnnot.size(); i++ ) {
1316 if (connector.containsKey( ((Annotation) tokensLongAnnot.get(i)
1317 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME) )) {
1318 previous_token = (String) ((Annotation) tokensLongAnnot.get(i-1)
1319 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1320
1321 break;
1322 }
1323 }
1324
1325 if (previous_token != null) {
1327 return matchRule1(previous_token,s2,caseSensitive);
1330
1331 }
1332 return false;
1333 }
1335
1346 public boolean matchRule8(String s1,
1347 String s2) {
1348 Out.prln("OrthoMatcher warning: This rule has been discontinued!");
1349
1390 return false;
1391
1392 }
1394
1403 public boolean matchRule9(String s1,
1404 String s2) {
1405
1406 String s1_short = (String)
1409 ((Annotation) tokensLongAnnot.get(
1410 tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1411 if (tokensLongAnnot.size()>1) {
1413 boolean matched = matchRule1(s1_short, s2, caseSensitive);
1414 if (matched)
1419 allMatchingNeeded = true;
1420 return matched;
1421 }
1423 return false;
1424 }
1426
1433 public boolean matchRule10(String s1,
1434 String s2) {
1435
1436 String token = null;
1437 String previous_token = null;
1438 String next_token = null;
1439 boolean invoke_rule=false;
1440
1441 if (tokensLongAnnot.size() >= 3
1442 && tokensShortAnnot.size() >= 2) {
1443
1444 int i = 0;
1446 for (; i< tokensLongAnnot.size(); i++) {
1447 token = (String)
1448 ((Annotation) tokensLongAnnot.get(i)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1449 if (prepos.containsKey(token)) {
1450 invoke_rule=true;
1451 break;
1452 } previous_token = token;
1454 }
1456 if (! invoke_rule)
1457 return false;
1458
1459 if (i < tokensLongAnnot.size()
1460 && previous_token != null)
1461 next_token= (String)
1462 ((Annotation) tokensLongAnnot.get(i++)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1463 else return false;
1464
1465 String s21 = (String)
1466 ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1467 String s22 = (String)
1468 ((Annotation) tokensShortAnnot.get(1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1469 if (matchRule1(next_token,(String) s21,caseSensitive)
1471 && matchRule1(previous_token, s22,caseSensitive))
1472 return true ;
1473 } return false;
1475 }
1477
1485 public boolean matchRule11(String s1,
1486 String s2) {
1487
1488
1489
1491 String token11 = null;
1492 String token12 = null;
1493 String token21 = null;
1494 String token22 = null;
1495
1496 if (tokensLongAnnot.size() < 2)
1497 return false;
1498
1499 token11 = (String)
1501 ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1502 token12 = (String)
1503 ((Annotation) tokensLongAnnot.get(1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1504
1505 if (tokensShortAnnot.size() == 2) {
1507
1508 token21 = (String)
1509 ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1510 token22 = (String)
1511 ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1512
1513 if (token11.startsWith(token21)
1514 && token12.startsWith(token22))
1515 return true;
1516
1517 }
1519 else if (tokensShortAnnot.size()==1 && s2.length()>=3) {
1521
1522 for (int i=2;i<s2.length();i++) {
1525 token21=s2.substring(0,i+1);
1526 token22=s2.substring(i+1);
1527
1528 if (token11.startsWith(token21)
1529 && token12.startsWith(token22))
1530 return true;
1531 } }
1534 return false;
1535 }
1537
1543 public boolean matchRule12(String s1,
1544 String s2) {
1545
1546
1548 if (tokensLongAnnot.size()>1 && tokensShortAnnot.size()>1) {
1549
1551 String s1_first = (String)
1553 ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1554 String s2_first = (String)
1555 ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1556
1557 if (!matchRule1(s1_first,s2_first,caseSensitive))
1558 return false;
1559
1560 String s1_last = (String)
1561 ((Annotation) tokensLongAnnot.get(tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1562 String s2_last = (String)
1563 ((Annotation) tokensShortAnnot.get(tokensShortAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1564
1565 return matchRule1(s1_last,s2_last,caseSensitive);
1566 } return false;
1568 }
1570
1583 public boolean matchRule13(String s1,
1584 String s2) {
1585
1586
1587 String token1 = null;
1588 String token2 = null;
1589
1590 int matched_tokens = 0, mismatches = 0;;
1591
1592 if (tokensLongAnnot.size() < 3 || tokensShortAnnot.size() < 2) return false;
1594
1595
1600 for (int i=0,j= 0; i < tokensShortAnnot.size() && mismatches < 2; i++) {
1602
1603 if ( ((Annotation) tokensLongAnnot.get(j)).getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals(
1606 ((Annotation) tokensShortAnnot.get(i)).getFeatures().get(TOKEN_STRING_FEATURE_NAME)) ) {
1607 matched_tokens++;
1608 j++;
1609 } else
1610 mismatches++;
1611 }
1613 if (matched_tokens >= tokensLongAnnot.size()-1)
1614 return true;
1615
1616 return false;
1617 }
1619
1626 public boolean matchRule14(String s1,
1627 String s2) {
1628
1629 String s1_short = (String)
1632 ((Annotation) tokensLongAnnot.get(
1633 tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1634 if (tokensLongAnnot.size()>1)
1636 return matchRule1(s1_short,
1637 s2,
1638 caseSensitive);
1639
1640 return false;
1641
1642 }
1644
1649 public boolean matchRule15(String s1,
1650 String s2) {
1651
1652 int matched_tokens = 0;
1653
1654
1656
1661 Annotation token1, token2;
1663 for (int i=0; i < tokensShortAnnot.size() && matched_tokens == 0; i++) {
1664 token1 = (Annotation) tokensShortAnnot.get(i);
1665 if (token1.getFeatures().get(TOKEN_KIND_FEATURE_NAME).equals(PUNCTUATION_VALUE))
1667 continue;
1668
1669 for (int j=0; j<tokensLongAnnot.size() && matched_tokens ==0; j++) {
1670 token2 = (Annotation) tokensLongAnnot.get(j);
1672 if (token2.getFeatures().get(TOKEN_KIND_FEATURE_NAME).equals(PUNCTUATION_VALUE))
1673 continue;
1674 if ( token1.getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals(
1675 token2.getFeatures().get(TOKEN_STRING_FEATURE_NAME)) )
1676 matched_tokens++;
1677 } }
1680 if (matched_tokens == tokensShortAnnot.size())
1686 return true;
1687
1688 return false;
1689 }
1691
1692
1695 private void buildTables(AnnotationSet nameAllAnnots) {
1696
1697 cdg.clear();
1699
1700 if (! extLists) {
1701 tempMap.clear();
1704 tempMap.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, "cdg");
1705 AnnotationSet nameAnnots =
1707 nameAllAnnots.get(LOOKUP_ANNOTATION_TYPE, tempMap);
1708
1709 if ((nameAnnots ==null) || nameAnnots.isEmpty())
1710 return;
1711
1712 Iterator iter = nameAnnots.iterator();
1713 while (iter.hasNext()) {
1714 Annotation annot = (Annotation)iter.next();
1715 Long offsetStartAnnot = annot.getStartNode().getOffset();
1717 Long offsetEndAnnot = annot.getEndNode().getOffset();
1718 try {
1719 gate.Document doc = nameAllAnnots.getDocument();
1720 String annotString =
1721 doc.getContent().getContent(
1722 offsetStartAnnot,offsetEndAnnot
1723 ).toString();
1724 cdg.add(annotString);
1725 } catch (InvalidOffsetException ioe) {
1726 ioe.printStackTrace(Err.getPrintWriter());
1727 }
1728 } } }
1732
1735 public String regularExpressions ( String text, String replacement,
1736 String regEx) {
1737 String result = text;
1738 try {
1739 RE re = new RE(regEx);
1740 result = re.substituteAll( text,replacement);
1741 } catch (REException ree) {ree.printStackTrace();}
1742 return result;
1743 }
1744
1745 public void setDefinitionFileURL(java.net.URL definitionFileURL) {
1746 this.definitionFileURL = definitionFileURL;
1747 }
1748
1749 public java.net.URL getDefinitionFileURL() {
1750 return definitionFileURL;
1751 }
1752 public void setEncoding(String encoding) {
1753 this.encoding = encoding;
1754 }
1755 public String getEncoding() {
1756 return encoding;
1757 }
1759
1760 private static class Class1 {
1761 }
1762}
1764