|
OrthoMatcher |
|
1 /* 2 * OrthoMatcher.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Kalina Bontcheva, 24/August/2001 12 * 13 * $Id: OrthoMatcher.java,v 1.45 2002/10/30 14:18:46 valyt Exp $ 14 */ 15 16 17 package gate.creole.orthomatcher; 18 19 import gate.*; 20 import gate.util.*; 21 import gate.creole.*; 22 import gate.corpora.*; 23 import gate.annotation.*; 24 import java.util.*; 25 import java.io.*; 26 import java.net.*; 27 import gnu.regexp.*; 28 29 public class OrthoMatcher extends AbstractLanguageAnalyser 30 implements ANNIEConstants{ 31 32 public static final String 33 OM_DOCUMENT_PARAMETER_NAME = "document"; 34 35 public static final String 36 OM_ANN_SET_PARAMETER_NAME = "annotationSetName"; 37 38 public static final String 39 OM_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive"; 40 41 public static final String 42 OM_ANN_TYPES_PARAMETER_NAME = "annotationTypes"; 43 44 public static final String 45 OM_ORG_TYPE_PARAMETER_NAME = "organizationType"; 46 47 public static final String 48 OM_PERSON_TYPE_PARAMETER_NAME = "personType"; 49 50 public static final String 51 OM_EXT_LISTS_PARAMETER_NAME = "extLists"; 52 53 protected static final String CDGLISTNAME = "cdg"; 54 protected static final String ALIASLISTNAME = "alias"; 55 protected static final String ARTLISTNAME = "def_art"; 56 protected static final String PREPLISTNAME = "prepos"; 57 protected static final String CONNECTORLISTNAME = "connector"; 58 protected static final String SPURLISTNAME = "spur_match"; 59 60 protected static final String PUNCTUATION_VALUE = "punctuation"; 61 protected static final String THE_VALUE = "The"; 62 63 64 /**the name of the annotation set*/ 65 protected String annotationSetName; 66 67 /** the types of the annotation */ 68 protected List annotationTypes = new ArrayList(10); 69 70 /** the organization type*/ 71 protected String organizationType = ORGANIZATION_ANNOTATION_TYPE; 72 73 /** the person type*/ 74 protected String personType = PERSON_ANNOTATION_TYPE; 75 76 protected String unknownType = "Unknown"; 77 78 /** internal or external list */ 79 protected boolean extLists = true; 80 81 /** matching unknowns or not*/ 82 protected boolean matchingUnknowns = true; 83 84 /** This is an internal variable to indicate whether 85 * we matched using a rule that requires that 86 * the newly matched annotation matches all the others 87 * This is needed, because organizations can share 88 * first/last tokens like News and be different 89 */ 90 private boolean allMatchingNeeded = false; 91 92 //** Orthomatching is not case-sensitive by default*/ 93 protected boolean caseSensitive = false; 94 95 protected FeatureMap queryFM = Factory.newFeatureMap(); 96 97 // protected ExecutionException executionException; 98 99 // name lookup tables (used for namematch) 100 //gave them bigger default size, coz rehash is expensive 101 protected HashMap alias = new HashMap(100); 102 protected HashSet cdg = new HashSet(50); 103 protected HashMap spur_match = new HashMap(100); 104 protected HashMap def_art = new HashMap(20); 105 protected HashMap connector = new HashMap(20); 106 protected HashMap prepos = new HashMap(30); 107 108 109 protected AnnotationSet nameAllAnnots = null; 110 protected HashMap processedAnnots = new HashMap(150); 111 protected HashMap annots2Remove = new HashMap(75); 112 protected List matchesDocFeature = new ArrayList(); 113 //maps annotation ids to array lists of tokens 114 protected HashMap tokensMap = new HashMap(150); 115 116 protected Annotation shortAnnot, longAnnot; 117 118 protected ArrayList tokensLongAnnot, tokensShortAnnot; 119 120 /** a feature map to be used when retrieving annotations 121 * declared here so can be reused for efficiency 122 * clear() before each use 123 */ 124 protected FeatureMap tempMap = Factory.newFeatureMap(); 125 126 /** the size of the buffer */ 127 private final static int BUFF_SIZE = 65000; 128 129 /** 130 * URL to the file containing the definition for this orthomatcher 131 */ 132 private java.net.URL definitionFileURL; 133 134 /** The encoding used for the definition file and associated lists.*/ 135 private String encoding; 136 137 /** @link dependency */ 138 /*#OrthoMatcher lnkOrthoMatcher;*/ 139 140 public OrthoMatcher () { 141 annotationTypes.add(organizationType); 142 annotationTypes.add(personType); 143 annotationTypes.add("Location"); 144 annotationTypes.add("Date"); 145 } 146 147 /** Initialise this resource, and return it. */ 148 public Resource init() throws ResourceInstantiationException { 149 //initialise the list of annotations which we will match 150 if(definitionFileURL == null){ 151 throw new ResourceInstantiationException( 152 "No URL provided for the definition file!"); 153 } 154 155 //at this point we have the definition file 156 try{ 157 BufferedReader reader = new BufferedReader( 158 new InputStreamReader(definitionFileURL.openStream(), 159 encoding)); 160 String lineRead = null; 161 while ((lineRead = reader.readLine()) != null){ 162 int index = lineRead.indexOf(":"); 163 if (index != -1){ 164 String nameFile = lineRead.substring(0,index); 165 String nameList = lineRead.substring(index+1,lineRead.length()); 166 createAnnotList(nameFile,nameList); 167 }// if 168 }//while 169 reader.close(); 170 }catch(IOException ioe){ 171 throw new ResourceInstantiationException(ioe); 172 } 173 174 return this; 175 } // init() 176 177 /** Run the resource. It doesn't make sense not to override 178 * this in subclasses so the default implementation signals an 179 * exception. 180 */ 181 public void execute() throws ExecutionException{ 182 183 //check the input 184 if(document == null) { 185 throw new ExecutionException( 186 "No document for namematch!" 187 ); 188 } 189 190 // get the annotations from document 191 if ((annotationSetName == null)|| (annotationSetName.equals(""))) 192 nameAllAnnots = document.getAnnotations(); 193 else 194 nameAllAnnots = document.getAnnotations(annotationSetName); 195 196 //if none found, print warning and exit 197 if ((nameAllAnnots == null) || nameAllAnnots.isEmpty()) { 198 Out.prln("OrthoMatcher Warning: No annotations found for processing"); 199 return; 200 } 201 202 //check if we've been run on this document before 203 //and clean the doc if needed 204 docCleanup(); 205 Map matchesMap = (Map)document.getFeatures(). 206 get(DOCUMENT_COREF_FEATURE_NAME); 207 208 // creates the cdg list from the document 209 //no need to create otherwise, coz already done in init() 210 if (!extLists) 211 buildTables(nameAllAnnots); 212 213 //first match all name annotations 214 matchNameAnnotations(); 215 216 //then match the unknown ones to all name ones 217 if (matchingUnknowns) 218 matchUnknown(); 219 220 // set the matches of the document 221 // determineMatchesDocument(); 222 if (! matchesDocFeature.isEmpty()) { 223 if(matchesMap == null){ 224 matchesMap = new HashMap(); 225 } 226 matchesMap.put(nameAllAnnots.getName(), matchesDocFeature); 227 //we need to put it even if it was already present in order to triger 228 //the update events 229 document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, matchesMap); 230 231 //cannot do clear() as this has already been put on the document 232 //so I need a new one for the next run of matcher 233 matchesDocFeature = new ArrayList(); 234 } 235 236 // Out.prln("Processed strings" + processedAnnots.values()); 237 //clean-up the internal data structures for next run 238 nameAllAnnots = null; 239 processedAnnots.clear(); 240 annots2Remove.clear(); 241 tokensMap.clear(); 242 matchesDocFeature = new ArrayList(); 243 longAnnot = null; 244 shortAnnot = null; 245 tokensLongAnnot = null; 246 tokensShortAnnot = null; 247 248 } // run() 249 250 protected void matchNameAnnotations() throws ExecutionException{ 251 // go through all the annotation types 252 Iterator iterAnnotationTypes = annotationTypes.iterator(); 253 while (iterAnnotationTypes.hasNext()) { 254 String annotationType = (String)iterAnnotationTypes.next(); 255 256 AnnotationSet nameAnnots = nameAllAnnots.get(annotationType); 257 258 // continue if no such annotations exist 259 if ((nameAnnots == null) || nameAnnots.isEmpty()) 260 continue; 261 262 Iterator iterNames = nameAnnots.iterator(); 263 while (iterNames.hasNext()) { 264 Annotation nameAnnot = (Annotation) iterNames.next(); 265 Integer id = nameAnnot.getId(); 266 267 // get string and value 268 String annotString = null; 269 try { 270 annotString = document.getContent().getContent( 271 nameAnnot.getStartNode().getOffset(), 272 nameAnnot.getEndNode().getOffset() 273 ).toString(); 274 // now do the reg. exp. substitutions 275 annotString = regularExpressions(annotString," ", "\\s+"); 276 277 } catch (InvalidOffsetException ioe) { 278 throw new ExecutionException 279 ("Invalid offset of the annotation"); 280 } 281 //convert to lower case if we are not doing a case sensitive match 282 if (!caseSensitive) 283 annotString = annotString.toLowerCase(); 284 285 //get the tokens 286 List tokens = new ArrayList((Set) 287 nameAllAnnots.get(TOKEN_ANNOTATION_TYPE, 288 nameAnnot.getStartNode().getOffset(), 289 nameAnnot.getEndNode().getOffset() 290 )); 291 //if no tokens to match, do nothing 292 if (tokens.isEmpty()) 293 continue; 294 Collections.sort(tokens, new gate.util.OffsetComparator()); 295 //check if these actually do not end after the name 296 //needed coz new tokeniser conflates 297 //strings with dashes. So British Gas-style is two tokens 298 //instead of three. So cannot match properly British Gas 299 // tokens = checkTokens(tokens); 300 tokensMap.put(nameAnnot.getId(), tokens); 301 302 // Out.prln("Matching annot " + nameAnnot + ": string " + annotString); 303 304 //first check whether we have not matched such a string already 305 //if so, just consider it matched, don't bother calling the rules 306 if (processedAnnots.containsValue(annotString)) { 307 // Out.prln("Contained string found " + annotString); 308 updateMatches(nameAnnot, annotString); 309 processedAnnots.put(nameAnnot.getId(), annotString); 310 continue; 311 } else if (processedAnnots.isEmpty()) { 312 processedAnnots.put(nameAnnot.getId(), annotString); 313 continue; 314 } 315 316 //if a person, then remove their title before matching 317 if (nameAnnot.getType().equals(personType)) 318 annotString = containTitle(annotString, nameAnnot); 319 else if (nameAnnot.getType().equals(organizationType)) 320 annotString = stripCDG(annotString, nameAnnot); 321 322 if(null == annotString || "".equals(annotString)) 323 continue; 324 325 //otherwise try matching with previous annotations 326 matchWithPrevious(nameAnnot, annotString); 327 328 // Out.prln("Putting in previous " + nameAnnot + ": string " + annotString); 329 //finally add the current annotations to the processed map 330 processedAnnots.put(nameAnnot.getId(), annotString); 331 }//while through name annotations 332 333 }//while through annotation types 334 335 } 336 337 protected void matchUnknown() throws ExecutionException { 338 //get all Unknown annotations 339 AnnotationSet unknownAnnots = nameAllAnnots.get(unknownType); 340 341 if ((unknownAnnots == null) || unknownAnnots.isEmpty()) 342 return; 343 344 Iterator iter = unknownAnnots.iterator(); 345 //loop through the unknown annots 346 while (iter.hasNext()) { 347 Annotation unknown = (Annotation) iter.next(); 348 349 // get string and value 350 String unknownString = null; 351 try { 352 unknownString = document.getContent().getContent( 353 unknown.getStartNode().getOffset(), 354 unknown.getEndNode().getOffset() 355 ).toString(); 356 // now do the reg. exp. substitutions 357 unknownString = regularExpressions(unknownString," ", "\\s+"); 358 } catch (InvalidOffsetException ioe) { 359 throw new ExecutionException 360 ("Invalid offset of the annotation"); 361 } 362 //convert to lower case if we are not doing a case sensitive match 363 if (!caseSensitive) 364 unknownString = unknownString.toLowerCase(); 365 366 //get the tokens 367 List tokens = new ArrayList((Set) 368 nameAllAnnots.get(TOKEN_ANNOTATION_TYPE, 369 unknown.getStartNode().getOffset(), 370 unknown.getEndNode().getOffset() 371 )); 372 if (tokens.isEmpty()) 373 continue; 374 Collections.sort(tokens, new gate.util.OffsetComparator()); 375 tokensMap.put(unknown.getId(), tokens); 376 377 378 //first check whether we have not matched such a string already 379 //if so, just consider it matched, don't bother calling the rules 380 if (processedAnnots.containsValue(unknownString)) { 381 Annotation matchedAnnot = updateMatches(unknown, unknownString); 382 // Out.prln("Matched " + unknown + "with string " + unknownString); 383 // Out.prln("That's same as " + matchedAnnot); 384 if (matchedAnnot.getType().equals(unknownType)) { 385 annots2Remove.put(unknown.getId(), 386 annots2Remove.get(matchedAnnot.getId())); 387 } 388 else 389 annots2Remove.put(unknown.getId(), matchedAnnot.getType()); 390 processedAnnots.put(unknown.getId(), unknownString); 391 unknown.getFeatures().put("NMRule", unknownType); 392 continue; 393 } 394 395 //check if we should do sub-string matching in case it's hyphenated 396 //for example US-led 397 if (tokens.size() == 1 398 && "hyphen".equals(unknown.getFeatures().get(TOKEN_KIND_FEATURE_NAME))) { 399 if (matchHyphenatedUnknowns(unknown, unknownString, iter)) 400 continue; 401 }//if 402 403 matchWithPrevious(unknown, unknownString); 404 405 } //while though unknowns 406 407 if (! annots2Remove.isEmpty()) { 408 Iterator unknownIter = annots2Remove.keySet().iterator(); 409 while (unknownIter.hasNext()) { 410 Integer unknId = (Integer) unknownIter.next(); 411 Annotation unknown = nameAllAnnots.get(unknId); 412 Integer newID = nameAllAnnots.add( 413 unknown.getStartNode(), 414 unknown.getEndNode(), 415 (String) annots2Remove.get(unknId), 416 unknown.getFeatures() 417 ); 418 nameAllAnnots.remove(unknown); 419 420 //change the id in the matches list 421 List mList = (List)unknown.getFeatures(). 422 get(ANNOTATION_COREF_FEATURE_NAME); 423 mList.remove(unknId); 424 mList.add(newID); 425 }//while 426 }//if 427 } 428 429 private boolean matchHyphenatedUnknowns(Annotation unknown, String unknownString, 430 Iterator iter){ 431 boolean matched = false; 432 433 //only take the substring before the hyphen 434 int stringEnd = unknownString.indexOf("-"); 435 unknownString = unknownString.substring(0, stringEnd); 436 //check if we've already matched this string 437 //because only exact match of the substring are considered 438 if (processedAnnots.containsValue(unknownString)) { 439 matched = true; 440 Annotation matchedAnnot = updateMatches(unknown, unknownString); 441 //only do the matching if not a person, because we do not match 442 //those on sub-strings 443 iter.remove(); 444 String newType; 445 if (matchedAnnot.getType().equals(unknownType)) 446 newType = (String)annots2Remove.get(matchedAnnot.getId()); 447 else 448 newType = matchedAnnot.getType(); 449 450 Integer newID = new Integer(-1); 451 try { 452 newID = nameAllAnnots.add( 453 unknown.getStartNode().getOffset(), 454 new Long(unknown.getStartNode().getOffset().longValue() 455 + stringEnd), 456 newType, 457 unknown.getFeatures() 458 ); 459 } catch (InvalidOffsetException ex) { 460 throw new GateRuntimeException(ex.getMessage()); 461 } 462 nameAllAnnots.remove(unknown); 463 464 //change the id in the matches list 465 List mList = (List)unknown.getFeatures(). 466 get(ANNOTATION_COREF_FEATURE_NAME); 467 mList.remove(unknown.getId()); 468 mList.add(newID); 469 470 } 471 return matched; 472 } 473 474 protected void matchWithPrevious(Annotation nameAnnot, String annotString) { 475 boolean matchedUnknown = false; 476 477 Iterator prevIter = processedAnnots.keySet().iterator(); 478 while (prevIter.hasNext()) { 479 Integer prevId = (Integer) prevIter.next(); 480 Annotation prevAnnot = nameAllAnnots.get(prevId); 481 482 //check if the two are from the same type or the new one is unknown 483 if (prevAnnot == null || (! prevAnnot.getType().equals(nameAnnot.getType()) 484 && ! nameAnnot.getType().equals(unknownType)) 485 ) 486 continue; 487 //do not compare two unknown annotations either 488 //they are only matched to those of known types 489 if ( nameAnnot.getType().equals(unknownType) 490 && prevAnnot.getType().equals(unknownType)) 491 continue; 492 493 //check if we have already matched this annotation to the new one 494 if (matchedAlready(nameAnnot, prevAnnot) ) 495 continue; 496 497 //now changed to a rule, here we just match by gender 498 if (prevAnnot.getType().equals(personType)) { 499 String prevGender = 500 (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME); 501 String nameGender = 502 (String) nameAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME); 503 if ( prevGender != null 504 && nameGender != null 505 && ( (nameGender.equalsIgnoreCase("female") 506 && 507 prevGender.equalsIgnoreCase("male") 508 ) 509 || 510 (prevGender.equalsIgnoreCase("female") 511 && nameGender.equalsIgnoreCase("male") 512 ) 513 ) 514 ) //if condition 515 continue; //we don't have a match if the two genders are different 516 517 }//if 518 519 //if the two annotations match 520 if (matchAnnotations(nameAnnot, annotString, prevAnnot)) { 521 // Out.prln("Matched " + shortName + "and " + longName); 522 updateMatches(nameAnnot, prevAnnot); 523 //if unknown annotation, we need to change to the new type 524 if (nameAnnot.getType().equals(unknownType)) { 525 matchedUnknown = true; 526 if (prevAnnot.getType().equals(unknownType)) 527 annots2Remove.put(nameAnnot.getId(), 528 annots2Remove.get(prevAnnot.getId())); 529 else 530 annots2Remove.put(nameAnnot.getId(), prevAnnot.getType()); 531 //also put an attribute to indicate that 532 nameAnnot.getFeatures().put("NMRule", unknownType); 533 }//if unknown 534 break; //no need to match further 535 }//if annotations matched 536 537 }//while through previous annotations 538 539 if (matchedUnknown) 540 processedAnnots.put(nameAnnot.getId(), annotString); 541 542 543 }//matchWithPrevious 544 545 protected boolean matchAnnotations(Annotation newAnnot, String annotString, 546 Annotation prevAnnot) { 547 //do not match two annotations that overlap 548 if (newAnnot.overlaps(prevAnnot)) 549 return false; 550 551 // find which annotation string of the two is longer 552 // this is useful for some of the matching rules 553 String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId()); 554 555 String longName = prevAnnotString; 556 String shortName = annotString; 557 longAnnot = prevAnnot; 558 shortAnnot = newAnnot; 559 560 if (shortName.length()>longName.length()) { 561 String temp = longName; 562 longName = shortName; 563 shortName = temp; 564 Annotation tempAnn = longAnnot; 565 longAnnot = shortAnnot; 566 shortAnnot = tempAnn; 567 }//if 568 569 tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId()); 570 tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId()); 571 572 List matchesList = (List) prevAnnot.getFeatures(). 573 get(ANNOTATION_COREF_FEATURE_NAME); 574 if (matchesList == null || matchesList.isEmpty()) 575 return apply_rules_namematch(prevAnnot.getType(), shortName,longName); 576 577 //if these two match, then let's see if all the other matching one will too 578 //that's needed, because sometimes names can share a token (e.g., first or 579 //last but not be the same 580 if (apply_rules_namematch(prevAnnot.getType(), shortName,longName)) { 581 /** 582 * Check whether we need to ensure that there is a match with the rest 583 * of the matching annotations, because the rule requires that 584 * transtivity is not assummed. 585 */ 586 if (allMatchingNeeded) { 587 allMatchingNeeded = false; 588 589 List toMatchList = new ArrayList(matchesList); 590 // if (newAnnot.getType().equals(unknownType)) 591 // Out.prln("Matching new " + annotString + " with annots " + toMatchList); 592 toMatchList.remove(prevAnnot.getId()); 593 594 return matchOtherAnnots(toMatchList, newAnnot, annotString); 595 } else 596 return true; 597 } 598 return false; 599 } 600 601 /** This method checkes whether the new annotation matches 602 * all annotations given in the toMatchList (it contains ids) 603 * The idea is that the new annotation needs to match all those, 604 * because assuming transitivity does not always work, when 605 * two different entities share a common token: e.g., BT Cellnet 606 * and BT and British Telecom. 607 */ 608 protected boolean matchOtherAnnots( List toMatchList, Annotation newAnnot, 609 String annotString) { 610 611 //if the list is empty, then we're matching all right :-) 612 if (toMatchList.isEmpty()) 613 return true; 614 615 boolean matchedAll = true; 616 int i = 0; 617 618 while (matchedAll && i < toMatchList.size()) { 619 Annotation prevAnnot = nameAllAnnots.get((Integer) toMatchList.get(i)); 620 621 // find which annotation string of the two is longer 622 // this is useful for some of the matching rules 623 String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId()); 624 if (prevAnnotString == null) 625 try { 626 prevAnnotString = document.getContent().getContent( 627 prevAnnot.getStartNode().getOffset(), 628 prevAnnot.getEndNode().getOffset() 629 ).toString(); 630 } catch (InvalidOffsetException ioe) { 631 return false; 632 }//try 633 634 635 String longName = prevAnnotString; 636 String shortName = annotString; 637 longAnnot = prevAnnot; 638 shortAnnot = newAnnot; 639 640 if (shortName.length()>=longName.length()) { 641 String temp = longName; 642 longName = shortName; 643 shortName = temp; 644 Annotation tempAnn = longAnnot; 645 longAnnot = shortAnnot; 646 shortAnnot = tempAnn; 647 }//if 648 649 tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId()); 650 tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId()); 651 652 matchedAll = apply_rules_namematch(prevAnnot.getType(), shortName,longName); 653 // if (newAnnot.getType().equals(unknownType)) 654 // Out.prln("Loop: " + shortName + " and " + longName + ": result: " + matchedAll); 655 656 i++; 657 }//while 658 return matchedAll; 659 } 660 661 662 protected boolean matchedAlready(Annotation annot1, Annotation annot2) { 663 //the two annotations are already matched if the matches list of the first 664 //contains the id of the second 665 List matchesList = (List) annot1.getFeatures(). 666 get(ANNOTATION_COREF_FEATURE_NAME); 667 if ((matchesList == null) || matchesList.isEmpty()) 668 return false; 669 else if (matchesList.contains(annot2.getId())) 670 return true; 671 return false; 672 } 673 674 protected Annotation updateMatches(Annotation newAnnot, String annotString) { 675 Annotation matchedAnnot = null; 676 Integer id; 677 678 //first find a processed annotation with the same string 679 Iterator iter = processedAnnots.keySet().iterator(); 680 while (iter.hasNext()) { 681 id = (Integer) iter.next(); 682 String oldString = (String) processedAnnots.get(id); 683 if (annotString.equals(oldString)) { 684 matchedAnnot = nameAllAnnots.get(id); 685 break; 686 }//if 687 }//while 688 689 if (matchedAnnot == null) return null; 690 //if the two matching annotations are of different type which is not 691 //unknown, do not match them 692 if (! matchedAnnot.getType().equals(newAnnot.getType()) 693 && !newAnnot.getType().equals(unknownType) ) 694 return matchedAnnot; 695 696 List matchesList = (List) matchedAnnot.getFeatures(). 697 get(ANNOTATION_COREF_FEATURE_NAME); 698 if ((matchesList == null) || matchesList.isEmpty()) { 699 //no previous matches, so need to add 700 if (matchesList == null) { 701 matchesList = new ArrayList(); 702 matchedAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, 703 matchesList); 704 matchesDocFeature.add(matchesList); 705 }//if 706 matchesList.add(matchedAnnot.getId()); 707 matchesList.add(newAnnot.getId()); 708 } else { 709 //just add the new annotation 710 matchesList.add(newAnnot.getId()); 711 }//if 712 //add the matches list to the new annotation 713 newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList); 714 return matchedAnnot; 715 } 716 717 protected void updateMatches(Annotation newAnnot, Annotation prevAnnot) { 718 719 List matchesList = (List) prevAnnot.getFeatures(). 720 get(ANNOTATION_COREF_FEATURE_NAME); 721 if ((matchesList == null) || matchesList.isEmpty()) { 722 //no previous matches, so need to add 723 if (matchesList == null) { 724 matchesList = new ArrayList(); 725 prevAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList); 726 matchesDocFeature.add(matchesList); 727 }//if 728 matchesList.add(prevAnnot.getId()); 729 matchesList.add(newAnnot.getId()); 730 } else { 731 //just add the new annotation 732 matchesList.add(newAnnot.getId()); 733 }//if 734 //add the matches list to the new annotation 735 newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList); 736 //propagate the gender if two persons are matched 737 if (prevAnnot.getType().equals(personType)) { 738 String prevGender = 739 (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME); 740 String newGender = 741 (String) newAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME); 742 boolean unknownPrevGender = isUnknownGender(prevGender); 743 boolean unknownNewGender = isUnknownGender(newGender); 744 if (unknownPrevGender && !unknownNewGender) 745 prevAnnot.getFeatures().put(PERSON_GENDER_FEATURE_NAME, newGender); 746 else if (unknownNewGender && !unknownPrevGender) 747 newAnnot.getFeatures().put(PERSON_GENDER_FEATURE_NAME, prevGender); 748 }//if 749 } 750 751 752 protected void docCleanup() { 753 Object matchesValue = document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME); 754 if (matchesValue != null && (matchesValue instanceof Map)) 755 ((Map)matchesValue).remove(nameAllAnnots.getName()); 756 else if (matchesValue != null) { 757 document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, new HashMap()); 758 } 759 760 //get all annotations that have a matches feature 761 HashSet fNames = new HashSet(); 762 fNames.add(ANNOTATION_COREF_FEATURE_NAME); 763 AnnotationSet annots = 764 nameAllAnnots.get(null, fNames); 765 766 // Out.prln("Annots to cleanup" + annots); 767 768 if (annots == null || annots.isEmpty()) 769 return; 770 771 Iterator iter = annots.iterator(); 772 while (iter.hasNext()) { 773 while (iter.hasNext()) 774 ((Annotation) iter.next()).getFeatures(). 775 remove(ANNOTATION_COREF_FEATURE_NAME); 776 } //while 777 }//cleanup 778 779 /** return a person name without title */ 780 protected String containTitle (String annotString, Annotation annot) 781 throws ExecutionException { 782 // get the offsets 783 Long startAnnot = annot.getStartNode().getOffset(); 784 Long endAnnot = annot.getEndNode().getOffset(); 785 786 // determine "Lookup" annotation set 787 queryFM.clear(); 788 queryFM.put("majorType", "title"); 789 AnnotationSet as1 = nameAllAnnots.get(startAnnot,endAnnot); 790 if (as1 == null || as1.isEmpty()) 791 return annotString; 792 AnnotationSet as = 793 as1.get("Lookup", queryFM); 794 if (as !=null && ! as.isEmpty()) { 795 List titles = new ArrayList((Set)as); 796 Collections.sort(titles, new gate.util.OffsetComparator()); 797 798 Iterator iter = titles.iterator(); 799 while (iter.hasNext()) { 800 Annotation titleAnn = (Annotation)(iter.next()); 801 802 //we've not found a title at the start offset, 803 //there's no point in looking further 804 //coz titles come first 805 if (titleAnn.getStartNode().getOffset().compareTo(startAnnot) != 0) 806 return annotString; 807 808 try { 809 // the title from the current annotation 810 String annotTitle = 811 document.getContent().getContent( 812 titleAnn.getStartNode().getOffset(), 813 titleAnn.getEndNode().getOffset() 814 ).toString(); 815 816 // eliminate the title from annotation string and return the result 817 if (annotTitle.length()<annotString.length()) { 818 //remove from the array of tokens, so then we can compare properly 819 //the remaining tokens 820 // Out.prln("Removing title from: " + annot + " with string " + annotString); 821 // Out.prln("Tokens are" + tokensMap.get(annot.getId())); 822 // Out.prln("Title is" + annotTitle); 823 ((ArrayList) tokensMap.get(annot.getId())).remove(0); 824 return annotString.substring( 825 annotTitle.length()+1,annotString.length()); 826 } 827 } catch (InvalidOffsetException ioe) { 828 throw new ExecutionException 829 ("Invalid offset of the annotation"); 830 }//try 831 }// while 832 }//if 833 return annotString; 834 835 } 836 837 /** return an organization without a designator and starting The*/ 838 protected String stripCDG (String annotString, Annotation annot){ 839 840 ArrayList tokens = (ArrayList) tokensMap.get(annot.getId()); 841 842 //strip starting The first 843 if ( ((String) ((Annotation) tokens.get(0) 844 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME)) 845 .equalsIgnoreCase(THE_VALUE)) 846 tokens.remove(0); 847 848 //no need to check for cdg if there is only 1 token or less 849 if (tokens.size()>1 && cdg.contains(((Annotation) tokens.get(tokens.size()-1) 850 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME)) ) 851 tokens.remove(tokens.size()-1); 852 853 StringBuffer newString = new StringBuffer(50); 854 for (int i = 0; i < tokens.size(); i++){ 855 newString.append((String) ((Annotation) tokens.get(i) 856 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME) ); 857 if (i != tokens.size()-1) 858 newString.append(" "); 859 } 860 // Out.prln("Strip CDG returned: " + newString + "for string " + annotString); 861 862 if (caseSensitive) 863 return newString.toString(); 864 865 return newString.toString().toLowerCase(); 866 } 867 868 /* 869 public void check() throws ExecutionException { 870 if (executionException != null) { 871 ExecutionException e = executionException; 872 executionException = null; 873 throw e; 874 } 875 } // check() 876 */ 877 878 /** if ( == false) then reads the names of files in order 879 * to create the lookup tables 880 */ 881 // protected void createLists() throws IOException { 882 // 883 // InputStream inputStream = Files.getGateResourceAsStream( 884 // "creole/namematcher/listsNM.def"); 885 // InputStreamReader inputStreamReader = new InputStreamReader ( 886 // inputStream); 887 // BufferedReader bufferedReader = new BufferedReader(inputStreamReader); 888 // 889 // String lineRead = null; 890 // while ((lineRead = bufferedReader.readLine()) != null){ 891 // int index = lineRead.indexOf(":"); 892 // if (index != -1){ 893 // String nameFile = lineRead.substring(0,index); 894 // String nameList = lineRead.substring(index+1,lineRead.length()); 895 // createAnnotList(nameFile,nameList); 896 // }// if 897 // }//while 898 // bufferedReader.close(); 899 // inputStreamReader.close(); 900 // inputStream.close(); 901 // }// createLists() 902 903 /** creates the lookup tables */ 904 protected void createAnnotList(String nameFile,String nameList) 905 throws IOException{ 906 907 // InputStream inputStream = Files.getGateResourceAsStream( 908 // "creole/namematcher/"+nameFile); 909 // InputStreamReader inputStreamReader = new InputStreamReader ( 910 // inputStream); 911 // BufferedReader bufferedReader = new BufferedReader(inputStreamReader); 912 913 //create the relative URL 914 URL fileURL = new URL(definitionFileURL, nameFile); 915 BufferedReader bufferedReader = 916 new BufferedReader(new InputStreamReader(fileURL.openStream(), 917 encoding)); 918 919 String lineRead = null; 920 while ((lineRead = bufferedReader.readLine()) != null){ 921 if (nameList.compareTo(CDGLISTNAME)==0){ 922 if (caseSensitive) 923 cdg.add(lineRead); 924 else 925 cdg.add(lineRead.toLowerCase()); 926 }// if 927 else { 928 int index = lineRead.indexOf("ý"); 929 if (index != -1){ 930 String expr = lineRead.substring(0,index); 931 //if not case-sensitive, we need to downcase all strings 932 if (!caseSensitive) 933 expr = expr.toLowerCase(); 934 String code = lineRead.substring(index+1,lineRead.length()); 935 if (nameList.equals(ALIASLISTNAME)) 936 alias.put(expr, code); 937 else 938 if (nameList.equals(ARTLISTNAME)) 939 def_art.put(expr, code); 940 else 941 if (nameList.equals(PREPLISTNAME)) 942 prepos.put(expr, code); 943 else 944 if (nameList.equals(CONNECTORLISTNAME)) 945 connector.put(expr, code); 946 else 947 if (nameList.equals(SPURLISTNAME)) 948 spur_match.put(expr, code); 949 950 }//if 951 }// else 952 953 }//while 954 }//createAnnotList 955 956 957 /** apply_rules_namematch: apply rules similarly to lasie1.5's namematch */ 958 private boolean apply_rules_namematch(String annotationType, String shortName, 959 String longName) { 960 // first apply rule for spurius matches i.e. rule0 961 if (matchRule0(longName, shortName)) 962 return false; 963 if ( 964 (// rules for all annotations 965 //no longer use rule1, coz I do the check for same string via the 966 //hash table 967 matchRule2(longName, shortName) 968 || 969 matchRule3(longName, shortName) 970 ) // rules for all annotations 971 || 972 (// rules for organisation annotations 973 ( annotationType.equals(organizationType) 974 //ACE addition 975 || annotationType.equals("Facility")) 976 && 977 ( matchRule4(longName, shortName) 978 || 979 matchRule5(longName, shortName) 980 || 981 matchRule6(longName, shortName) 982 || 983 matchRule7(longName, shortName) 984 || 985 // matchRule8(longName, shortName) 986 // || 987 matchRule9(longName, shortName) 988 || 989 matchRule10(longName, shortName) 990 || 991 matchRule11(longName, shortName) 992 || 993 matchRule12(longName, shortName) 994 || 995 matchRule13(shortName, longName) 996 ) 997 )// rules for organisation annotations 998 || 999 (// rules for person annotations 1000 ( annotationType.equals(personType)) 1001 && 1002 ( matchRule4(longName, shortName) 1003 || 1004 matchRule5(longName, shortName) 1005 || 1006 matchRule14(longName, shortName) 1007 || //kalina: added this, so it matches names when contain more 1008 //than one first and one last name 1009 matchRule15(longName, shortName) 1010 ) 1011 )// rules for person annotations 1012 ) //if 1013 return true; 1014 return false; 1015 }//apply_rules 1016 1017 1018 /** set the extLists flag */ 1019 public void setExtLists(Boolean newExtLists) { 1020 extLists = newExtLists.booleanValue(); 1021 }//setextLists 1022 1023 /** set the caseSensitive flag */ 1024 public void setCaseSensitive(Boolean newCase) { 1025 caseSensitive = newCase.booleanValue(); 1026 }//setextLists 1027 1028 /** set the annotation set name*/ 1029 public void setAnnotationSetName(String newAnnotationSetName) { 1030 annotationSetName = newAnnotationSetName; 1031 }//setAnnotationSetName 1032 1033 /** set the types of the annotations*/ 1034 public void setAnnotationTypes(List newType) { 1035 annotationTypes = newType; 1036 }//setAnnotationTypes 1037 1038 /** set whether to process the Unknown annotations*/ 1039 public void setProcessUnknown(Boolean processOrNot) { 1040 this.matchingUnknowns = processOrNot.booleanValue(); 1041 }//setAnnotationTypes 1042 1043 public void setOrganizationType(String newOrganizationType) { 1044 organizationType = newOrganizationType; 1045 }//setOrganizationType 1046 1047 public void setPersonType(String newPersonType) { 1048 personType = newPersonType; 1049 }//setPersonType 1050 1051 /**get the name of the annotation set*/ 1052 public String getAnnotationSetName() { 1053 return annotationSetName; 1054 }//getAnnotationSetName 1055 1056 /** get the types of the annotation*/ 1057 public List getAnnotationTypes() { 1058 return annotationTypes; 1059 }//getAnnotationTypes 1060 1061 public String getOrganizationType() { 1062 return organizationType; 1063 } 1064 1065 public String getPersonType() { 1066 return personType; 1067 } 1068 1069 public Boolean getExtLists() { 1070 return new Boolean(extLists); 1071 } 1072 1073 /** Are we running in a case-sensitive mode?*/ 1074 public Boolean getCaseSensitive() { 1075 return new Boolean(caseSensitive); 1076 } 1077 1078 /** Return whether or not we're processing the Unknown annots*/ 1079 public Boolean getProcessUnknown() { 1080 return new Boolean(matchingUnknowns); 1081 } 1082 1083/* 1084 public List getMatchesDocument() { 1085 return matchesDocument; 1086 } 1087*/ 1088 1089 protected boolean isUnknownGender(String gender) { 1090 if (gender == null) 1091 return true; 1092 if (gender.equalsIgnoreCase("male") || gender.equalsIgnoreCase("female")) 1093 return false; 1094 return true; 1095 1096 } //isUnknownGender 1097 1098 /** RULE #0: If the two names are listed in table of 1099 * spurius matches then they do NOT match 1100 * Condition(s): - 1101 * Applied to: all name annotations 1102 */ 1103 public boolean matchRule0(String s1, 1104 String s2) { 1105 if (spur_match.containsKey(s1) 1106 && spur_match.containsKey(s2) ) 1107 return 1108 spur_match.get(s1).toString().equals(spur_match.get(s2).toString()); 1109 1110 return false; 1111 }//matchRule0 1112 1113 /** RULE #1: If the two names are identical then they are the same 1114 * no longer used, because I do the check for same string via the 1115 * hash table of previous annotations 1116 * Condition(s): depend on case 1117 * Applied to: all name annotations 1118 */ 1119 public boolean matchRule1(String s1, 1120 String s2, 1121 boolean matchCase) { 1122// Out.prln("Rule1: Matching " + s1 + "and " + s2); 1123 1124 boolean matched = false; 1125 if (!matchCase) 1126 matched = s1.equalsIgnoreCase(s2); 1127 else matched = s1.equals(s2) ; 1128//kalina: do not remove, nice for debug 1129// if (matched && (s2.startsWith("Kenneth") || s1.startsWith("Kenneth"))) 1130// Out.prln("Rule1: Matched " + s1 + "and " + s2); 1131 return matched; 1132 }//matchRule1 1133 1134 1135 /** 1136 * RULE #2: if the two names are listed as equivalent in the 1137 * lookup table (alias) then they match 1138 * Condition(s): - 1139 * Applied to: all name annotations 1140 */ 1141 public boolean matchRule2(String s1, 1142 String s2) { 1143 1144 if (alias.containsKey(s1) && alias.containsKey(s2)) 1145 return (alias.get(s1).toString().equals(alias.get(s2).toString())); 1146 1147 return false; 1148 }//matchRule2 1149 1150 /** 1151 * RULE #3: adding a possessive at the end 1152 * of one name causes a match 1153 * e.g. "Standard and Poor" == "Standard and Poor's" 1154 * and also "Standard and Poor" == "Standard's" 1155 * Condition(s): case-insensitive match 1156 * Applied to: all name annotations 1157 */ 1158 public boolean matchRule3(String s1, //long string 1159 String s2) { //short string 1160 1161 if (s2.endsWith("'s") || s2.endsWith("'") 1162 ||(s1.endsWith("'s")|| s1.endsWith("'"))) { 1163 1164 1165 String s2_poss = null; 1166 1167 if (!s2.endsWith("'s")) s2_poss = s2.concat("'s"); 1168 else s2_poss = s2.concat("'"); 1169 1170 if (s2_poss != null && matchRule1(s1, s2_poss,caseSensitive)) return true; 1171 1172 // now check the second case i.e. "Standard and Poor" == "Standard's" 1173 String token = (String) 1174 ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1175 1176 if (!token.endsWith("'s")) s2_poss = token.concat("'s"); 1177 else s2_poss = token.concat("'"); 1178 1179 if (s2_poss != null && matchRule1(s2_poss,s2,caseSensitive)) return true; 1180 1181 } // if (s2.endsWith("'s") 1182 return false; 1183 }//matchRule3 1184 1185 /** 1186 * RULE #4: Do all tokens other than the punctuation marks 1187 * , and . match? 1188 * e.g. "Smith, Jones" == "Smith Jones" 1189 * Condition(s): case-insensitive match 1190 * Applied to: organisation and person annotations 1191 */ 1192 public boolean matchRule4(String s1, 1193 String s2) { 1194 1195 boolean allTokensMatch = true; 1196 1197 Iterator tokensLongAnnotIter = tokensLongAnnot.iterator(); 1198 Iterator tokensShortAnnotIter = tokensShortAnnot.iterator(); 1199 while (tokensLongAnnotIter.hasNext() && tokensShortAnnotIter.hasNext()) { 1200 Annotation token = (Annotation) tokensLongAnnotIter.next(); 1201 if (((String)token.getFeatures().get(TOKEN_KIND_FEATURE_NAME)).equals(PUNCTUATION_VALUE)) 1202 continue; 1203// Out.prln("Matching" + tokensLongAnnot + " with " + tokensShortAnnot); 1204 if (! token.getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals( 1205 ((Annotation) tokensShortAnnotIter.next()).getFeatures().get(TOKEN_STRING_FEATURE_NAME))) { 1206 allTokensMatch = false; 1207 break; 1208 } // if (!tokensLongAnnot.nextToken() 1209 } // while 1210// if (allTokensMatch) 1211// Out.prln("rule4 fired. result is: " + allTokensMatch); 1212 return allTokensMatch; 1213 }//matchRule4 1214 1215 /** 1216 * RULE #5: if the 1st token of one name 1217 * matches the second name 1218 * e.g. "Pepsi Cola" == "Pepsi" 1219 * Condition(s): case-insensitive match 1220 * Applied to: all name annotations 1221 */ 1222 public boolean matchRule5(String s1, 1223 String s2) { 1224 1225 //do not match numbers by this rule 1226 if (tokensLongAnnot.size()> 1 && 1227 ((Annotation) tokensLongAnnot.get(0)).getFeatures().get("kind").equals("number")) 1228 return false; 1229 1230// if (s1.startsWith("Patrick") || s2.startsWith("Patrick")) { 1231// Out.prln("Rule 5: " + s1 + "and " + s2); 1232// } 1233 1234 //require that when matching person names, the shorter one to be of length 1 1235 //for the rule to apply. In other words, avoid matching Peter Smith and 1236 //Peter Kline, because they share a Peter token. 1237 if ( (shortAnnot.getType().equals(personType) 1238 || longAnnot.getType().equals(personType) 1239 ) 1240 && 1241 tokensShortAnnot.size()>1 1242 ) 1243 return false; 1244 1245 if (tokensLongAnnot.size()<=1) 1246 return false; 1247 boolean result = matchRule1((String) 1248 ((Annotation) tokensLongAnnot.get(0) 1249 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME), 1250 s2, 1251 caseSensitive); 1252 1253// if (s1.startsWith("Patrick") || s2.startsWith("Patrick")) 1254// Out.prln("rule 5 result: " + result); 1255 return result; 1256 1257 }//matchRule5 1258 1259 /** 1260 * RULE #6: if one name is the acronym of the other 1261 * e.g. "Imperial Chemical Industries" == "ICI" 1262 * Applied to: organisation annotations only 1263 */ 1264 public boolean matchRule6(String s1, 1265 String s2) { 1266 1267 int i = 0; 1268 1269 //check and if the shorted string has a space in it, then it's not 1270 //an acronym 1271 if (s2.indexOf(" ") > 0) 1272 return false; 1273 1274 //Out.prln("Acronym: Matching " + s1 + "and " + s2); 1275 StringBuffer acronym_s1 = new StringBuffer(""); 1276 StringBuffer acronymDot_s1 = new StringBuffer(""); 1277 1278 for ( ;i < tokensLongAnnot.size(); i++ ) { 1279 String toAppend = ( (String) ((Annotation) tokensLongAnnot.get(i) 1280 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME)).substring(0,1); 1281 acronym_s1.append(toAppend); 1282 acronymDot_s1.append(toAppend); 1283 acronymDot_s1.append("."); 1284 } 1285 1286 //Out.prln("Acronym dot: To Match " + acronymDot_s1 + "and " + s2); 1287 //Out.prln("Result: " + matchRule1(acronymDot_s1.toString(),s2,caseSensitive)); 1288 1289 if (matchRule1(acronym_s1.toString(),s2,caseSensitive) || 1290 matchRule1(acronymDot_s1.toString(),s2,caseSensitive) ) 1291 return true; 1292 1293 return false; 1294 }//matchRule6 1295 1296 /** 1297 * RULE #7: if one of the tokens in one of the 1298 * names is in the list of separators eg. "&" 1299 * then check if the token before the separator 1300 * matches the other name 1301 * e.g. "R.H. Macy & Co." == "Macy" 1302 * Condition(s): case-sensitive match 1303 * Applied to: organisation annotations only 1304 */ 1305 public boolean matchRule7(String s1, 1306 String s2) { 1307 1308 //don't try it unless the second string is just one token 1309 if (tokensShortAnnot.size() != 1) 1310 return false; 1311 1312 String previous_token = null; 1313 1314 for (int i = 0; i < tokensLongAnnot.size(); i++ ) { 1315 if (connector.containsKey( ((Annotation) tokensLongAnnot.get(i) 1316 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME) )) { 1317 previous_token = (String) ((Annotation) tokensLongAnnot.get(i-1) 1318 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1319 1320 break; 1321 } 1322 } 1323 1324 //now match previous_token with other name 1325 if (previous_token != null) { 1326// if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) 1327// Out.prln("Rule7"); 1328 return matchRule1(previous_token,s2,caseSensitive); 1329 1330 } 1331 return false; 1332 }//matchRule7 1333 1334 /** 1335 * This rule is now obsolete, as The and the trailing CDG 1336 * are stripped before matching. 1337 * DO NOT CALL!!! 1338 * 1339 * RULE #8: if the names match, ignoring The and 1340 * and trailing company designator (which have already been stripped) 1341 * e.g. "The Magic Tricks Co." == "Magic Tricks" 1342 * Condition(s): case-sensitive match 1343 * Applied to: organisation annotations only 1344 */ 1345 public boolean matchRule8(String s1, 1346 String s2) { 1347 Out.prln("OrthoMatcher warning: This rule has been discontinued!"); 1348/* 1349 if (s1.startsWith("The ")) s1 = s1.substring(4); 1350 if (s2.startsWith("The ")) s2 = s2.substring(4); 1351 1352 // check that cdg is not empty 1353 if (!cdg.isEmpty()) { 1354 String stringToTokenize1 = s1; 1355 StringTokenizer tokensLongAnnot = new StringTokenizer(stringToTokenize1," "); 1356 1357 String stringToTokenize2 = s2; 1358 StringTokenizer tokensShortAnnot = new StringTokenizer(stringToTokenize2," "); 1359 String token = null; 1360 String cdg1 = null; 1361 String cdg2 = null; 1362 1363 s1 = ""; 1364 s2 = ""; 1365 1366 //check last token of s1 1367 while (tokensLongAnnot.hasMoreTokens()) { 1368 token = tokensLongAnnot.nextToken(); 1369 if (!tokensLongAnnot.hasMoreTokens() 1370 && cdg.contains(token)) cdg1=token; 1371 else s1 = s1+token; 1372 } 1373 1374 // do the same for s2 1375 while (tokensShortAnnot.hasMoreTokens()) { 1376 token = tokensShortAnnot.nextToken(); 1377 if (!tokensShortAnnot.hasMoreTokens() 1378 && cdg.contains(token)) cdg2=token; 1379 else s2 = s2+token; 1380 } 1381 1382 // if the company designators are different 1383 // then they are NOT the same organisations 1384 if ((cdg1!=null && cdg2!=null) 1385 && !cdg1.equalsIgnoreCase(cdg2)) return false; 1386 } 1387 if (!s1.equals("") && !s2.equals("")) return matchRule1(s1,s2,caseSensitive); 1388*/ 1389 return false; 1390 1391 }//matchRule8 1392 1393 /** 1394 * RULE #9: does one of the names match the token 1395 * just before a trailing company designator 1396 * in the other name? 1397 * The company designator has already been chopped off, 1398 * so the token before it, is in fact the last token 1399 * e.g. "R.H. Macy Co." == "Macy" 1400 * Applied to: organisation annotations only 1401 */ 1402 public boolean matchRule9(String s1, 1403 String s2) { 1404 1405// if (s1.equalsIgnoreCase("news") || s2.equalsIgnoreCase("news")) 1406// Out.prln("Rule 9 " + s1 + " and " + s2); 1407 String s1_short = (String) 1408 ((Annotation) tokensLongAnnot.get( 1409 tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1410// Out.prln("Converted to " + s1_short); 1411 if (tokensLongAnnot.size()>1) { 1412 boolean matched = matchRule1(s1_short, s2, caseSensitive); 1413 //we need to make sure all names match, instead of assuming transitivity, 1414 //to avoid matching BBC News with News then News with ITV News, which 1415 //by transitivity leads to BBC News matching ITV News which is not what 1416 //we want 1417 if (matched) 1418 allMatchingNeeded = true; 1419 return matched; 1420 } //if 1421 1422 return false; 1423 }//matchRule9 1424 1425 /** 1426 * RULE #10: is one name the reverse of the other 1427 * reversing around prepositions only? 1428 * e.g. "Department of Defence" == "Defence Department" 1429 * Condition(s): case-sensitive match 1430 * Applied to: organisation annotations only 1431 */ 1432 public boolean matchRule10(String s1, 1433 String s2) { 1434 1435 String token = null; 1436 String previous_token = null; 1437 String next_token = null; 1438 boolean invoke_rule=false; 1439 1440 if (tokensLongAnnot.size() >= 3 1441 && tokensShortAnnot.size() >= 2) { 1442 1443 // first get the tokens before and after the preposition 1444 int i = 0; 1445 for (; i< tokensLongAnnot.size(); i++) { 1446 token = (String) 1447 ((Annotation) tokensLongAnnot.get(i)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1448 if (prepos.containsKey(token)) { 1449 invoke_rule=true; 1450 break; 1451 }//if 1452 previous_token = token; 1453 }//while 1454 1455 if (! invoke_rule) 1456 return false; 1457 1458 if (i < tokensLongAnnot.size() 1459 && previous_token != null) 1460 next_token= (String) 1461 ((Annotation) tokensLongAnnot.get(i++)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1462 else return false; 1463 1464 String s21 = (String) 1465 ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1466 String s22 = (String) 1467 ((Annotation) tokensShortAnnot.get(1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1468 // then compare (in reverse) with the first two tokens of s2 1469 if (matchRule1(next_token,(String) s21,caseSensitive) 1470 && matchRule1(previous_token, s22,caseSensitive)) 1471 return true ; 1472 }//if (tokensLongAnnot.countTokens() >= 3 1473 return false; 1474 }//matchRule10 1475 1476 /** 1477 * RULE #11: does one name consist of contractions 1478 * of the first two tokens of the other name? 1479 * e.g. "Communications Satellite" == "ComSat" 1480 * and "Pan American" == "Pan Am" 1481 * Condition(s): case-sensitive match 1482 * Applied to: organisation annotations only 1483 */ 1484 public boolean matchRule11(String s1, 1485 String s2) { 1486 1487 1488 // first do the easy case e.g. "Pan American" == "Pan Am" 1489 1490 String token11 = null; 1491 String token12 = null; 1492 String token21 = null; 1493 String token22 = null; 1494 1495 if (tokensLongAnnot.size() < 2) 1496 return false; 1497 1498 // 1st get the first two tokens of s1 1499 token11 = (String) 1500 ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1501 token12 = (String) 1502 ((Annotation) tokensLongAnnot.get(1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1503 1504 // now check for the first case i.e. "Pan American" == "Pan Am" 1505 if (tokensShortAnnot.size() == 2) { 1506 1507 token21 = (String) 1508 ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1509 token22 = (String) 1510 ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1511 1512 if (token11.startsWith(token21) 1513 && token12.startsWith(token22)) 1514 return true; 1515 1516 } // if (tokensShortAnnot.countTokens() == 2) 1517 1518 // now the second case e.g. "Communications Satellite" == "ComSat" 1519 else if (tokensShortAnnot.size()==1 && s2.length()>=3) { 1520 1521 // split the token into possible contractions 1522 // ignore case for matching 1523 for (int i=2;i<s2.length();i++) { 1524 token21=s2.substring(0,i+1); 1525 token22=s2.substring(i+1); 1526 1527 if (token11.startsWith(token21) 1528 && token12.startsWith(token22)) 1529 return true; 1530 }// for 1531 } // else if 1532 1533 return false; 1534 }//matchRule11 1535 1536 /** 1537 * RULE #12: do the first and last tokens of one name 1538 * match the first and last tokens of the other? 1539 * Condition(s): case-sensitive match 1540 * Applied to: organisation annotations only 1541 */ 1542 public boolean matchRule12(String s1, 1543 String s2) { 1544 1545 // first do the easy case e.g. "Pan American" == "Pan Am" 1546 1547 if (tokensLongAnnot.size()>1 && tokensShortAnnot.size()>1) { 1548// Out.prln("Rule 12"); 1549 1550 // get first and last tokens of s1 & s2 1551 String s1_first = (String) 1552 ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1553 String s2_first = (String) 1554 ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1555 1556 if (!matchRule1(s1_first,s2_first,caseSensitive)) 1557 return false; 1558 1559 String s1_last = (String) 1560 ((Annotation) tokensLongAnnot.get(tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1561 String s2_last = (String) 1562 ((Annotation) tokensShortAnnot.get(tokensShortAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1563 1564 return matchRule1(s1_last,s2_last,caseSensitive); 1565 } // if (tokensLongAnnot.countTokens()>1 1566 return false; 1567 }//matchRule12 1568 1569 /** 1570 * RULE #13: do multi-word names match except for 1571 * one token e.g. 1572 * "Second Force Recon Company" == "Force Recon Company" 1573 * Note that this rule has NOT been used in LaSIE's 1.5 1574 * namematcher 1575 * Restrictions: - remove cdg first 1576 * - shortest name should be 2 words or more 1577 * - if N is the number of tokens of the longest 1578 * name, then N-1 tokens should be matched 1579 * Condition(s): case-sensitive match 1580 * Applied to: organisation or person annotations only 1581 */ 1582 public boolean matchRule13(String s1, 1583 String s2) { 1584 1585 1586 String token1 = null; 1587 String token2 = null; 1588 1589 int matched_tokens = 0, mismatches = 0;; 1590 1591 // if names < 2 words then rule is invalid 1592 if (tokensLongAnnot.size() < 3 || tokensShortAnnot.size() < 2) return false; 1593 1594// if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) { 1595// Out.prln("Rule 13: Matching tokens" + tokensLongAnnot); 1596// Out.prln("with tokens " + tokensShortAnnot); 1597// } 1598 1599 // now do the matching 1600 for (int i=0,j= 0; i < tokensShortAnnot.size() && mismatches < 2; i++) { 1601 1602// Out.prln("i = " + i); 1603// Out.prln("j = " + j); 1604 if ( ((Annotation) tokensLongAnnot.get(j)).getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals( 1605 ((Annotation) tokensShortAnnot.get(i)).getFeatures().get(TOKEN_STRING_FEATURE_NAME)) ) { 1606 matched_tokens++; 1607 j++; 1608 } else 1609 mismatches++; 1610 } // for 1611 1612 if (matched_tokens >= tokensLongAnnot.size()-1) 1613 return true; 1614 1615 return false; 1616 }//matchRule13 1617 1618 /** 1619 * RULE #14: if the last token of one name 1620 * matches the second name 1621 * e.g. "Hamish Cunningham" == "Cunningham" 1622 * Condition(s): case-insensitive match 1623 * Applied to: all person annotations 1624 */ 1625 public boolean matchRule14(String s1, 1626 String s2) { 1627 1628// if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) 1629// Out.prln("Rule 14 " + s1 + " and " + s2); 1630 String s1_short = (String) 1631 ((Annotation) tokensLongAnnot.get( 1632 tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1633// Out.prln("Converted to " + s1_short); 1634 if (tokensLongAnnot.size()>1) 1635 return matchRule1(s1_short, 1636 s2, 1637 caseSensitive); 1638 1639 return false; 1640 1641 }//matchRule14 1642 1643 /** 1644 * RULE #15: does one token from a Person name appear as the other token 1645 * Note that this rule has NOT been used in LaSIE's 1.5 1646 * namematcher; added for ACE by Di's request 1647 */ 1648 public boolean matchRule15(String s1, 1649 String s2) { 1650 1651 int matched_tokens = 0; 1652 1653 // if names < 2 words then rule is invalid 1654 1655// if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) { 1656// Out.prln("Rule 15:" ); 1657// Out.prln("with tokens " + tokensShortAnnot); 1658// } 1659 1660 // now do the matching 1661 Annotation token1, token2; 1662 for (int i=0; i < tokensShortAnnot.size() && matched_tokens == 0; i++) { 1663 token1 = (Annotation) tokensShortAnnot.get(i); 1664 //first check if not punctuation, because we need to skip it 1665 if (token1.getFeatures().get(TOKEN_KIND_FEATURE_NAME).equals(PUNCTUATION_VALUE)) 1666 continue; 1667 1668 for (int j=0; j<tokensLongAnnot.size() && matched_tokens ==0; j++) { 1669// Out.prln("i = " + i); 1670 token2 = (Annotation) tokensLongAnnot.get(j); 1671 if (token2.getFeatures().get(TOKEN_KIND_FEATURE_NAME).equals(PUNCTUATION_VALUE)) 1672 continue; 1673 if ( token1.getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals( 1674 token2.getFeatures().get(TOKEN_STRING_FEATURE_NAME)) ) 1675 matched_tokens++; 1676 }//for 1677 } // for 1678 1679 //19 February 2002: kalina 1680 //was originally > 0 (i.e., any match is good) 1681 //ensure that we've matched all the tokens in the short annotation 1682 //the reason for that is, because otherwise we match 1683 //Patrick Viera and Patrick Somebody - not good! 1684 if (matched_tokens == tokensShortAnnot.size()) 1685 return true; 1686 1687 return false; 1688 }//matchRule15 1689 1690 1691 /** Tables for namematch info 1692 * (used by the namematch rules) 1693 */ 1694 private void buildTables(AnnotationSet nameAllAnnots) { 1695 1696 //reset the tables first 1697 cdg.clear(); 1698 1699 if (! extLists) { 1700 // i.e. get cdg from Lookup annotations 1701 // get all Lookup annotations 1702 tempMap.clear(); 1703 tempMap.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, "cdg"); 1704 //now get all lookup annotations which are cdg 1705 AnnotationSet nameAnnots = 1706 nameAllAnnots.get(LOOKUP_ANNOTATION_TYPE, tempMap); 1707 1708 if ((nameAnnots ==null) || nameAnnots.isEmpty()) 1709 return; 1710 1711 Iterator iter = nameAnnots.iterator(); 1712 while (iter.hasNext()) { 1713 Annotation annot = (Annotation)iter.next(); 1714 // get the actual string 1715 Long offsetStartAnnot = annot.getStartNode().getOffset(); 1716 Long offsetEndAnnot = annot.getEndNode().getOffset(); 1717 try { 1718 gate.Document doc = nameAllAnnots.getDocument(); 1719 String annotString = 1720 doc.getContent().getContent( 1721 offsetStartAnnot,offsetEndAnnot 1722 ).toString(); 1723 cdg.add(annotString); 1724 } catch (InvalidOffsetException ioe) { 1725 ioe.printStackTrace(Err.getPrintWriter()); 1726 } 1727 }// while 1728 }//if 1729 }//buildTables 1730 1731 /** substitute all multiple spaces, tabes and newlines 1732 * with a single space 1733 */ 1734 public String regularExpressions ( String text, String replacement, 1735 String regEx) { 1736 String result = text; 1737 try { 1738 RE re = new RE(regEx); 1739 result = re.substituteAll( text,replacement); 1740 } catch (REException ree) {ree.printStackTrace();} 1741 return result; 1742 } 1743 1744 public void setDefinitionFileURL(java.net.URL definitionFileURL) { 1745 this.definitionFileURL = definitionFileURL; 1746 } 1747 1748 public java.net.URL getDefinitionFileURL() { 1749 return definitionFileURL; 1750 } 1751 public void setEncoding(String encoding) { 1752 this.encoding = encoding; 1753 } 1754 public String getEncoding() { 1755 return encoding; 1756 }//regularExpressions 1757 1758 1759 private static class Class1 { 1760 } 1761} // public class OrthoMatcher 1762 1763
|
OrthoMatcher |
|