|
PronominalCoref |
|
1 /* 2 * PronominalCoref.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Marin Dimitrov, 30/Dec/2001 12 * 13 * $Id: PronominalCoref.java,v 1.28 2002/12/02 06:39:55 marin Exp $ 14 */ 15 16 package gate.creole.coref; 17 18 import java.util.*; 19 import java.net.*; 20 21 import junit.framework.*; 22 23 import gate.*; 24 import gate.creole.*; 25 import gate.util.*; 26 import gate.annotation.*; 27 28 public class PronominalCoref extends AbstractLanguageAnalyser 29 implements ProcessingResource, ANNIEConstants{ 30 31 public static final String COREF_DOCUMENT_PARAMETER_NAME = "document"; 32 33 public static final String COREF_ANN_SET_PARAMETER_NAME = "annotationSetName"; 34 35 /** --- */ 36 private static final boolean DEBUG = false; 37 38 //JAPE grammars 39 private static final String QT_GRAMMAR_URL = "gate://gate/creole/coref/quoted_text.jape"; 40 private static final String PLEON_GRAMMAR_URL = "gate://gate/creole/coref/pleonasm.jape"; 41 42 //annotation types 43 private static final String QUOTED_TEXT_TYPE = "QuotedText"; 44 private static final String PLEONASTIC_TYPE = "PleonasticIt"; 45 46 //annotation features 47 private static final String PRP_CATEGORY = "PRP"; 48 private static final String PRP$_CATEGORY = "PRP$"; 49 50 //scope 51 private static final int SENTENCES_IN_SCOPE = 3; 52 /** --- */ 53 private static AnnotationOffsetComparator ANNOTATION_OFFSET_COMPARATOR; 54 /** --- */ 55 private String annotationSetName; 56 /** --- */ 57 private Transducer qtTransducer; 58 /** --- */ 59 private Transducer pleonTransducer; 60 /** --- */ 61 private AnnotationSet defaultAnnotations; 62 /** --- */ 63 private Sentence[] textSentences; 64 /** --- */ 65 private Quote[] 
quotedText; 66 /** --- */ 67 private Annotation[] pleonasticIt; 68 /** --- */ 69 private HashMap personGender; 70 /** --- */ 71 private HashMap anaphor2antecedent; 72 /** --- */ 73 private static final FeatureMap PRP_RESTRICTION; 74 75 private boolean resolveIt = true; 76 77 /** --- */ 78 static { 79 ANNOTATION_OFFSET_COMPARATOR = new AnnotationOffsetComparator(); 80 PRP_RESTRICTION = new SimpleFeatureMapImpl(); 81 PRP_RESTRICTION.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY); 82 } 83 84 /** --- */ 85 public PronominalCoref() { 86 87 this.personGender = new HashMap(); 88 this.anaphor2antecedent = new HashMap(); 89 this.qtTransducer = new gate.creole.Transducer(); 90 this.pleonTransducer = new gate.creole.Transducer(); 91 } 92 93 /** Initialise this resource, and return it. */ 94 public Resource init() throws ResourceInstantiationException { 95 96 //0. preconditions 97 Assert.assertNotNull(this.qtTransducer); 98 99 //1. initialise quoted text transducer 100 URL qtGrammarURL = null; 101 try { 102 qtGrammarURL = new URL(QT_GRAMMAR_URL); 103 } 104 catch(MalformedURLException mue) { 105 throw new ResourceInstantiationException(mue); 106 } 107 this.qtTransducer.setGrammarURL(qtGrammarURL); 108 this.qtTransducer.setEncoding("UTF-8"); 109 this.qtTransducer.init(); 110 111 //2. initialise pleonastic transducer 112 URL pleonGrammarURL = null; 113 try { 114 pleonGrammarURL = new URL(PLEON_GRAMMAR_URL); 115 } 116 catch(MalformedURLException mue) { 117 throw new ResourceInstantiationException(mue); 118 } 119 this.pleonTransducer.setGrammarURL(pleonGrammarURL); 120 this.pleonTransducer.setEncoding("UTF-8"); 121 this.pleonTransducer.init(); 122 123 124 //3. delegate 125 return super.init(); 126 } // init() 127 128 /** 129 * Reinitialises the processing resource. After calling this method the 130 * resource should be in the state it is after calling init. 131 * If the resource depends on external resources (such as rules files) then 132 * the resource will re-read those resources. 
If the data used to create 133 * the resource has changed since the resource has been created then the 134 * resource will change too after calling reInit(). 135 */ 136 public void reInit() throws ResourceInstantiationException { 137 138 if (null != this.qtTransducer) { 139 this.qtTransducer.reInit(); 140 } 141 142 if (null != this.pleonTransducer) { 143 this.pleonTransducer.reInit(); 144 } 145 146 init(); 147 } // reInit() 148 149 150 /** Set the document to run on. */ 151 public void setDocument(Document newDocument) { 152 153 //0. precondition 154 // Assert.assertNotNull(newDocument); 155 156 //1. set doc for aggregated components 157 this.qtTransducer.setDocument(newDocument); 158 this.pleonTransducer.setDocument(newDocument); 159 160 //3. delegate 161 super.setDocument(newDocument); 162 } 163 164 /** --- */ 165 public void setAnnotationSetName(String annotationSetName) { 166 this.annotationSetName = annotationSetName; 167 } 168 169 170 /** --- */ 171 public String getAnnotationSetName() { 172 return annotationSetName; 173 } 174 175 /** --- */ 176 public void setResolveIt(Boolean newValue) { 177 this.resolveIt = newValue.booleanValue(); 178 } 179 180 /** --- */ 181 public Boolean getResolveIt() { 182 return new Boolean(this.resolveIt); 183 } 184 185 186 /** 187 * This method runs the coreferencer. It assumes that all the needed parameters 188 * are set. If they are not, an exception will be fired. 189 */ 190 public void execute() throws ExecutionException{ 191 192 //0. preconditions 193 if(null == this.document) { 194 throw new ExecutionException("[coreference] Document is not set!"); 195 } 196 197 //1. preprocess 198 preprocess(); 199 /* 200 //2. remove corefs from previous run 201 String annSetName = this.annotationSetName == null ? 
"COREF" 202 : this.annotationSetName; 203 204 AnnotationSet corefSet = this.document.getAnnotations(annSetName); 205 if (false == corefSet.isEmpty()) { 206 corefSet.clear(); 207 } 208 */ 209 //3.get personal pronouns 210 FeatureMap constraintPRP = new SimpleFeatureMapImpl(); 211 constraintPRP.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY); 212 AnnotationSet personalPronouns = this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,constraintPRP); 213 214 //4.get possesive pronouns 215 FeatureMap constraintPRP$ = new SimpleFeatureMapImpl(); 216 constraintPRP$.put(TOKEN_CATEGORY_FEATURE_NAME,PRP$_CATEGORY); 217 AnnotationSet possesivePronouns = this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,constraintPRP$); 218 219 //5.combine them 220 AnnotationSet pronouns = personalPronouns; 221 if (null == personalPronouns) { 222 pronouns = possesivePronouns; 223 } 224 else if (null != possesivePronouns) { 225 pronouns.addAll(possesivePronouns); 226 } 227 228 //6.do we have pronouns at all? 229 if (null == pronouns) { 230 //do nothing 231 return; 232 } 233 234 //7.sort them according to offset 235 Object[] arrPronouns = pronouns.toArray(); 236 java.util.Arrays.sort(arrPronouns,ANNOTATION_OFFSET_COMPARATOR); 237 238 //8.cleanup - ease the GC 239 pronouns = personalPronouns = possesivePronouns = null; 240 241 int prnSentIndex = 0; 242 243 244 //10. process all pronouns 245 for (int i=0; i< arrPronouns.length; i++) { 246 Annotation currPronoun = (Annotation)arrPronouns[i]; 247 while (this.textSentences[prnSentIndex].getEndOffset().longValue() < 248 currPronoun.getEndNode().getOffset().longValue()) { 249 prnSentIndex++; 250 } 251 252 Sentence currSentence = this.textSentences[prnSentIndex]; 253 Assert.assertTrue(currSentence.getStartOffset().longValue() <= currPronoun.getStartNode().getOffset().longValue()); 254 Assert.assertTrue(currSentence.getEndOffset().longValue() >= currPronoun.getEndNode().getOffset().longValue()); 255 256 //11. 
find antecedent (if any) for pronoun 257 Annotation antc = findAntecedent(currPronoun,prnSentIndex); 258 259 //12. add to the ana2ant hashtable 260 this.anaphor2antecedent.put(currPronoun,antc); 261 } 262 263 //done 264 } 265 266 267 /** --- */ 268 public HashMap getResolvedAnaphora() { 269 return this.anaphor2antecedent; 270 } 271 272 /** --- */ 273 private Annotation findAntecedent(Annotation currPronoun,int prnSentIndex) { 274 275 //0. preconditions 276 Assert.assertNotNull(currPronoun); 277 Assert.assertTrue(prnSentIndex >= 0); 278 Assert.assertTrue(currPronoun.getType().equals(TOKEN_ANNOTATION_TYPE)); 279 Assert.assertTrue(currPronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) || 280 currPronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY)); 281 282 //1. 283 String strPronoun = (String)currPronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 284 285 Assert.assertNotNull(strPronoun); 286 287 //2. delegate processing to the appropriate methods 288 if (strPronoun.equalsIgnoreCase("HE") || 289 strPronoun.equalsIgnoreCase("HIM") || 290 strPronoun.equalsIgnoreCase("HIS") || 291 strPronoun.equalsIgnoreCase("HIMSELF")) { 292 return _resolve$HE$HIM$HIS$HIMSELF$(currPronoun,prnSentIndex); 293 } 294 else if (strPronoun.equalsIgnoreCase("SHE") || 295 strPronoun.equalsIgnoreCase("HER")) { 296 return _resolve$SHE$HER$(currPronoun,prnSentIndex); 297 } 298 else if (strPronoun.equalsIgnoreCase("IT") || 299 strPronoun.equalsIgnoreCase("ITS") || 300 strPronoun.equalsIgnoreCase("ITSELF")) { 301 return _resolve$IT$ITS$ITSELF$(currPronoun,prnSentIndex); 302 } 303 else if (strPronoun.equalsIgnoreCase("I") || 304 strPronoun.equalsIgnoreCase("ME") || 305 strPronoun.equalsIgnoreCase("MY") || 306 strPronoun.equalsIgnoreCase("MYSELF")) { 307 return _resolve$I$ME$MY$MYSELF$(currPronoun,prnSentIndex); 308 } 309 else { 310 if (DEBUG) { 311 gate.util.Err.println("["+strPronoun+"] is not handled yet..."); 312 } 313 return null; 314 } 315 } 316 
317 318 boolean isPleonastic(Annotation pronoun) { 319 320 //0. preconditions 321 Assert.assertNotNull(pronoun); 322 String str = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 323 Assert.assertTrue(str.equalsIgnoreCase("IT")); 324 325 //1. do we have pleonasms in this text? 326 if (this.pleonasticIt.length == 0) { 327 return false; 328 } 329 330 //2. find closest pleonasm index 331 int closestPleonasmIndex = java.util.Arrays.binarySearch(this.pleonasticIt, 332 pronoun, 333 ANNOTATION_OFFSET_COMPARATOR); 334 //normalize index 335 if (closestPleonasmIndex < 0) { 336 closestPleonasmIndex = -closestPleonasmIndex -1 -1; 337 } 338 339 //still not good? 340 if (closestPleonasmIndex < 0) { 341 closestPleonasmIndex = 0; 342 } 343 344 //get closest pleonasm 345 Annotation pleonasm = this.pleonasticIt[closestPleonasmIndex]; 346 347 //System.out.println(pleonasm); 348 //System.out.println(pronoun); 349 350 //3. return true only if the proboun is contained in pleonastic fragment 351 boolean result = (pleonasm.getStartNode().getOffset().intValue() <= pronoun.getStartNode().getOffset().intValue() 352 && 353 pleonasm.getEndNode().getOffset().intValue() >= pronoun.getEndNode().getOffset().intValue()); 354 //System.out.println("is pleon=["+result+"]"); 355 return result; 356 } 357 358 359 /** --- */ 360 private Annotation _resolve$HE$HIM$HIS$HIMSELF$(Annotation pronoun, int sentenceIndex) { 361 362 //0. preconditions 363 Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE)); 364 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) || 365 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY)); 366 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 367 Assert.assertTrue(pronounString.equalsIgnoreCase("HE") || 368 pronounString.equalsIgnoreCase("HIM") || 369 pronounString.equalsIgnoreCase("HIS") || 370 pronounString.equalsIgnoreCase("HIMSELF")); 371 372 //1. 
373 boolean antecedentFound = false; 374 int scopeFirstIndex = sentenceIndex - SENTENCES_IN_SCOPE; 375 if (scopeFirstIndex < 0 ) scopeFirstIndex = 0; 376 377 int currSentenceIndex = sentenceIndex; 378 Annotation bestAntecedent = null; 379 380 while (currSentenceIndex >= scopeFirstIndex || antecedentFound == false) { 381 Sentence currSentence = this.textSentences[currSentenceIndex]; 382 AnnotationSet persons = currSentence.getPersons(); 383 384 Iterator it = persons.iterator(); 385 while (it.hasNext()) { 386 Annotation currPerson = (Annotation)it.next(); 387 String gender = (String)this.personGender.get(currPerson); 388 389 if (null == gender || 390 gender.equalsIgnoreCase("MALE") || 391 gender.equalsIgnoreCase("UNKNOWN")) { 392 //hit 393 antecedentFound = true; 394 395 if (null == bestAntecedent) { 396 bestAntecedent = currPerson; 397 } 398 else { 399 bestAntecedent = _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(bestAntecedent,currPerson,pronoun); 400 } 401 } 402 } 403 404 if (0 == currSentenceIndex--) 405 break; 406 407 } 408 409 return bestAntecedent; 410 } 411 412 413 /** --- */ 414 private Annotation _resolve$SHE$HER$(Annotation pronoun, int sentenceIndex) { 415 416 //0. preconditions 417 Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE)); 418 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) || 419 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY)); 420 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 421 Assert.assertTrue(pronounString.equalsIgnoreCase("SHE") || 422 pronounString.equalsIgnoreCase("HER")); 423 424 //1. 
425 boolean antecedentFound = false; 426 int scopeFirstIndex = sentenceIndex - SENTENCES_IN_SCOPE; 427 if (scopeFirstIndex < 0 ) scopeFirstIndex = 0; 428 int currSentenceIndex = sentenceIndex; 429 Annotation bestAntecedent = null; 430 431 while (currSentenceIndex >= scopeFirstIndex || antecedentFound == false) { 432 Sentence currSentence = this.textSentences[currSentenceIndex]; 433 AnnotationSet persons = currSentence.getPersons(); 434 435 Iterator it = persons.iterator(); 436 while (it.hasNext()) { 437 Annotation currPerson = (Annotation)it.next(); 438 String gender = (String)this.personGender.get(currPerson); 439 440 if (null == gender || 441 gender.equalsIgnoreCase("FEMALE") || 442 gender.equalsIgnoreCase("UNKNOWN")) { 443 //hit 444 antecedentFound = true; 445 446 if (null == bestAntecedent) { 447 bestAntecedent = currPerson; 448 } 449 else { 450 bestAntecedent = _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(bestAntecedent,currPerson,pronoun); 451 } 452 } 453 } 454 455 if (0 == currSentenceIndex--) 456 break; 457 } 458 459 return bestAntecedent; 460 } 461 462 463 /** --- */ 464 private Annotation _resolve$IT$ITS$ITSELF$(Annotation pronoun, int sentenceIndex) { 465 //do not resolve it pronouns if disabled by the user 466 if (! resolveIt) 467 return null; 468 469 //0. 
preconditions 470 Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE)); 471 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) || 472 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY)); 473 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 474 Assert.assertTrue(pronounString.equalsIgnoreCase("IT") || 475 pronounString.equalsIgnoreCase("ITS") || 476 pronounString.equalsIgnoreCase("ITSELF")); 477 478 //0.5 check if the IT is pleonastic 479 if (pronounString.equalsIgnoreCase("IT") && 480 isPleonastic(pronoun)) { 481 //System.out.println("PLEONASM..."); 482 return null; 483 } 484 485 //1. 486 int scopeFirstIndex = sentenceIndex - 1; 487 if (scopeFirstIndex < 0 ) scopeFirstIndex = 0; 488 489 int currSentenceIndex = sentenceIndex; 490 Annotation bestAntecedent = null; 491 492 while (currSentenceIndex >= scopeFirstIndex) { 493 494 Sentence currSentence = this.textSentences[currSentenceIndex]; 495 AnnotationSet org = currSentence.getOrganizations(); 496 AnnotationSet loc = currSentence.getLocations(); 497 //combine them 498 AnnotationSet org_loc = org; 499 org_loc.addAll(loc); 500 501 Iterator it = org_loc.iterator(); 502 while (it.hasNext()) { 503 Annotation currOrgLoc = (Annotation)it.next(); 504 505 if (null == bestAntecedent) { 506 //discard cataphoric references 507 if (currOrgLoc.getStartNode().getOffset().longValue() < 508 pronoun.getStartNode().getOffset().longValue()) { 509 bestAntecedent = currOrgLoc; 510 } 511 } 512 else { 513 bestAntecedent = this._chooseAntecedent$IT$ITS$ITSELF$(bestAntecedent,currOrgLoc,pronoun); 514 } 515 } 516 517 if (0 == currSentenceIndex--) 518 break; 519 } 520 521 return bestAntecedent; 522 } 523 524 525 /** --- */ 526 private Annotation _resolve$I$ME$MY$MYSELF$(Annotation pronoun, int sentenceIndex) { 527 528 //0. 
preconditions 529 Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE)); 530 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) || 531 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY)); 532 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 533 Assert.assertTrue(pronounString.equalsIgnoreCase("I") || 534 pronounString.equalsIgnoreCase("MY") || 535 pronounString.equalsIgnoreCase("ME") || 536 pronounString.equalsIgnoreCase("MYSELF")); 537 538 //0.5 sanity check 539 //if there are not quotes at all in the text then exit 540 if (0 == this.quotedText.length) { 541 //System.out.println("TEXT WITH NO QUOTES ENCOUNTERED..."); 542 return null; 543 } 544 545 546 //1. 547 Annotation bestAntecedent = null; 548 549 int closestQuoteIndex = java.util.Arrays.binarySearch(this.quotedText,pronoun,ANNOTATION_OFFSET_COMPARATOR); 550 //normalize index 551 if (closestQuoteIndex < 0) { 552 closestQuoteIndex = -closestQuoteIndex -1 -1; 553 } 554 555 //still not good? 556 if (closestQuoteIndex < 0) { 557 closestQuoteIndex = 0; 558 } 559 560 //get closest Quote 561 Quote quoteContext = this.quotedText[closestQuoteIndex]; 562 563 //assure that the pronoun is contained in the quoted text fragment 564 //otherwise exit 565 566 if (pronoun.getStartNode().getOffset().intValue() > quoteContext.getEndOffset().intValue() || 567 pronoun.getEndNode().getOffset().intValue() < quoteContext.getStartOffset().intValue()) { 568 //oops, probably incorrect text - I/My/Me is not part of quoted text fragment 569 //exit 570 //System.out.println("Oops! ["+pronounString+"] not part of quoted fragment..."); 571 return null; 572 } 573 574 //get the Persons that precede/succeed the quoted fragment 575 //the order is: 576 // 577 //[1]. if there exists a Person or pronoun in {he, she} following the quoted fragment but 578 //in the same sentence, then use it 579 //i.e. 
["PRN1(x)...", said X ...A, B, C ....] 580 // 581 //[2]. if there is a Person (NOT a pronoun) in the same sentence, 582 // preceding the quote, then use it 583 //i.e. . [A, B, C...X ..."PRN1(x) ..."...] 584 // 585 586 //try [1] 587 //get the succeeding Persons/pronouns 588 AnnotationSet succCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_AFTER); 589 if (false == succCandidates.isEmpty()) { 590 //cool, we have candidates, pick up the one closest to the end quote 591 Iterator it = succCandidates.iterator(); 592 593 while (it.hasNext()) { 594 Annotation currCandidate = (Annotation)it.next(); 595 if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) > 0) { 596 //wow, we have a candidate that is closer to the quote 597 bestAntecedent = currCandidate; 598 } 599 } 600 } 601 602 //try [2] 603 //get the preceding Persons/pronouns 604 if (null == bestAntecedent) { 605 AnnotationSet precCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_BEFORE); 606 if (false == precCandidates.isEmpty()) { 607 //cool, we have candidates, pick up the one closest to the end quote 608 Iterator it = precCandidates.iterator(); 609 610 while (it.hasNext()) { 611 Annotation currCandidate = (Annotation)it.next(); 612 if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) < 0) { 613 //wow, we have a candidate that is closer to the quote 614 bestAntecedent = currCandidate; 615 } 616 } 617 } 618 } 619 620 //try [3] 621 //get the Persons/pronouns back in context 622 if (null == bestAntecedent) { 623 AnnotationSet precCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_BACK); 624 if (false == precCandidates.isEmpty()) { 625 //cool, we have candidates, pick up the one closest to the end quote 626 Iterator it = precCandidates.iterator(); 627 628 while (it.hasNext()) { 629 Annotation currCandidate = (Annotation)it.next(); 630 if (null == bestAntecedent || 
ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) > 0) { 631 //wow, we have a candidate that is closer to the quote 632 bestAntecedent = currCandidate; 633 } 634 } 635 } 636 } 637 638 return bestAntecedent; 639 } 640 641 642 /** --- */ 643 private void preprocess() throws ExecutionException { 644 645 //0.5 cleanup 646 this.personGender.clear(); 647 this.anaphor2antecedent.clear(); 648 649 //1.get all annotation in the input set 650 if ( this.annotationSetName == null || this.annotationSetName.equals("")) { 651 this.defaultAnnotations = this.document.getAnnotations(); 652 } 653 else { 654 this.defaultAnnotations = this.document.getAnnotations(annotationSetName); 655 } 656 657 //if none found, print warning and exit 658 if (this.defaultAnnotations == null || this.defaultAnnotations.isEmpty()) { 659 Err.prln("Coref Warning: No annotations found for processing!"); 660 return; 661 } 662 663 664 665 //2.1 remove QT annotations if left from previous execution 666 AnnotationSet qtSet = this.defaultAnnotations.get(QUOTED_TEXT_TYPE); 667 if (null != qtSet) { 668 qtSet.clear(); 669 } 670 671 //2.2. 
run quoted text transducer to generate "Quoted Text" annotations 672 this.qtTransducer.execute(); 673 674 //3.1 remove pleonastic annotations if left from previous execution 675 AnnotationSet pleonSet = this.defaultAnnotations.get(PLEONASTIC_TYPE); 676 if (null != pleonSet) { 677 pleonSet.clear(); 678 } 679 680 //3.2 run quoted text transducer to generate "Pleonasm" annotations 681 this.pleonTransducer.execute(); 682 683 //4.get all SENTENCE annotations 684 AnnotationSet sentenceAnnotations = this.defaultAnnotations.get(SENTENCE_ANNOTATION_TYPE); 685 686 this.textSentences = new Sentence[sentenceAnnotations.size()]; 687 Object[] sentenceArray = sentenceAnnotations.toArray(); 688 689 java.util.Arrays.sort(sentenceArray,ANNOTATION_OFFSET_COMPARATOR); 690 691 for (int i=0; i< sentenceArray.length; i++) { 692 693 Annotation currSentence = (Annotation)sentenceArray[i]; 694 Long sentStartOffset = currSentence.getStartNode().getOffset(); 695 Long sentEndOffset = currSentence.getEndNode().getOffset(); 696 697 //4.1. get PERSOSNS in this sentence 698 AnnotationSet sentPersons = this.defaultAnnotations.get(PERSON_ANNOTATION_TYPE, 699 sentStartOffset, 700 sentEndOffset); 701 702 //4.2. get ORGANIZATIONS in this sentence 703 AnnotationSet sentOrgs = this.defaultAnnotations.get(ORGANIZATION_ANNOTATION_TYPE, 704 sentStartOffset, 705 sentEndOffset); 706 707 //4.3. get LOCATION in this sentence 708 AnnotationSet sentLocs = this.defaultAnnotations.get(LOCATION_ANNOTATION_TYPE, 709 sentStartOffset, 710 sentEndOffset); 711 712 //4.5. create a Sentence for thei SENTENCE annotation 713 this.textSentences[i] = new Sentence(i, 714 0, 715 sentStartOffset, 716 sentEndOffset, 717 sentPersons, 718 sentOrgs, 719 sentLocs 720 ); 721 722 //4.6. 
for all PERSONs in the sentence - find their gender using the 723 //orthographic coreferences if the gender of some entity is unknown 724 Iterator itPersons = sentPersons.iterator(); 725 while (itPersons.hasNext()) { 726 Annotation currPerson = (Annotation)itPersons.next(); 727 String gender = this.findPersonGender(currPerson); 728 this.personGender.put(currPerson,gender); 729 } 730 } 731 732 //5. initialise the quoted text fragments 733 AnnotationSet sentQuotes = this.defaultAnnotations.get(QUOTED_TEXT_TYPE); 734 735 //if none then return 736 if (null == sentQuotes) { 737 this.quotedText = new Quote[0]; 738 } 739 else { 740 this.quotedText = new Quote[sentQuotes.size()]; 741 742 Object[] quotesArray = sentQuotes.toArray(); 743 java.util.Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR); 744 745 for (int i =0; i < quotesArray.length; i++) { 746 this.quotedText[i] = new Quote((Annotation)quotesArray[i],i); 747 } 748 } 749 750 //6. initialuse the plonastic It annotations 751 AnnotationSet plaonasticSet = this.defaultAnnotations.get(PLEONASTIC_TYPE); 752 753 if (null == plaonasticSet) { 754 this.pleonasticIt = new Annotation[0]; 755 } 756 else { 757 this.pleonasticIt = new Annotation[plaonasticSet.size()]; 758 759 Object[] quotesArray = plaonasticSet.toArray(); 760 java.util.Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR); 761 762 for (int i=0; i< this.pleonasticIt.length; i++) { 763 this.pleonasticIt[i] = (Annotation)quotesArray[i]; 764 } 765 } 766 767 } 768 769 770 /** --- */ 771 private String findPersonGender(Annotation person) { 772 773 String result = (String)person.getFeatures().get(PERSON_GENDER_FEATURE_NAME); 774 775 if (null==result) { 776 //gender is unknown - try to find it from the ortho coreferences 777 List orthoMatches = (List)person.getFeatures().get(ANNOTATION_COREF_FEATURE_NAME); 778 779 if (null != orthoMatches) { 780 Iterator itMatches = orthoMatches.iterator(); 781 782 while (itMatches.hasNext()) { 783 Integer correferringID = 
(Integer)itMatches.next(); 784 Annotation coreferringEntity = this.defaultAnnotations.get(correferringID); 785 Assert.assertTrue(coreferringEntity.getType().equalsIgnoreCase(PERSON_ANNOTATION_TYPE)); 786 String correferringGender = (String)coreferringEntity.getFeatures().get(PERSON_GENDER_FEATURE_NAME); 787 788 if (null != correferringGender) { 789 result = correferringGender; 790 break; 791 } 792 } 793 } 794 } 795 796 return result; 797 } 798 799 800 /** --- */ 801 private static class AnnotationOffsetComparator implements Comparator { 802 803 private int _getOffset(Object o) { 804 805 if (o instanceof Annotation) { 806 return ((Annotation)o).getEndNode().getOffset().intValue(); 807 } 808 else if (o instanceof Sentence) { 809 return ((Sentence)o).getStartOffset().intValue(); 810 } 811 else if (o instanceof Quote) { 812 return ((Quote)o).getStartOffset().intValue(); 813 } 814 else if (o instanceof Node) { 815 return ((Node)o).getOffset().intValue(); 816 } 817 else { 818 throw new IllegalArgumentException(); 819 } 820 } 821 822 public int compare(Object o1,Object o2) { 823 824 //0. preconditions 825 Assert.assertNotNull(o1); 826 Assert.assertNotNull(o2); 827 Assert.assertTrue(o1 instanceof Annotation || 828 o1 instanceof Sentence || 829 o1 instanceof Quote || 830 o1 instanceof Node); 831 Assert.assertTrue(o2 instanceof Annotation || 832 o2 instanceof Sentence || 833 o2 instanceof Quote || 834 o2 instanceof Node); 835 836 int offset1 = _getOffset(o1); 837 int offset2 = _getOffset(o2); 838 839 return offset1 - offset2; 840 } 841 } 842 843 844 /** --- */ 845 private Annotation _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(Annotation ant1, Annotation ant2, Annotation pronoun) { 846 847 //0. 
preconditions 848 Assert.assertNotNull(ant1); 849 Assert.assertNotNull(ant2); 850 Assert.assertNotNull(pronoun); 851 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) || 852 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY)); 853 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 854 Assert.assertTrue(pronounString.equalsIgnoreCase("SHE") || 855 pronounString.equalsIgnoreCase("HER") || 856 pronounString.equalsIgnoreCase("HE") || 857 pronounString.equalsIgnoreCase("HIM") || 858 pronounString.equalsIgnoreCase("HIS") || 859 pronounString.equalsIgnoreCase("HIMSELF")); 860 861 Long offset1 = ant1.getStartNode().getOffset(); 862 Long offset2 = ant2.getStartNode().getOffset(); 863 Long offsetPrn = pronoun.getStartNode().getOffset(); 864 865 long diff1 = offsetPrn.longValue() - offset1.longValue(); 866 long diff2 = offsetPrn.longValue() - offset2.longValue(); 867 // Assert.assertTrue(diff1 != 0 && diff2 != 0); 868 //reject candidates that overlap with the pronoun 869 if (diff1 == 0) { 870 return ant2; 871 } 872 else if (diff2 == 0) { 873 return ant1; 874 } 875 876 //get the one CLOSEST AND PRECEDING the pronoun 877 if (diff1 > 0 && diff2 > 0) { 878 //we have [...antecedentA...AntecedentB....pronoun...] ==> choose B 879 if (diff1 < diff2) 880 return ant1; 881 else 882 return ant2; 883 } 884 else if (diff1 < 0 && diff2 < 0) { 885 //we have [...pronoun ...antecedentA...AntecedentB.......] ==> choose A 886 if (Math.abs(diff1) < Math.abs(diff2)) 887 return ant1; 888 else 889 return ant2; 890 } 891 else { 892 Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2)); 893 //we have [antecedentA...pronoun...AntecedentB] ==> choose A 894 if (diff1 > 0) 895 return ant1; 896 else 897 return ant2; 898 } 899 } 900 901 /** --- */ 902 private Annotation _chooseAntecedent$IT$ITS$ITSELF$(Annotation ant1, Annotation ant2, Annotation pronoun) { 903 904 //0. 
preconditions 905 Assert.assertNotNull(ant1); 906 Assert.assertNotNull(ant2); 907 Assert.assertNotNull(pronoun); 908 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) || 909 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY)); 910 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 911 912 Assert.assertTrue(pronounString.equalsIgnoreCase("IT") || 913 pronounString.equalsIgnoreCase("ITS") || 914 pronounString.equalsIgnoreCase("ITSELF")); 915 916 Long offset1 = ant1.getStartNode().getOffset(); 917 Long offset2 = ant2.getStartNode().getOffset(); 918 Long offsetPrn = pronoun.getStartNode().getOffset(); 919 long diff1 = offsetPrn.longValue() - offset1.longValue(); 920 long diff2 = offsetPrn.longValue() - offset2.longValue(); 921 // Assert.assertTrue(diff1 != 0 && diff2 != 0); 922 //reject candidates that overlap with the pronoun 923 if (diff1 == 0) { 924 return ant2; 925 } 926 else if (diff2 == 0) { 927 return ant1; 928 } 929 930 931 //get the one CLOSEST AND PRECEDING the pronoun 932 if (diff1 > 0 && diff2 > 0) { 933 //we have [...antecedentA...AntecedentB....pronoun...] ==> choose B 934 if (diff1 < diff2) 935 return ant1; 936 else 937 return ant2; 938 } 939 else if (diff1 > 0){ 940 Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2)); 941 //we have [antecedentA...pronoun...AntecedentB] ==> choose A 942 return ant1; 943 } 944 else if (diff2 > 0){ 945 Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2)); 946 //we have [antecedentA...pronoun...AntecedentB] ==> choose A 947 return ant2; 948 } 949 else { 950 //both possible antecedents are BEHIND the anaophoric pronoun - i.e. 
//we have either
      //cataphora, or nominal antecedent, or an antecedent that is further back in scope
      //in any case - discard the antecedents
      return null;
    }
  }


  /**
   * A quoted-text fragment of the document together with three sets of
   * antecedent candidates computed for it at construction time: those preceding
   * the quote, those following it, and those found one sentence further back.
   * Each set holds the Person annotations of the relevant sentence plus the
   * "he"/"she" pronoun tokens found in the relevant span.
   */
  private class Quote {

    /** Mode: candidates in the sentence where the quote ends, located after the quote. */
    public static final int ANTEC_AFTER = 1;
    /** Mode: candidates in the sentence where the quote starts, located before the quote. */
    public static final int ANTEC_BEFORE = 2;
    /** Mode: candidates in the sentence preceding the one where the quote starts. */
    public static final int ANTEC_BACK = 3;

    /** Candidates computed in {@link #ANTEC_BEFORE} mode. */
    private AnnotationSet antecedentsBefore;
    /** Candidates computed in {@link #ANTEC_AFTER} mode. */
    private AnnotationSet antecedentsAfter;
    /**
     * Candidates computed in {@link #ANTEC_BACK} mode; stays <code>null</code>
     * when the quote starts in the first sentence.
     */
    private AnnotationSet antecedentsBackInContext;
    /** The annotation that delimits the quoted text. */
    private Annotation quoteAnnotation;
    /** Position of this quote in the enclosing object's <code>quotedText</code> array. */
    private int quoteIndex;

    /**
     * Creates the quote and immediately computes its candidate sets.
     *
     * @param quoteAnnotation the annotation spanning the quoted text
     * @param index position of this quote among the document's quotes
     */
    public Quote(Annotation quoteAnnotation, int index) {

      this.quoteAnnotation = quoteAnnotation;
      this.quoteIndex = index;
      init();
    }

    /**
     * Fills the three candidate sets. Requires <code>textSentences</code> to
     * have been populated already.
     */
    private void init() {

      //0. preconditions
      Assert.assertNotNull(textSentences);

      //1. candidates preceding the quote, taken from the sentence that contains
      //the opening quote mark
      int startSentenceIndex = sentenceIndexFor(this.quoteAnnotation.getStartNode());
      this.antecedentsBefore = generateAntecedentCandidates(startSentenceIndex,
                                                            this.quoteIndex,
                                                            ANTEC_BEFORE);

      //2. candidates from the sentence preceding the one where the quote starts
      //(there is no such sentence when the quote starts in the first one)
      if (startSentenceIndex > 0) {
        this.antecedentsBackInContext = generateAntecedentCandidates(startSentenceIndex - 1,
                                                                     this.quoteIndex,
                                                                     ANTEC_BACK);
      }

      //3. candidates succeeding the quote, taken from the sentence that contains
      //the closing quote mark
      int endSentenceIndex = sentenceIndexFor(this.quoteAnnotation.getEndNode());
      this.antecedentsAfter = generateAntecedentCandidates(endSentenceIndex,
                                                           this.quoteIndex,
                                                           ANTEC_AFTER);
    }

    /**
     * Binary-searches <code>textSentences</code> for <code>key</code> using
     * <code>ANNOTATION_OFFSET_COMPARATOR</code> and turns the result into a
     * usable sentence index.
     *
     * @param key the object to look up (a start or end node of the quote)
     * @return the matching index, or for a miss the index just before the
     *         insertion point, never less than 0
     */
    private int sentenceIndexFor(Object key) {

      int pos = java.util.Arrays.binarySearch(textSentences,
                                              key,
                                              ANNOTATION_OFFSET_COMPARATOR);

      //a miss is reported as -(insertionPoint) - 1; the sentence we want is the
      //one just before the insertion point
      int index = pos >= 0 ? pos : (-pos - 1) - 1;

      //a key that sorts before every sentence maps to the first sentence
      return index < 0 ? 0 : index;
    }

    /**
     * Collects the antecedent candidates of one sentence for the given mode.
     *
     * @param sentenceNumber index into <code>textSentences</code>, must be &gt;= 0
     * @param quoteNumber index of the quote being resolved, must be &gt;= 0
     * @param mode one of {@link #ANTEC_AFTER}, {@link #ANTEC_BEFORE}, {@link #ANTEC_BACK}
     * @return a new annotation set with the sentence's persons and the he/she
     *         pronouns of the relevant span, restricted according to the mode
     */
    private AnnotationSet generateAntecedentCandidates(int sentenceNumber,
                                                        int quoteNumber,
                                                        int mode) {

      //0. preconditions
      Assert.assertTrue(sentenceNumber >= 0);
      Assert.assertTrue(quoteNumber >= 0);
      Assert.assertTrue(mode == Quote.ANTEC_AFTER ||
                        mode == Quote.ANTEC_BEFORE ||
                        mode == Quote.ANTEC_BACK);

      //1. get the sentence
      Sentence sentence = textSentences[sentenceNumber];

      //2. start from a copy of the persons of that sentence
      AnnotationSet antecedents = new AnnotationSetImpl(sentence.getPersons());

      //3. get the annotations of the span that is relevant for the mode
      AnnotationSet annotations = null;

      switch(mode) {

        case ANTEC_BEFORE:
          //from the sentence start up to the opening quote mark
          annotations = defaultAnnotations.getContained(sentence.getStartOffset(),
                                                        this.getStartOffset());
          break;

        case ANTEC_AFTER:
          //from the closing quote mark up to the sentence end
          annotations = defaultAnnotations.getContained(this.getEndOffset(),
                                                        sentence.getEndOffset());
          break;

        case ANTEC_BACK:
          //the whole sentence
          annotations = defaultAnnotations.getContained(sentence.getStartOffset(),
                                                        sentence.getEndOffset());
          break;
      }

      //4. add the he/she pronouns found in that span
      if (null != annotations) {
        AnnotationSet pronouns = annotations.get(TOKEN_ANNOTATION_TYPE,PRP_RESTRICTION);

        if (null != pronouns) {

          Iterator it = pronouns.iterator();
          while (it.hasNext()) {
            Annotation currPronoun = (Annotation)it.next();
            String pronounString = (String)currPronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);

            //keep only HE/SHE
            if (null != pronounString &&
                (pronounString.equalsIgnoreCase("he") || pronounString.equalsIgnoreCase("she"))
                ) {
              antecedents.add(currPronoun);
            }
          }//while
        }//if
      }//if


      //5. depending on the mode, restrict the candidates to these that
      //precede/succeed the quoted fragment
      //
      //for ANTEC_BEFORE, keep the ones #preceding# the quote, contained in the sentence where
      //the quote *starts*
      //
      //for ANTEC_AFTER, keep the ones #succeeding# the quote, contained in the sentence where
      //the quote *ends*
      //
      //for ANTEC_BACK, we are operating in the context of the sentence previous to the
      //sentence where the quote starts. I.e. we're resolving a case like
      // [sss "q1q1q1q1" s1s1s1s1]["q2q2q2q2"]
      //...and we want to get the entities from the s1s1 part - they *succeed* the #previous# quote
      //Note that the current sentence is the first one, not the second
      //
      //offsets are compared as long values in every branch so that no offset is
      //narrowed to int
      Iterator itPersons = antecedents.iterator();

      while (itPersons.hasNext()) {
        Annotation currPerson = (Annotation)itPersons.next();

        if (Quote.ANTEC_BEFORE == mode &&
            currPerson.getStartNode().getOffset().longValue() > getStartOffset().longValue()) {
          //starts after the opening quote mark - does not precede the quote
          itPersons.remove();
        }
        else if (Quote.ANTEC_AFTER == mode &&
                 currPerson.getStartNode().getOffset().longValue() < getEndOffset().longValue()) {
          //starts before the closing quote mark - does not succeed the quote
          itPersons.remove();
        }
        else if (Quote.ANTEC_BACK == mode) {
          //this one is tricky:
          //we're operating in the sentence previous to the quote being resolved, so
          //we look at the quote previous to the one we're resolving (prevQuote) and
          //keep only the candidates succeeding it

          //there is no previous quote when the current quote is the first one
          if (quoteNumber > 0) {
            Quote prevQuote = PronominalCoref.this.quotedText[quoteNumber-1];

            //restrict to the candidates succeeding the previous quote
            if (currPerson.getStartNode().getOffset().longValue() < prevQuote.getEndOffset().longValue()) {
              itPersons.remove();
            }
          }
        }
      }

      return antecedents;
    }

    /** @return the offset of the start node of the quote annotation */
    public Long getStartOffset() {
      return this.quoteAnnotation.getStartNode().getOffset();
    }

    /** @return the offset of the end node of the quote annotation */
    public Long getEndOffset() {
      return this.quoteAnnotation.getEndNode().getOffset();
    }

    /**
     * Returns one of the candidate sets computed at construction time.
     *
     * @param type one of {@link #ANTEC_AFTER}, {@link #ANTEC_BEFORE}, {@link #ANTEC_BACK}
     * @return the matching set; for {@link #ANTEC_BACK} this is <code>null</code>
     *         when the quote starts in the first sentence
     * @throws IllegalArgumentException for any other value of <code>type</code>
     */
    public AnnotationSet getAntecedentCandidates(int type) {

      switch(type) {

        case ANTEC_AFTER:
          return this.antecedentsAfter;

        case ANTEC_BEFORE:
          return this.antecedentsBefore;

        case ANTEC_BACK:
          return this.antecedentsBackInContext;

        default:
          throw new IllegalArgumentException();
      }
    }

  }


  /**
   * Plain holder for one sentence: its sentence and paragraph numbers, its
   * start and end offsets, and the person, organization and location
   * annotation sets handed to the constructor.
   */
  private class Sentence {

    /** Sentence number as passed to the constructor. */
    private int sentNumber;
    /** Paragraph number as passed to the constructor. */
    private int paraNumber;
    /** Start offset of the sentence. */
    private Long startOffset;
    /** End offset of the sentence. */
    private Long endOffset;
    /** Person annotations of the sentence. */
    private AnnotationSet persons;
    /** Organization annotations of the sentence. */
    private AnnotationSet organizations;
    /** Location annotations of the sentence. */
    private AnnotationSet locations;

    /**
     * Stores the given values as they are; the annotation sets are not copied.
     *
     * @param sentNumber sentence number
     * @param paraNumber paragraph number
     * @param startOffset start offset of the sentence
     * @param endOffset end offset of the sentence
     * @param persons person annotations of the sentence
     * @param organizations organization annotations of the sentence
     * @param locations location annotations of the sentence
     */
    public Sentence(int sentNumber,
                    int paraNumber,
                    Long startOffset,
                    Long endOffset,
                    AnnotationSet persons,
                    AnnotationSet organizations,
                    AnnotationSet locations) {

      this.sentNumber = sentNumber;
      this.paraNumber = paraNumber;
      this.startOffset = startOffset;
      this.endOffset = endOffset;
      this.persons = persons;
      this.organizations = organizations;
      this.locations = locations;
    }

    /** @return the start offset of the sentence */
    public Long getStartOffset() {
      return this.startOffset;
    }

    /** @return the end offset of the sentence */
    public Long getEndOffset() {
      return this.endOffset;
    }

    /** @return the person annotations of the sentence */
    public AnnotationSet getPersons() {
      return this.persons;
    }

    /** @return the organization annotations of the sentence */
    public AnnotationSet getOrganizations() {
      return this.organizations;
    }

    /** @return the location annotations of the sentence */
    public AnnotationSet getLocations() {
      return this.locations;
    }
  }

}
|
PronominalCoref |
|