1
15
16 package gate.creole.coref;
17
18 import java.net.MalformedURLException;
19 import java.net.URL;
20 import java.util.*;
21
22 import junit.framework.Assert;
23
24 import gate.*;
25 import gate.annotation.AnnotationSetImpl;
26 import gate.creole.*;
27 import gate.util.*;
28 import gate.util.Err;
29 import gate.util.SimpleFeatureMapImpl;
30
31 public class PronominalCoref extends AbstractLanguageAnalyser
32 implements ProcessingResource, ANNIEConstants{
33
/**
 * String key "document". Not referenced elsewhere in this file; presumably
 * the name of the document parameter of this resource -- TODO confirm.
 */
public static final String COREF_DOCUMENT_PARAMETER_NAME = "document";

/**
 * String key "annotationSetName". Not referenced elsewhere in this file;
 * presumably the name of the annotation-set parameter -- TODO confirm.
 */
public static final String COREF_ANN_SET_PARAMETER_NAME = "annotationSetName";


/** When true, findAntecedent() reports pronouns it does not handle. */
private static final boolean DEBUG = false;

/** String form of the class-path URL of the quoted-text JAPE grammar. */
private static final String QT_GRAMMAR_URL = Gate.class.
getResource(Files.getResourcePath() +
"/creole/coref/quoted_text.jape").toString();
/** String form of the class-path URL of the pleonastic-"it" JAPE grammar. */
private static final String PLEON_GRAMMAR_URL = Gate.class.getResource(
Files.getResourcePath() +
"/creole/coref/pleonasm.jape").toString();

/** Annotation type read back by preprocess() after running qtTransducer. */
private static final String QUOTED_TEXT_TYPE = "QuotedText";
/** Annotation type read back by preprocess() after running pleonTransducer. */
private static final String PLEONASTIC_TYPE = "PleonasticIt";

/** Token category value used by execute() to select personal pronouns. */
private static final String PRP_CATEGORY = "PRP";
/** Token category value used by execute() to select possessive pronouns. */
private static final String PRP$_CATEGORY = "PRP$";

/** Number of preceding sentences the he/she resolvers treat as in scope. */
private static final int SENTENCES_IN_SCOPE = 3;

/** Shared comparator used for every sort and binary search in this class. */
private static AnnotationOffsetComparator ANNOTATION_OFFSET_COMPARATOR;

/** Name of the annotation set to work on; null or "" selects the default set. */
private String annotationSetName;

/** Transducer loaded with the quoted-text grammar in init(). */
private Transducer qtTransducer;

/** Transducer loaded with the pleonastic-"it" grammar in init(). */
private Transducer pleonTransducer;

/** Annotation set chosen by preprocess() for the current document. */
private AnnotationSet defaultAnnotations;

/** Sentences of the current document, sorted with the offset comparator. */
private Sentence[] textSentences;

/** Quoted-text spans of the current document, sorted with the offset comparator. */
private Quote[] quotedText;

/** Pleonastic-"it" annotations of the current document, sorted with the offset comparator. */
private Annotation[] pleonasticIt;

/** Maps each Person annotation to its gender string (the value may be null). */
private HashMap personGender;

/** Maps each processed pronoun annotation to its antecedent (the value may be null). */
private HashMap anaphor2antecedent;

/** Feature restriction "category == PRP", filled in by the static initializer. */
private static final FeatureMap PRP_RESTRICTION;

/** When false, _resolve$IT$ITS$ITSELF$ returns null without searching. */
private boolean resolveIt = true;


// Builds the shared comparator and the PRP feature restriction once per class load.
static {
ANNOTATION_OFFSET_COMPARATOR = new AnnotationOffsetComparator();
PRP_RESTRICTION = new SimpleFeatureMapImpl();
PRP_RESTRICTION.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY);
}
90
91
/**
 * Creates the resource with empty gender / anaphora maps and two freshly
 * constructed transducers. The transducers receive their grammars later,
 * in {@link #init()}.
 */
public PronominalCoref() {
  personGender = new HashMap();
  anaphor2antecedent = new HashMap();

  qtTransducer = new gate.creole.Transducer();
  pleonTransducer = new gate.creole.Transducer();
}
99
100
101 public Resource init() throws ResourceInstantiationException {
102
103 Assert.assertNotNull(this.qtTransducer);
105
106 URL qtGrammarURL = null;
108 try {
109 qtGrammarURL = new URL(QT_GRAMMAR_URL);
110 }
111 catch(MalformedURLException mue) {
112 throw new ResourceInstantiationException(mue);
113 }
114 this.qtTransducer.setGrammarURL(qtGrammarURL);
115 this.qtTransducer.setEncoding("UTF-8");
116 this.qtTransducer.init();
117
118 URL pleonGrammarURL = null;
120 try {
121 pleonGrammarURL = new URL(PLEON_GRAMMAR_URL);
122 }
123 catch(MalformedURLException mue) {
124 throw new ResourceInstantiationException(mue);
125 }
126 this.pleonTransducer.setGrammarURL(pleonGrammarURL);
127 this.pleonTransducer.setEncoding("UTF-8");
128 this.pleonTransducer.init();
129
130
131 return super.init();
133 }
135
143 public void reInit() throws ResourceInstantiationException {
144
145 if (null != this.qtTransducer) {
146 this.qtTransducer.reInit();
147 }
148
149 if (null != this.pleonTransducer) {
150 this.pleonTransducer.reInit();
151 }
152
153 init();
154 }
156
157
158 public void setDocument(Document newDocument) {
159
160
163 this.qtTransducer.setDocument(newDocument);
165 this.pleonTransducer.setDocument(newDocument);
166
167 super.setDocument(newDocument);
169 }
170
171
172 public void setAnnotationSetName(String annotationSetName) {
173 this.annotationSetName = annotationSetName;
174 }
175
176
177
178 public String getAnnotationSetName() {
179 return annotationSetName;
180 }
181
182
183 public void setResolveIt(Boolean newValue) {
184 this.resolveIt = newValue.booleanValue();
185 }
186
187
188 public Boolean getResolveIt() {
189 return new Boolean(this.resolveIt);
190 }
191
192
193
197 public void execute() throws ExecutionException{
198
199 if(null == this.document) {
201 throw new ExecutionException("[coreference] Document is not set!");
202 }
203
204 preprocess();
206
216 FeatureMap constraintPRP = new SimpleFeatureMapImpl();
218 constraintPRP.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY);
219 AnnotationSet personalPronouns = this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,constraintPRP);
220
221 FeatureMap constraintPRP$ = new SimpleFeatureMapImpl();
223 constraintPRP$.put(TOKEN_CATEGORY_FEATURE_NAME,PRP$_CATEGORY);
224 AnnotationSet possesivePronouns = this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,constraintPRP$);
225
226 AnnotationSet pronouns = personalPronouns;
228 if (null == personalPronouns) {
229 pronouns = possesivePronouns;
230 }
231 else if (null != possesivePronouns) {
232 pronouns.addAll(possesivePronouns);
233 }
234
235 if (null == pronouns) {
237 return;
239 }
240
241 Object[] arrPronouns = pronouns.toArray();
243 java.util.Arrays.sort(arrPronouns,ANNOTATION_OFFSET_COMPARATOR);
244
245 pronouns = personalPronouns = possesivePronouns = null;
247
248 int prnSentIndex = 0;
249
250
251 for (int i=0; i< arrPronouns.length; i++) {
253 Annotation currPronoun = (Annotation)arrPronouns[i];
254 while (this.textSentences[prnSentIndex].getEndOffset().longValue() <
255 currPronoun.getEndNode().getOffset().longValue()) {
256 prnSentIndex++;
257 }
258
259 Sentence currSentence = this.textSentences[prnSentIndex];
260 Assert.assertTrue(currSentence.getStartOffset().longValue() <= currPronoun.getStartNode().getOffset().longValue());
261 Assert.assertTrue(currSentence.getEndOffset().longValue() >= currPronoun.getEndNode().getOffset().longValue());
262
263 Annotation antc = findAntecedent(currPronoun,prnSentIndex);
265
266 this.anaphor2antecedent.put(currPronoun,antc);
268 }
269
270 }
272
273
274
275 public HashMap getResolvedAnaphora() {
276 return this.anaphor2antecedent;
277 }
278
279
280 private Annotation findAntecedent(Annotation currPronoun,int prnSentIndex) {
281
282 Assert.assertNotNull(currPronoun);
284 Assert.assertTrue(prnSentIndex >= 0);
285 Assert.assertTrue(currPronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
286 Assert.assertTrue(currPronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
287 currPronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
288
289 String strPronoun = (String)currPronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
291
292 Assert.assertNotNull(strPronoun);
293
294 if (strPronoun.equalsIgnoreCase("HE") ||
296 strPronoun.equalsIgnoreCase("HIM") ||
297 strPronoun.equalsIgnoreCase("HIS") ||
298 strPronoun.equalsIgnoreCase("HIMSELF")) {
299 return _resolve$HE$HIM$HIS$HIMSELF$(currPronoun,prnSentIndex);
300 }
301 else if (strPronoun.equalsIgnoreCase("SHE") ||
302 strPronoun.equalsIgnoreCase("HER")) {
303 return _resolve$SHE$HER$(currPronoun,prnSentIndex);
304 }
305 else if (strPronoun.equalsIgnoreCase("IT") ||
306 strPronoun.equalsIgnoreCase("ITS") ||
307 strPronoun.equalsIgnoreCase("ITSELF")) {
308 return _resolve$IT$ITS$ITSELF$(currPronoun,prnSentIndex);
309 }
310 else if (strPronoun.equalsIgnoreCase("I") ||
311 strPronoun.equalsIgnoreCase("ME") ||
312 strPronoun.equalsIgnoreCase("MY") ||
313 strPronoun.equalsIgnoreCase("MYSELF")) {
314 return _resolve$I$ME$MY$MYSELF$(currPronoun,prnSentIndex);
315 }
316 else {
317 if (DEBUG) {
318 gate.util.Err.println("["+strPronoun+"] is not handled yet...");
319 }
320 return null;
321 }
322 }
323
324
325 boolean isPleonastic(Annotation pronoun) {
326
327 Assert.assertNotNull(pronoun);
329 String str = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
330 Assert.assertTrue(str.equalsIgnoreCase("IT"));
331
332 if (this.pleonasticIt.length == 0) {
334 return false;
335 }
336
337 int closestPleonasmIndex = java.util.Arrays.binarySearch(this.pleonasticIt,
339 pronoun,
340 ANNOTATION_OFFSET_COMPARATOR);
341 if (closestPleonasmIndex < 0) {
343 closestPleonasmIndex = -closestPleonasmIndex -1 -1;
344 }
345
346 if (closestPleonasmIndex < 0) {
348 closestPleonasmIndex = 0;
349 }
350
351 Annotation pleonasm = this.pleonasticIt[closestPleonasmIndex];
353
354
357 boolean result = (pleonasm.getStartNode().getOffset().intValue() <= pronoun.getStartNode().getOffset().intValue()
359 &&
360 pleonasm.getEndNode().getOffset().intValue() >= pronoun.getEndNode().getOffset().intValue());
361 return result;
363 }
364
365
366
367 private Annotation _resolve$HE$HIM$HIS$HIMSELF$(Annotation pronoun, int sentenceIndex) {
368
369 Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
371 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
372 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
373 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
374 Assert.assertTrue(pronounString.equalsIgnoreCase("HE") ||
375 pronounString.equalsIgnoreCase("HIM") ||
376 pronounString.equalsIgnoreCase("HIS") ||
377 pronounString.equalsIgnoreCase("HIMSELF"));
378
379 boolean antecedentFound = false;
381 int scopeFirstIndex = sentenceIndex - SENTENCES_IN_SCOPE;
382 if (scopeFirstIndex < 0 ) scopeFirstIndex = 0;
383
384 int currSentenceIndex = sentenceIndex;
385 Annotation bestAntecedent = null;
386
387 while (currSentenceIndex >= scopeFirstIndex || antecedentFound == false) {
388 Sentence currSentence = this.textSentences[currSentenceIndex];
389 AnnotationSet persons = currSentence.getPersons();
390
391 Iterator it = persons.iterator();
392 while (it.hasNext()) {
393 Annotation currPerson = (Annotation)it.next();
394 String gender = (String)this.personGender.get(currPerson);
395
396 if (null == gender ||
397 gender.equalsIgnoreCase("MALE") ||
398 gender.equalsIgnoreCase("UNKNOWN")) {
399 antecedentFound = true;
401
402 if (null == bestAntecedent) {
403 bestAntecedent = currPerson;
404 }
405 else {
406 bestAntecedent = _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(bestAntecedent,currPerson,pronoun);
407 }
408 }
409 }
410
411 if (0 == currSentenceIndex--)
412 break;
413
414 }
415
416 return bestAntecedent;
417 }
418
419
420
421 private Annotation _resolve$SHE$HER$(Annotation pronoun, int sentenceIndex) {
422
423 Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
425 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
426 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
427 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
428 Assert.assertTrue(pronounString.equalsIgnoreCase("SHE") ||
429 pronounString.equalsIgnoreCase("HER"));
430
431 boolean antecedentFound = false;
433 int scopeFirstIndex = sentenceIndex - SENTENCES_IN_SCOPE;
434 if (scopeFirstIndex < 0 ) scopeFirstIndex = 0;
435 int currSentenceIndex = sentenceIndex;
436 Annotation bestAntecedent = null;
437
438 while (currSentenceIndex >= scopeFirstIndex || antecedentFound == false) {
439 Sentence currSentence = this.textSentences[currSentenceIndex];
440 AnnotationSet persons = currSentence.getPersons();
441
442 Iterator it = persons.iterator();
443 while (it.hasNext()) {
444 Annotation currPerson = (Annotation)it.next();
445 String gender = (String)this.personGender.get(currPerson);
446
447 if (null == gender ||
448 gender.equalsIgnoreCase("FEMALE") ||
449 gender.equalsIgnoreCase("UNKNOWN")) {
450 antecedentFound = true;
452
453 if (null == bestAntecedent) {
454 bestAntecedent = currPerson;
455 }
456 else {
457 bestAntecedent = _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(bestAntecedent,currPerson,pronoun);
458 }
459 }
460 }
461
462 if (0 == currSentenceIndex--)
463 break;
464 }
465
466 return bestAntecedent;
467 }
468
469
470
471 private Annotation _resolve$IT$ITS$ITSELF$(Annotation pronoun, int sentenceIndex) {
472 if (! resolveIt)
474 return null;
475
476 Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
478 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
479 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
480 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
481 Assert.assertTrue(pronounString.equalsIgnoreCase("IT") ||
482 pronounString.equalsIgnoreCase("ITS") ||
483 pronounString.equalsIgnoreCase("ITSELF"));
484
485 if (pronounString.equalsIgnoreCase("IT") &&
487 isPleonastic(pronoun)) {
488 return null;
490 }
491
492 int scopeFirstIndex = sentenceIndex - 1;
494 if (scopeFirstIndex < 0 ) scopeFirstIndex = 0;
495
496 int currSentenceIndex = sentenceIndex;
497 Annotation bestAntecedent = null;
498
499 while (currSentenceIndex >= scopeFirstIndex) {
500
501 Sentence currSentence = this.textSentences[currSentenceIndex];
502 AnnotationSet org = currSentence.getOrganizations();
503 AnnotationSet loc = currSentence.getLocations();
504 AnnotationSet org_loc = org;
506 org_loc.addAll(loc);
507
508 Iterator it = org_loc.iterator();
509 while (it.hasNext()) {
510 Annotation currOrgLoc = (Annotation)it.next();
511
512 if (null == bestAntecedent) {
513 if (currOrgLoc.getStartNode().getOffset().longValue() <
515 pronoun.getStartNode().getOffset().longValue()) {
516 bestAntecedent = currOrgLoc;
517 }
518 }
519 else {
520 bestAntecedent = this._chooseAntecedent$IT$ITS$ITSELF$(bestAntecedent,currOrgLoc,pronoun);
521 }
522 }
523
524 if (0 == currSentenceIndex--)
525 break;
526 }
527
528 return bestAntecedent;
529 }
530
531
532
533 private Annotation _resolve$I$ME$MY$MYSELF$(Annotation pronoun, int sentenceIndex) {
534
535 Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
537 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
538 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
539 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
540 Assert.assertTrue(pronounString.equalsIgnoreCase("I") ||
541 pronounString.equalsIgnoreCase("MY") ||
542 pronounString.equalsIgnoreCase("ME") ||
543 pronounString.equalsIgnoreCase("MYSELF"));
544
545 if (0 == this.quotedText.length) {
548 return null;
550 }
551
552
553 Annotation bestAntecedent = null;
555
556 int closestQuoteIndex = java.util.Arrays.binarySearch(this.quotedText,pronoun,ANNOTATION_OFFSET_COMPARATOR);
557 if (closestQuoteIndex < 0) {
559 closestQuoteIndex = -closestQuoteIndex -1 -1;
560 }
561
562 if (closestQuoteIndex < 0) {
564 closestQuoteIndex = 0;
565 }
566
567 Quote quoteContext = this.quotedText[closestQuoteIndex];
569
570
573 if (pronoun.getStartNode().getOffset().intValue() > quoteContext.getEndOffset().intValue() ||
574 pronoun.getEndNode().getOffset().intValue() < quoteContext.getStartOffset().intValue()) {
575 return null;
579 }
580
581
593 AnnotationSet succCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_AFTER);
596 if (false == succCandidates.isEmpty()) {
597 Iterator it = succCandidates.iterator();
599
600 while (it.hasNext()) {
601 Annotation currCandidate = (Annotation)it.next();
602 if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) > 0) {
603 bestAntecedent = currCandidate;
605 }
606 }
607 }
608
609 if (null == bestAntecedent) {
612 AnnotationSet precCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_BEFORE);
613 if (false == precCandidates.isEmpty()) {
614 Iterator it = precCandidates.iterator();
616
617 while (it.hasNext()) {
618 Annotation currCandidate = (Annotation)it.next();
619 if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) < 0) {
620 bestAntecedent = currCandidate;
622 }
623 }
624 }
625 }
626
627 if (null == bestAntecedent) {
630 AnnotationSet precCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_BACK);
631 if (false == precCandidates.isEmpty()) {
632 Iterator it = precCandidates.iterator();
634
635 while (it.hasNext()) {
636 Annotation currCandidate = (Annotation)it.next();
637 if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) > 0) {
638 bestAntecedent = currCandidate;
640 }
641 }
642 }
643 }
644
645 return bestAntecedent;
646 }
647
648
649
650 private void preprocess() throws ExecutionException {
651
652 this.personGender.clear();
654 this.anaphor2antecedent.clear();
655
656 if ( this.annotationSetName == null || this.annotationSetName.equals("")) {
658 this.defaultAnnotations = this.document.getAnnotations();
659 }
660 else {
661 this.defaultAnnotations = this.document.getAnnotations(annotationSetName);
662 }
663
664 if (this.defaultAnnotations == null || this.defaultAnnotations.isEmpty()) {
666 Err.prln("Coref Warning: No annotations found for processing!");
667 return;
668 }
669
670
671
672 AnnotationSet qtSet = this.defaultAnnotations.get(QUOTED_TEXT_TYPE);
674 if (null != qtSet) {
675 qtSet.clear();
676 }
677
678 this.qtTransducer.execute();
680
681 AnnotationSet pleonSet = this.defaultAnnotations.get(PLEONASTIC_TYPE);
683 if (null != pleonSet) {
684 pleonSet.clear();
685 }
686
687 this.pleonTransducer.execute();
689
690 AnnotationSet sentenceAnnotations = this.defaultAnnotations.get(SENTENCE_ANNOTATION_TYPE);
692
693 this.textSentences = new Sentence[sentenceAnnotations.size()];
694 Object[] sentenceArray = sentenceAnnotations.toArray();
695
696 java.util.Arrays.sort(sentenceArray,ANNOTATION_OFFSET_COMPARATOR);
697
698 for (int i=0; i< sentenceArray.length; i++) {
699
700 Annotation currSentence = (Annotation)sentenceArray[i];
701 Long sentStartOffset = currSentence.getStartNode().getOffset();
702 Long sentEndOffset = currSentence.getEndNode().getOffset();
703
704 AnnotationSet sentPersons = this.defaultAnnotations.get(PERSON_ANNOTATION_TYPE,
706 sentStartOffset,
707 sentEndOffset);
708
709 AnnotationSet sentOrgs = this.defaultAnnotations.get(ORGANIZATION_ANNOTATION_TYPE,
711 sentStartOffset,
712 sentEndOffset);
713
714 AnnotationSet sentLocs = this.defaultAnnotations.get(LOCATION_ANNOTATION_TYPE,
716 sentStartOffset,
717 sentEndOffset);
718
719 this.textSentences[i] = new Sentence(i,
721 0,
722 sentStartOffset,
723 sentEndOffset,
724 sentPersons,
725 sentOrgs,
726 sentLocs
727 );
728
729 Iterator itPersons = sentPersons.iterator();
732 while (itPersons.hasNext()) {
733 Annotation currPerson = (Annotation)itPersons.next();
734 String gender = this.findPersonGender(currPerson);
735 this.personGender.put(currPerson,gender);
736 }
737 }
738
739 AnnotationSet sentQuotes = this.defaultAnnotations.get(QUOTED_TEXT_TYPE);
741
742 if (null == sentQuotes) {
744 this.quotedText = new Quote[0];
745 }
746 else {
747 this.quotedText = new Quote[sentQuotes.size()];
748
749 Object[] quotesArray = sentQuotes.toArray();
750 java.util.Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR);
751
752 for (int i =0; i < quotesArray.length; i++) {
753 this.quotedText[i] = new Quote((Annotation)quotesArray[i],i);
754 }
755 }
756
757 AnnotationSet plaonasticSet = this.defaultAnnotations.get(PLEONASTIC_TYPE);
759
760 if (null == plaonasticSet) {
761 this.pleonasticIt = new Annotation[0];
762 }
763 else {
764 this.pleonasticIt = new Annotation[plaonasticSet.size()];
765
766 Object[] quotesArray = plaonasticSet.toArray();
767 java.util.Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR);
768
769 for (int i=0; i< this.pleonasticIt.length; i++) {
770 this.pleonasticIt[i] = (Annotation)quotesArray[i];
771 }
772 }
773
774 }
775
776
777
778 private String findPersonGender(Annotation person) {
779
780 String result = (String)person.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
781
782 if (null==result) {
783 List orthoMatches = (List)person.getFeatures().get(ANNOTATION_COREF_FEATURE_NAME);
785
786 if (null != orthoMatches) {
787 Iterator itMatches = orthoMatches.iterator();
788
789 while (itMatches.hasNext()) {
790 Integer correferringID = (Integer)itMatches.next();
791 Annotation coreferringEntity = this.defaultAnnotations.get(correferringID);
792 Assert.assertTrue(coreferringEntity.getType().equalsIgnoreCase(PERSON_ANNOTATION_TYPE));
793 String correferringGender = (String)coreferringEntity.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
794
795 if (null != correferringGender) {
796 result = correferringGender;
797 break;
798 }
799 }
800 }
801 }
802
803 return result;
804 }
805
806
807
808 private static class AnnotationOffsetComparator implements Comparator {
809
810 private int _getOffset(Object o) {
811
812 if (o instanceof Annotation) {
813 return ((Annotation)o).getEndNode().getOffset().intValue();
814 }
815 else if (o instanceof Sentence) {
816 return ((Sentence)o).getStartOffset().intValue();
817 }
818 else if (o instanceof Quote) {
819 return ((Quote)o).getStartOffset().intValue();
820 }
821 else if (o instanceof Node) {
822 return ((Node)o).getOffset().intValue();
823 }
824 else {
825 throw new IllegalArgumentException();
826 }
827 }
828
829 public int compare(Object o1,Object o2) {
830
831 Assert.assertNotNull(o1);
833 Assert.assertNotNull(o2);
834 Assert.assertTrue(o1 instanceof Annotation ||
835 o1 instanceof Sentence ||
836 o1 instanceof Quote ||
837 o1 instanceof Node);
838 Assert.assertTrue(o2 instanceof Annotation ||
839 o2 instanceof Sentence ||
840 o2 instanceof Quote ||
841 o2 instanceof Node);
842
843 int offset1 = _getOffset(o1);
844 int offset2 = _getOffset(o2);
845
846 return offset1 - offset2;
847 }
848 }
849
850
851
852 private Annotation _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(Annotation ant1, Annotation ant2, Annotation pronoun) {
853
854 Assert.assertNotNull(ant1);
856 Assert.assertNotNull(ant2);
857 Assert.assertNotNull(pronoun);
858 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
859 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
860 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
861 Assert.assertTrue(pronounString.equalsIgnoreCase("SHE") ||
862 pronounString.equalsIgnoreCase("HER") ||
863 pronounString.equalsIgnoreCase("HE") ||
864 pronounString.equalsIgnoreCase("HIM") ||
865 pronounString.equalsIgnoreCase("HIS") ||
866 pronounString.equalsIgnoreCase("HIMSELF"));
867
868 Long offset1 = ant1.getStartNode().getOffset();
869 Long offset2 = ant2.getStartNode().getOffset();
870 Long offsetPrn = pronoun.getStartNode().getOffset();
871
872 long diff1 = offsetPrn.longValue() - offset1.longValue();
873 long diff2 = offsetPrn.longValue() - offset2.longValue();
874 if (diff1 == 0) {
877 return ant2;
878 }
879 else if (diff2 == 0) {
880 return ant1;
881 }
882
883 if (diff1 > 0 && diff2 > 0) {
885 if (diff1 < diff2)
887 return ant1;
888 else
889 return ant2;
890 }
891 else if (diff1 < 0 && diff2 < 0) {
892 if (Math.abs(diff1) < Math.abs(diff2))
894 return ant1;
895 else
896 return ant2;
897 }
898 else {
899 Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2));
900 if (diff1 > 0)
902 return ant1;
903 else
904 return ant2;
905 }
906 }
907
908
909 private Annotation _chooseAntecedent$IT$ITS$ITSELF$(Annotation ant1, Annotation ant2, Annotation pronoun) {
910
911 Assert.assertNotNull(ant1);
913 Assert.assertNotNull(ant2);
914 Assert.assertNotNull(pronoun);
915 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
916 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
917 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
918
919 Assert.assertTrue(pronounString.equalsIgnoreCase("IT") ||
920 pronounString.equalsIgnoreCase("ITS") ||
921 pronounString.equalsIgnoreCase("ITSELF"));
922
923 Long offset1 = ant1.getStartNode().getOffset();
924 Long offset2 = ant2.getStartNode().getOffset();
925 Long offsetPrn = pronoun.getStartNode().getOffset();
926 long diff1 = offsetPrn.longValue() - offset1.longValue();
927 long diff2 = offsetPrn.longValue() - offset2.longValue();
928 if (diff1 == 0) {
931 return ant2;
932 }
933 else if (diff2 == 0) {
934 return ant1;
935 }
936
937
938 if (diff1 > 0 && diff2 > 0) {
940 if (diff1 < diff2)
942 return ant1;
943 else
944 return ant2;
945 }
946 else if (diff1 > 0){
947 Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2));
948 return ant1;
950 }
951 else if (diff2 > 0){
952 Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2));
953 return ant2;
955 }
956 else {
957 return null;
961 }
962 }
963
964
965
966 private class Quote {
967
968
/** Mode / pool selector: candidates located after the quote. */
public static final int ANTEC_AFTER = 1;

/** Mode / pool selector: candidates located before the quote. */
public static final int ANTEC_BEFORE = 2;

/** Mode / pool selector: candidates from the sentence preceding the quote's start sentence. */
public static final int ANTEC_BACK = 3;

/** Candidates for ANTEC_BEFORE; assigned in init(). */
private AnnotationSet antecedentsBefore;

/** Candidates for ANTEC_AFTER; assigned in init(). */
private AnnotationSet antecedentsAfter;

/** Candidates for ANTEC_BACK; stays null when the quote starts in the first sentence. */
private AnnotationSet antecedentsBackInContext;

/** The quoted-text annotation this object wraps. */
private Annotation quoteAnnotation;

/** Index passed to the constructor; used to look up the previous quote in quotedText. */
private int quoteIndex;
984
985
/**
 * Wraps a quoted-text annotation and immediately computes its three
 * pools of antecedent candidates.
 *
 * @param quoteAnnotation the quoted-text annotation
 * @param index           the index recorded for this quote
 */
public Quote(Annotation quoteAnnotation, int index) {
  // record the identity of the quote first; init() relies on both fields
  this.quoteIndex = index;
  this.quoteAnnotation = quoteAnnotation;

  init();
}
992
993
994 private void init() {
995
996 Assert.assertNotNull(textSentences);
998
999 FeatureMap prpTokenRestriction = new SimpleFeatureMapImpl();
1001 prpTokenRestriction.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY);
1002
1003
1005 int quoteStartPos = java.util.Arrays.binarySearch(textSentences,
1007 this.quoteAnnotation.getStartNode(),
1008 ANNOTATION_OFFSET_COMPARATOR);
1009
1010 int startSentenceIndex = quoteStartPos >= 0 ? quoteStartPos
1012 : -quoteStartPos -1 -1; if (startSentenceIndex < 0) {
1015 startSentenceIndex = 0;
1016 }
1017
1018 this.antecedentsBefore = generateAntecedentCandidates(startSentenceIndex,
1021 this.quoteIndex,
1022 ANTEC_BEFORE);
1023
1024
1025 if (startSentenceIndex > 0) {
1028 this.antecedentsBackInContext = generateAntecedentCandidates(startSentenceIndex -1,
1029 this.quoteIndex,
1030 ANTEC_BACK);
1031 }
1032
1033 int quoteEndPos = java.util.Arrays.binarySearch(textSentences,
1036 this.quoteAnnotation.getEndNode(),
1037 ANNOTATION_OFFSET_COMPARATOR);
1038
1039 int endSentenceIndex = quoteEndPos >= 0 ? quoteEndPos
1041 : -quoteEndPos -1 -1; if (endSentenceIndex < 0) {
1044 endSentenceIndex = 0;
1045 }
1046
1047 this.antecedentsAfter = generateAntecedentCandidates(endSentenceIndex,
1048 this.quoteIndex,
1049 ANTEC_AFTER);
1050 }
1052
1053
1054
1055 private AnnotationSet generateAntecedentCandidates(int sentenceNumber,
1056 int quoteNumber ,
1057 int mode) {
1058
1059 Assert.assertTrue(sentenceNumber >=0);
1061 Assert.assertTrue(quoteNumber >=0);
1062 Assert.assertTrue(mode == Quote.ANTEC_AFTER ||
1063 mode == Quote.ANTEC_BEFORE ||
1064 mode == Quote.ANTEC_BACK);
1065
1066 Sentence sentence = textSentences[sentenceNumber];
1068
1069 AnnotationSet antecedents = new AnnotationSetImpl(sentence.getPersons());
1071
1072 AnnotationSet annotations = null;
1074
1075 switch(mode) {
1076
1077 case ANTEC_BEFORE:
1078 annotations = defaultAnnotations.getContained(sentence.getStartOffset(),
1079 this.getStartOffset());
1080 break;
1081
1082 case ANTEC_AFTER:
1083 annotations = defaultAnnotations.getContained(this.getEndOffset(),
1084 sentence.getEndOffset());
1085 break;
1086
1087 case ANTEC_BACK:
1088 annotations = defaultAnnotations.getContained(sentence.getStartOffset(),
1089 sentence.getEndOffset());
1090 break;
1091 }
1092
1093 if (null != annotations) {
1096 AnnotationSet pronouns = annotations.get(TOKEN_ANNOTATION_TYPE,PRP_RESTRICTION);
1097
1098 if (null != pronouns) {
1099
1100 Iterator it = pronouns.iterator();
1101 while (it.hasNext()) {
1102 Annotation currPronoun = (Annotation)it.next();
1103 String pronounString = (String)currPronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1105
1106 if (null != pronounString &&
1107 (pronounString.equalsIgnoreCase("he") || pronounString.equalsIgnoreCase("she"))
1108 )
1109 antecedents.add(currPronoun);
1110 } } }
1114
1115 Iterator itPersons = antecedents.iterator();
1131
1132 while (itPersons.hasNext()) {
1133 Annotation currPerson = (Annotation)itPersons.next();
1134
1135 if (Quote.ANTEC_BEFORE == mode &&
1137 currPerson.getStartNode().getOffset().intValue() > getStartOffset().intValue()) {
1138 itPersons.remove();
1140 }
1141 else if (Quote.ANTEC_AFTER == mode &&
1142 currPerson.getStartNode().getOffset().intValue() < getEndOffset().intValue()) {
1143 itPersons.remove();
1145 }
1146 else if (Quote.ANTEC_BACK == mode) {
1147
1153 if (quoteNumber >0) {
1156 Quote prevQuote = PronominalCoref.this.quotedText[quoteNumber-1];
1157
1158 if (currPerson.getStartNode().getOffset().longValue() < prevQuote.getEndOffset().longValue()) {
1160 itPersons.remove();
1161 }
1162 }
1163 }
1164 }
1165
1166 return antecedents;
1167 }
1168
1169
1170 public Long getStartOffset() {
1171 return this.quoteAnnotation.getStartNode().getOffset();
1172 }
1173
1174
1175 public Long getEndOffset() {
1176 return this.quoteAnnotation.getEndNode().getOffset();
1177 }
1178
1179
1180 public AnnotationSet getAntecedentCandidates(int type) {
1181
1182 switch(type) {
1183
1184 case ANTEC_AFTER:
1185 return this.antecedentsAfter;
1186
1187 case ANTEC_BEFORE:
1188 return this.antecedentsBefore;
1189
1190 case ANTEC_BACK:
1191 return this.antecedentsBackInContext;
1192
1193 default:
1194 throw new IllegalArgumentException();
1195 }
1196 }
1197
1198 }
1199
1200
1201
1202 private class Sentence {
1203
1204
1205 private int sentNumber;
1206
1207 private int paraNumber;
1208
1209 private Long startOffset;
1210
1211 private Long endOffset;
1212
1213 private AnnotationSet persons;
1214
1215 private AnnotationSet organizations;
1216
1217 private AnnotationSet locations;
1218
1219
1220 public Sentence(int sentNumber,
1221 int paraNumber,
1222 Long startOffset,
1223 Long endOffset,
1224 AnnotationSet persons,
1225 AnnotationSet organizations,
1226 AnnotationSet locations) {
1227
1228 this.sentNumber = sentNumber;
1229 this.paraNumber = paraNumber;
1230 this.startOffset = startOffset;
1231 this.endOffset = endOffset;
1232 this.persons = persons;
1233 this.organizations = organizations;
1234 this.locations = locations;
1235 }
1236
1237
1238 public Long getStartOffset() {
1239 return this.startOffset;
1240 }
1241
1242
1243 public Long getEndOffset() {
1244 return this.endOffset;
1245 }
1246
1247
1248 public AnnotationSet getPersons() {
1249 return this.persons;
1250 }
1251
1252
1253 public AnnotationSet getOrganizations() {
1254 return this.organizations;
1255 }
1256
1257
1258 public AnnotationSet getLocations() {
1259 return this.locations;
1260 }
1261 }
1262
1263}
1264