1   /*
2    *  PronominalCoref.java
3    *
4    *  Copyright (c) 1998-2004, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Marin Dimitrov, 30/Dec/2001
12   *
13   *  $Id: PronominalCoref.java,v 1.32 2004/08/04 14:44:26 niraj Exp $
14   */
15  
16  package gate.creole.coref;
17  
18  import java.net.MalformedURLException;
19  import java.net.URL;
20  import java.util.*;
21  
22  import junit.framework.Assert;
23  
24  import gate.*;
25  import gate.annotation.AnnotationSetImpl;
26  import gate.creole.*;
27  import gate.util.*;
28  import gate.util.Err;
29  import gate.util.SimpleFeatureMapImpl;
30  
31  public class PronominalCoref extends AbstractLanguageAnalyser
32                                implements ProcessingResource, ANNIEConstants{
33  
  /** Name of the "document" parameter of this resource. */
  public static final String COREF_DOCUMENT_PARAMETER_NAME = "document";

  /** Name of the "annotationSetName" parameter of this resource. */
  public static final String COREF_ANN_SET_PARAMETER_NAME = "annotationSetName";

  /** When true, pronouns that no resolver handles are reported on the error stream. */
  private static final boolean DEBUG = false;

  // JAPE grammars, located as class-path resources below Files.getResourcePath().
  // NOTE(review): getResource() presumably returns null for a missing resource,
  // which would make class initialisation fail here - confirm the grammars ship
  // with the distribution.
  /** URL (in string form) of the grammar loaded into the quoted-text transducer. */
  private static final String QT_GRAMMAR_URL = Gate.class.
      getResource(Files.getResourcePath() +
              "/creole/coref/quoted_text.jape").toString();
  /** URL (in string form) of the grammar loaded into the pleonastic-"it" transducer. */
  private static final String PLEON_GRAMMAR_URL = Gate.class.getResource(
          Files.getResourcePath() +
          "/creole/coref/pleonasm.jape").toString();

  // annotation types read back from the annotation set after the transducers ran
  /** Annotation type looked up to build the quoted text fragments. */
  private static final String QUOTED_TEXT_TYPE = "QuotedText";
  /** Annotation type looked up to build the pleonastic "it" fragments. */
  private static final String PLEONASTIC_TYPE = "PleonasticIt";

  // values of the token category feature that identify pronouns
  /** Category value used to select personal pronouns. */
  private static final String PRP_CATEGORY = "PRP";
  /** Category value used to select possessive pronouns. */
  private static final String PRP$_CATEGORY = "PRP$";

  // scope
  /** Number of sentences before the pronoun's own sentence that the he/she resolvers always inspect. */
  private static final int SENTENCES_IN_SCOPE = 3;
  /** Shared comparator used for every sort and binary search in this class; created in the static initialiser. */
  private static AnnotationOffsetComparator ANNOTATION_OFFSET_COMPARATOR;
  /** Name of the annotation set to process; null or "" selects the document's default set. */
  private String annotationSetName;
  /** Transducer that runs the quoted-text grammar. */
  private Transducer qtTransducer;
  /** Transducer that runs the pleonastic-"it" grammar. */
  private Transducer pleonTransducer;
  /** Annotation set selected by preprocess() for the current run. */
  private AnnotationSet defaultAnnotations;
  /** One entry per sentence annotation of the current document, sorted with the offset comparator. */
  private Sentence[] textSentences;
  /** One entry per quoted-text annotation of the current document, sorted with the offset comparator. */
  private Quote[] quotedText;
  /** The pleonastic "it" annotations of the current document, sorted with the offset comparator. */
  private Annotation[] pleonasticIt;
  /** Maps each Person annotation to its gender string (the value may be null); cleared on every run. */
  private HashMap personGender;
  /** Maps each processed pronoun annotation to its antecedent (null when none was found); cleared on every run. */
  private HashMap anaphor2antecedent;
  /**
   * Feature map restricting the token category to PRP, filled in the static
   * initialiser. It is not referenced in the visible part of this file.
   */
  private static final FeatureMap PRP_RESTRICTION;

  /** Whether it/its/itself are resolved at all; when false the resolver for them returns null. */
  private boolean resolveIt = true;

  /** Creates the shared comparator and fills the PRP restriction map. */
  static {
    ANNOTATION_OFFSET_COMPARATOR = new AnnotationOffsetComparator();
    PRP_RESTRICTION = new SimpleFeatureMapImpl();
    PRP_RESTRICTION.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY);
  }
90  
91    /** --- */
92    public PronominalCoref() {
93  
94      this.personGender = new HashMap();
95      this.anaphor2antecedent = new HashMap();
96      this.qtTransducer = new gate.creole.Transducer();
97      this.pleonTransducer = new gate.creole.Transducer();
98    }
99  
100   /** Initialise this resource, and return it. */
101   public Resource init() throws ResourceInstantiationException {
102 
103     //0. preconditions
104     Assert.assertNotNull(this.qtTransducer);
105 
106     //1. initialise quoted text transducer
107     URL qtGrammarURL = null;
108     try {
109       qtGrammarURL = new URL(QT_GRAMMAR_URL);
110     }
111     catch(MalformedURLException mue) {
112       throw new ResourceInstantiationException(mue);
113     }
114     this.qtTransducer.setGrammarURL(qtGrammarURL);
115     this.qtTransducer.setEncoding("UTF-8");
116     this.qtTransducer.init();
117 
118     //2. initialise pleonastic transducer
119     URL pleonGrammarURL = null;
120     try {
121       pleonGrammarURL = new URL(PLEON_GRAMMAR_URL);
122     }
123     catch(MalformedURLException mue) {
124       throw new ResourceInstantiationException(mue);
125     }
126     this.pleonTransducer.setGrammarURL(pleonGrammarURL);
127     this.pleonTransducer.setEncoding("UTF-8");
128     this.pleonTransducer.init();
129 
130 
131     //3. delegate
132     return super.init();
133   } // init()
134 
135   /**
136    * Reinitialises the processing resource. After calling this method the
137    * resource should be in the state it is after calling init.
138    * If the resource depends on external resources (such as rules files) then
139    * the resource will re-read those resources. If the data used to create
140    * the resource has changed since the resource has been created then the
141    * resource will change too after calling reInit().
142   */
143   public void reInit() throws ResourceInstantiationException {
144 
145     if (null != this.qtTransducer) {
146       this.qtTransducer.reInit();
147     }
148 
149     if (null != this.pleonTransducer) {
150       this.pleonTransducer.reInit();
151     }
152 
153     init();
154   } // reInit()
155 
156 
157   /** Set the document to run on. */
158   public void setDocument(Document newDocument) {
159 
160     //0. precondition
161 //    Assert.assertNotNull(newDocument);
162 
163     //1. set doc for aggregated components
164     this.qtTransducer.setDocument(newDocument);
165     this.pleonTransducer.setDocument(newDocument);
166 
167     //3. delegate
168     super.setDocument(newDocument);
169   }
170 
  /**
   * Sets the name of the annotation set to process.
   *
   * @param annotationSetName the set name; null or "" makes preprocess()
   *        use the document's default annotation set
   */
  public void setAnnotationSetName(String annotationSetName) {
    this.annotationSetName = annotationSetName;
  }
175 
176 
177   /** --- */
178   public String getAnnotationSetName() {
179     return annotationSetName;
180   }
181 
182   /** --- */
183   public void setResolveIt(Boolean newValue) {
184     this.resolveIt = newValue.booleanValue();
185   }
186 
187   /** --- */
188   public Boolean getResolveIt() {
189     return new Boolean(this.resolveIt);
190   }
191 
192 
193   /**
194    * This method runs the coreferencer. It assumes that all the needed parameters
195    * are set. If they are not, an exception will be fired.
196    */
197   public void execute() throws ExecutionException{
198 
199     //0. preconditions
200     if(null == this.document) {
201       throw new ExecutionException("[coreference] Document is not set!");
202     }
203 
204     //1. preprocess
205     preprocess();
206 /*
207     //2. remove corefs from previous run
208     String annSetName = this.annotationSetName == null ? "COREF"
209                                                        : this.annotationSetName;
210 
211     AnnotationSet corefSet = this.document.getAnnotations(annSetName);
212     if (false == corefSet.isEmpty()) {
213       corefSet.clear();
214     }
215 */
216     //3.get personal pronouns
217     FeatureMap constraintPRP = new SimpleFeatureMapImpl();
218     constraintPRP.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY);
219     AnnotationSet personalPronouns = this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,constraintPRP);
220 
221     //4.get possesive pronouns
222     FeatureMap constraintPRP$ = new SimpleFeatureMapImpl();
223     constraintPRP$.put(TOKEN_CATEGORY_FEATURE_NAME,PRP$_CATEGORY);
224     AnnotationSet possesivePronouns = this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,constraintPRP$);
225 
226     //5.combine them
227     AnnotationSet pronouns = personalPronouns;
228     if (null == personalPronouns) {
229       pronouns = possesivePronouns;
230     }
231     else if (null != possesivePronouns) {
232       pronouns.addAll(possesivePronouns);
233     }
234 
235     //6.do we have pronouns at all?
236     if (null == pronouns) {
237       //do nothing
238       return;
239     }
240 
241     //7.sort them according to offset
242     Object[] arrPronouns = pronouns.toArray();
243     java.util.Arrays.sort(arrPronouns,ANNOTATION_OFFSET_COMPARATOR);
244 
245     //8.cleanup - ease the GC
246     pronouns = personalPronouns = possesivePronouns = null;
247 
248     int prnSentIndex = 0;
249 
250 
251     //10. process all pronouns
252     for (int i=0; i< arrPronouns.length; i++) {
253       Annotation currPronoun = (Annotation)arrPronouns[i];
254       while (this.textSentences[prnSentIndex].getEndOffset().longValue() <
255                                       currPronoun.getEndNode().getOffset().longValue()) {
256         prnSentIndex++;
257       }
258 
259       Sentence currSentence = this.textSentences[prnSentIndex];
260       Assert.assertTrue(currSentence.getStartOffset().longValue() <= currPronoun.getStartNode().getOffset().longValue());
261       Assert.assertTrue(currSentence.getEndOffset().longValue() >= currPronoun.getEndNode().getOffset().longValue());
262 
263       //11. find antecedent (if any) for pronoun
264       Annotation antc = findAntecedent(currPronoun,prnSentIndex);
265 
266       //12. add to the ana2ant hashtable
267       this.anaphor2antecedent.put(currPronoun,antc);
268     }
269 
270     //done
271   }
272 
273 
274   /** --- */
275   public HashMap getResolvedAnaphora() {
276     return this.anaphor2antecedent;
277   }
278 
279   /** --- */
280   private Annotation findAntecedent(Annotation currPronoun,int prnSentIndex) {
281 
282     //0. preconditions
283     Assert.assertNotNull(currPronoun);
284     Assert.assertTrue(prnSentIndex >= 0);
285     Assert.assertTrue(currPronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
286     Assert.assertTrue(currPronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
287                       currPronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
288 
289     //1.
290     String strPronoun = (String)currPronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
291 
292     Assert.assertNotNull(strPronoun);
293 
294     //2. delegate processing to the appropriate methods
295     if (strPronoun.equalsIgnoreCase("HE") ||
296         strPronoun.equalsIgnoreCase("HIM") ||
297         strPronoun.equalsIgnoreCase("HIS") ||
298         strPronoun.equalsIgnoreCase("HIMSELF")) {
299       return _resolve$HE$HIM$HIS$HIMSELF$(currPronoun,prnSentIndex);
300     }
301     else if (strPronoun.equalsIgnoreCase("SHE") ||
302               strPronoun.equalsIgnoreCase("HER")) {
303       return _resolve$SHE$HER$(currPronoun,prnSentIndex);
304     }
305     else if (strPronoun.equalsIgnoreCase("IT") ||
306               strPronoun.equalsIgnoreCase("ITS") ||
307               strPronoun.equalsIgnoreCase("ITSELF")) {
308       return _resolve$IT$ITS$ITSELF$(currPronoun,prnSentIndex);
309     }
310     else if (strPronoun.equalsIgnoreCase("I") ||
311               strPronoun.equalsIgnoreCase("ME") ||
312               strPronoun.equalsIgnoreCase("MY") ||
313               strPronoun.equalsIgnoreCase("MYSELF")) {
314       return _resolve$I$ME$MY$MYSELF$(currPronoun,prnSentIndex);
315     }
316     else {
317       if (DEBUG) {
318         gate.util.Err.println("["+strPronoun+"] is not handled yet...");
319       }
320       return null;
321     }
322   }
323 
324 
325   boolean isPleonastic(Annotation pronoun) {
326 
327     //0. preconditions
328     Assert.assertNotNull(pronoun);
329     String str = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
330     Assert.assertTrue(str.equalsIgnoreCase("IT"));
331 
332     //1. do we have pleonasms in this text?
333     if (this.pleonasticIt.length == 0) {
334       return false;
335     }
336 
337     //2. find closest pleonasm index
338     int closestPleonasmIndex = java.util.Arrays.binarySearch(this.pleonasticIt,
339                                                              pronoun,
340                                                              ANNOTATION_OFFSET_COMPARATOR);
341     //normalize index
342     if (closestPleonasmIndex < 0) {
343       closestPleonasmIndex = -closestPleonasmIndex -1 -1;
344     }
345 
346     //still not good?
347     if (closestPleonasmIndex < 0) {
348       closestPleonasmIndex = 0;
349     }
350 
351     //get closest pleonasm
352     Annotation pleonasm = this.pleonasticIt[closestPleonasmIndex];
353 
354 //System.out.println(pleonasm);
355 //System.out.println(pronoun);
356 
357     //3. return true only if the proboun is contained in pleonastic fragment
358     boolean result =  (pleonasm.getStartNode().getOffset().intValue() <= pronoun.getStartNode().getOffset().intValue()
359             &&
360             pleonasm.getEndNode().getOffset().intValue() >= pronoun.getEndNode().getOffset().intValue());
361 //System.out.println("is pleon=["+result+"]");
362     return result;
363   }
364 
365 
366   /** --- */
367   private Annotation _resolve$HE$HIM$HIS$HIMSELF$(Annotation pronoun, int sentenceIndex) {
368 
369     //0. preconditions
370     Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
371     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
372                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
373     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
374     Assert.assertTrue(pronounString.equalsIgnoreCase("HE") ||
375                       pronounString.equalsIgnoreCase("HIM") ||
376                       pronounString.equalsIgnoreCase("HIS") ||
377                       pronounString.equalsIgnoreCase("HIMSELF"));
378 
379     //1.
380     boolean antecedentFound = false;
381     int scopeFirstIndex = sentenceIndex - SENTENCES_IN_SCOPE;
382     if (scopeFirstIndex < 0 ) scopeFirstIndex = 0;
383 
384     int currSentenceIndex = sentenceIndex;
385     Annotation bestAntecedent = null;
386 
387     while (currSentenceIndex >= scopeFirstIndex || antecedentFound == false) {
388       Sentence currSentence = this.textSentences[currSentenceIndex];
389       AnnotationSet persons = currSentence.getPersons();
390 
391       Iterator it = persons.iterator();
392       while (it.hasNext()) {
393         Annotation currPerson = (Annotation)it.next();
394         String gender = (String)this.personGender.get(currPerson);
395 
396         if (null == gender ||
397             gender.equalsIgnoreCase("MALE") ||
398             gender.equalsIgnoreCase("UNKNOWN")) {
399           //hit
400           antecedentFound = true;
401 
402           if (null == bestAntecedent) {
403             bestAntecedent = currPerson;
404           }
405           else {
406             bestAntecedent = _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(bestAntecedent,currPerson,pronoun);
407           }
408         }
409       }
410 
411       if (0 == currSentenceIndex--)
412         break;
413 
414     }
415 
416     return bestAntecedent;
417   }
418 
419 
420   /** --- */
421   private Annotation _resolve$SHE$HER$(Annotation pronoun, int sentenceIndex) {
422 
423     //0. preconditions
424     Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
425     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
426                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
427     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
428     Assert.assertTrue(pronounString.equalsIgnoreCase("SHE") ||
429                       pronounString.equalsIgnoreCase("HER"));
430 
431     //1.
432     boolean antecedentFound = false;
433     int scopeFirstIndex = sentenceIndex - SENTENCES_IN_SCOPE;
434     if (scopeFirstIndex < 0 ) scopeFirstIndex = 0;
435     int currSentenceIndex = sentenceIndex;
436     Annotation bestAntecedent = null;
437 
438     while (currSentenceIndex >= scopeFirstIndex || antecedentFound == false) {
439       Sentence currSentence = this.textSentences[currSentenceIndex];
440       AnnotationSet persons = currSentence.getPersons();
441 
442       Iterator it = persons.iterator();
443       while (it.hasNext()) {
444         Annotation currPerson = (Annotation)it.next();
445         String gender = (String)this.personGender.get(currPerson);
446 
447         if (null == gender ||
448             gender.equalsIgnoreCase("FEMALE") ||
449             gender.equalsIgnoreCase("UNKNOWN")) {
450           //hit
451           antecedentFound = true;
452 
453           if (null == bestAntecedent) {
454             bestAntecedent = currPerson;
455           }
456           else {
457             bestAntecedent = _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(bestAntecedent,currPerson,pronoun);
458           }
459         }
460       }
461 
462       if (0 == currSentenceIndex--)
463         break;
464     }
465 
466     return bestAntecedent;
467   }
468 
469 
470   /** --- */
471   private Annotation _resolve$IT$ITS$ITSELF$(Annotation pronoun, int sentenceIndex) {
472     //do not resolve it pronouns if disabled by the user
473     if (! resolveIt)
474       return null;
475 
476     //0. preconditions
477     Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
478     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
479                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
480     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
481     Assert.assertTrue(pronounString.equalsIgnoreCase("IT") ||
482                       pronounString.equalsIgnoreCase("ITS") ||
483                       pronounString.equalsIgnoreCase("ITSELF"));
484 
485     //0.5 check if the IT is pleonastic
486     if (pronounString.equalsIgnoreCase("IT") &&
487         isPleonastic(pronoun)) {
488 //System.out.println("PLEONASM...");
489       return null;
490     }
491 
492     //1.
493     int scopeFirstIndex = sentenceIndex - 1;
494     if (scopeFirstIndex < 0 ) scopeFirstIndex = 0;
495 
496     int currSentenceIndex = sentenceIndex;
497     Annotation bestAntecedent = null;
498 
499     while (currSentenceIndex >= scopeFirstIndex) {
500 
501       Sentence currSentence = this.textSentences[currSentenceIndex];
502       AnnotationSet org = currSentence.getOrganizations();
503       AnnotationSet loc = currSentence.getLocations();
504       //combine them
505       AnnotationSet org_loc = org;
506       org_loc.addAll(loc);
507 
508       Iterator it = org_loc.iterator();
509       while (it.hasNext()) {
510         Annotation currOrgLoc = (Annotation)it.next();
511 
512         if (null == bestAntecedent) {
513           //discard cataphoric references
514           if (currOrgLoc.getStartNode().getOffset().longValue() <
515                                           pronoun.getStartNode().getOffset().longValue()) {
516             bestAntecedent = currOrgLoc;
517           }
518         }
519         else {
520           bestAntecedent = this._chooseAntecedent$IT$ITS$ITSELF$(bestAntecedent,currOrgLoc,pronoun);
521         }
522       }
523 
524       if (0 == currSentenceIndex--)
525         break;
526     }
527 
528     return bestAntecedent;
529   }
530 
531 
532   /** --- */
533   private Annotation _resolve$I$ME$MY$MYSELF$(Annotation pronoun, int sentenceIndex) {
534 
535     //0. preconditions
536     Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
537     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
538                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
539     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
540     Assert.assertTrue(pronounString.equalsIgnoreCase("I") ||
541                       pronounString.equalsIgnoreCase("MY") ||
542                       pronounString.equalsIgnoreCase("ME") ||
543                       pronounString.equalsIgnoreCase("MYSELF"));
544 
545     //0.5 sanity check
546     //if there are not quotes at all in the text then exit
547     if (0 == this.quotedText.length) {
548 //System.out.println("TEXT WITH NO QUOTES ENCOUNTERED...");
549       return null;
550     }
551 
552 
553     //1.
554     Annotation bestAntecedent = null;
555 
556     int closestQuoteIndex = java.util.Arrays.binarySearch(this.quotedText,pronoun,ANNOTATION_OFFSET_COMPARATOR);
557     //normalize index
558     if (closestQuoteIndex < 0) {
559       closestQuoteIndex = -closestQuoteIndex -1 -1;
560     }
561 
562     //still not good?
563     if (closestQuoteIndex < 0) {
564       closestQuoteIndex = 0;
565     }
566 
567     //get closest Quote
568     Quote quoteContext = this.quotedText[closestQuoteIndex];
569 
570     //assure that the pronoun is contained in the quoted text fragment
571     //otherwise exit
572 
573     if (pronoun.getStartNode().getOffset().intValue() > quoteContext.getEndOffset().intValue() ||
574         pronoun.getEndNode().getOffset().intValue() < quoteContext.getStartOffset().intValue()) {
575       //oops, probably incorrect text - I/My/Me is not part of quoted text fragment
576       //exit
577 //System.out.println("Oops! ["+pronounString+"] not part of quoted fragment...");
578       return null;
579     }
580 
581     //get the Persons that precede/succeed the quoted fragment
582     //the order is:
583     //
584     //[1]. if there exists a Person or pronoun in {he, she} following the quoted fragment but
585     //in the same sentence, then use it
586     //i.e.  ["PRN1(x)...", said X ...A, B, C ....]
587     //
588     //[2]. if there is a Person (NOT a pronoun) in the same sentence,
589     // preceding the quote, then use it
590     //i.e. . [A, B, C...X ..."PRN1(x) ..."...]
591     //
592 
593     //try [1]
594     //get the succeeding Persons/pronouns
595     AnnotationSet succCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_AFTER);
596     if (false == succCandidates.isEmpty()) {
597       //cool, we have candidates, pick up the one closest to the end quote
598       Iterator it = succCandidates.iterator();
599 
600       while (it.hasNext()) {
601         Annotation currCandidate = (Annotation)it.next();
602         if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) > 0) {
603           //wow, we have a candidate that is closer to the quote
604           bestAntecedent = currCandidate;
605         }
606       }
607     }
608 
609     //try [2]
610     //get the preceding Persons/pronouns
611     if (null == bestAntecedent) {
612       AnnotationSet precCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_BEFORE);
613       if (false == precCandidates.isEmpty()) {
614         //cool, we have candidates, pick up the one closest to the end quote
615         Iterator it = precCandidates.iterator();
616 
617         while (it.hasNext()) {
618           Annotation currCandidate = (Annotation)it.next();
619           if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) < 0) {
620             //wow, we have a candidate that is closer to the quote
621             bestAntecedent = currCandidate;
622           }
623         }
624       }
625     }
626 
627     //try [3]
628     //get the Persons/pronouns back in context
629     if (null == bestAntecedent) {
630       AnnotationSet precCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_BACK);
631       if (false == precCandidates.isEmpty()) {
632         //cool, we have candidates, pick up the one closest to the end quote
633         Iterator it = precCandidates.iterator();
634 
635         while (it.hasNext()) {
636           Annotation currCandidate = (Annotation)it.next();
637           if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) > 0) {
638             //wow, we have a candidate that is closer to the quote
639             bestAntecedent = currCandidate;
640           }
641         }
642       }
643     }
644 
645     return bestAntecedent;
646   }
647 
648 
649   /** --- */
650   private void preprocess() throws ExecutionException {
651 
652     //0.5 cleanup
653     this.personGender.clear();
654     this.anaphor2antecedent.clear();
655 
656     //1.get all annotation in the input set
657     if ( this.annotationSetName == null || this.annotationSetName.equals("")) {
658       this.defaultAnnotations = this.document.getAnnotations();
659     }
660     else {
661       this.defaultAnnotations = this.document.getAnnotations(annotationSetName);
662     }
663 
664     //if none found, print warning and exit
665     if (this.defaultAnnotations == null || this.defaultAnnotations.isEmpty()) {
666       Err.prln("Coref Warning: No annotations found for processing!");
667       return;
668     }
669 
670 
671 
672     //2.1 remove QT annotations if left from previous execution
673     AnnotationSet qtSet = this.defaultAnnotations.get(QUOTED_TEXT_TYPE);
674     if (null != qtSet) {
675       qtSet.clear();
676     }
677 
678     //2.2. run quoted text transducer to generate "Quoted Text" annotations
679     this.qtTransducer.execute();
680 
681     //3.1 remove pleonastic annotations if left from previous execution
682     AnnotationSet pleonSet = this.defaultAnnotations.get(PLEONASTIC_TYPE);
683     if (null != pleonSet) {
684       pleonSet.clear();
685     }
686 
687     //3.2 run quoted text transducer to generate "Pleonasm" annotations
688     this.pleonTransducer.execute();
689 
690     //4.get all SENTENCE annotations
691     AnnotationSet sentenceAnnotations = this.defaultAnnotations.get(SENTENCE_ANNOTATION_TYPE);
692 
693     this.textSentences = new Sentence[sentenceAnnotations.size()];
694     Object[]  sentenceArray = sentenceAnnotations.toArray();
695 
696     java.util.Arrays.sort(sentenceArray,ANNOTATION_OFFSET_COMPARATOR);
697 
698     for (int i=0; i< sentenceArray.length; i++) {
699 
700       Annotation currSentence = (Annotation)sentenceArray[i];
701       Long sentStartOffset = currSentence.getStartNode().getOffset();
702       Long sentEndOffset = currSentence.getEndNode().getOffset();
703 
704       //4.1. get PERSOSNS in this sentence
705       AnnotationSet sentPersons = this.defaultAnnotations.get(PERSON_ANNOTATION_TYPE,
706                                                               sentStartOffset,
707                                                               sentEndOffset);
708 
709       //4.2. get ORGANIZATIONS in this sentence
710       AnnotationSet sentOrgs = this.defaultAnnotations.get(ORGANIZATION_ANNOTATION_TYPE,
711                                                               sentStartOffset,
712                                                               sentEndOffset);
713 
714       //4.3. get LOCATION in this sentence
715       AnnotationSet sentLocs = this.defaultAnnotations.get(LOCATION_ANNOTATION_TYPE,
716                                                               sentStartOffset,
717                                                               sentEndOffset);
718 
719       //4.5. create a Sentence for thei SENTENCE annotation
720       this.textSentences[i] = new Sentence(i,
721                                             0,
722                                             sentStartOffset,
723                                             sentEndOffset,
724                                             sentPersons,
725                                             sentOrgs,
726                                             sentLocs
727                                   );
728 
729       //4.6. for all PERSONs in the sentence - find their gender using the
730       //orthographic coreferences if the gender of some entity is unknown
731       Iterator itPersons = sentPersons.iterator();
732       while (itPersons.hasNext()) {
733         Annotation currPerson = (Annotation)itPersons.next();
734         String gender = this.findPersonGender(currPerson);
735         this.personGender.put(currPerson,gender);
736       }
737     }
738 
739     //5. initialise the quoted text fragments
740     AnnotationSet sentQuotes = this.defaultAnnotations.get(QUOTED_TEXT_TYPE);
741 
742     //if none then return
743     if (null == sentQuotes) {
744       this.quotedText = new Quote[0];
745     }
746     else {
747       this.quotedText = new Quote[sentQuotes.size()];
748 
749       Object[] quotesArray = sentQuotes.toArray();
750       java.util.Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR);
751 
752       for (int i =0; i < quotesArray.length; i++) {
753         this.quotedText[i] = new Quote((Annotation)quotesArray[i],i);
754       }
755     }
756 
757     //6. initialuse the plonastic It annotations
758     AnnotationSet plaonasticSet = this.defaultAnnotations.get(PLEONASTIC_TYPE);
759 
760     if (null == plaonasticSet) {
761       this.pleonasticIt = new Annotation[0];
762     }
763     else {
764       this.pleonasticIt = new Annotation[plaonasticSet.size()];
765 
766       Object[] quotesArray = plaonasticSet.toArray();
767       java.util.Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR);
768 
769       for (int i=0; i< this.pleonasticIt.length; i++) {
770         this.pleonasticIt[i] = (Annotation)quotesArray[i];
771       }
772     }
773 
774   }
775 
776 
777   /** --- */
778   private String findPersonGender(Annotation person) {
779 
780     String result = (String)person.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
781 
782     if (null==result) {
783       //gender is unknown - try to find it from the ortho coreferences
784       List orthoMatches  = (List)person.getFeatures().get(ANNOTATION_COREF_FEATURE_NAME);
785 
786       if (null != orthoMatches) {
787         Iterator itMatches = orthoMatches.iterator();
788 
789         while (itMatches.hasNext()) {
790           Integer correferringID = (Integer)itMatches.next();
791           Annotation coreferringEntity = this.defaultAnnotations.get(correferringID);
792           Assert.assertTrue(coreferringEntity.getType().equalsIgnoreCase(PERSON_ANNOTATION_TYPE));
793           String correferringGender = (String)coreferringEntity.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
794 
795           if (null != correferringGender) {
796             result = correferringGender;
797             break;
798           }
799         }
800       }
801     }
802 
803     return result;
804   }
805 
806 
807   /** --- */
808   private static class AnnotationOffsetComparator implements Comparator {
809 
810     private int _getOffset(Object o) {
811 
812       if (o instanceof Annotation) {
813         return ((Annotation)o).getEndNode().getOffset().intValue();
814       }
815       else if (o instanceof Sentence) {
816         return ((Sentence)o).getStartOffset().intValue();
817       }
818       else if (o instanceof Quote) {
819         return ((Quote)o).getStartOffset().intValue();
820       }
821       else if (o instanceof Node) {
822         return ((Node)o).getOffset().intValue();
823       }
824       else {
825         throw new IllegalArgumentException();
826       }
827     }
828 
829     public int compare(Object o1,Object o2) {
830 
831       //0. preconditions
832       Assert.assertNotNull(o1);
833       Assert.assertNotNull(o2);
834       Assert.assertTrue(o1 instanceof Annotation ||
835                         o1 instanceof Sentence ||
836                         o1 instanceof Quote ||
837                         o1 instanceof Node);
838       Assert.assertTrue(o2 instanceof Annotation ||
839                         o2 instanceof Sentence ||
840                         o2 instanceof Quote ||
841                         o2 instanceof Node);
842 
843       int offset1 = _getOffset(o1);
844       int offset2 = _getOffset(o2);
845 
846       return offset1 - offset2;
847     }
848   }
849 
850 
851   /** --- */
852   private Annotation _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(Annotation ant1, Annotation ant2, Annotation pronoun) {
853 
854     //0. preconditions
855     Assert.assertNotNull(ant1);
856     Assert.assertNotNull(ant2);
857     Assert.assertNotNull(pronoun);
858     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
859                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
860     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
861     Assert.assertTrue(pronounString.equalsIgnoreCase("SHE") ||
862                       pronounString.equalsIgnoreCase("HER") ||
863                       pronounString.equalsIgnoreCase("HE") ||
864                       pronounString.equalsIgnoreCase("HIM") ||
865                       pronounString.equalsIgnoreCase("HIS") ||
866                       pronounString.equalsIgnoreCase("HIMSELF"));
867 
868     Long offset1 = ant1.getStartNode().getOffset();
869     Long offset2 = ant2.getStartNode().getOffset();
870     Long offsetPrn = pronoun.getStartNode().getOffset();
871 
872     long diff1 = offsetPrn.longValue() - offset1.longValue();
873     long diff2 = offsetPrn.longValue() - offset2.longValue();
874 //    Assert.assertTrue(diff1 != 0 && diff2 != 0);
875     //reject candidates that overlap with the pronoun
876     if (diff1 == 0) {
877       return ant2;
878     }
879     else if (diff2 == 0) {
880       return ant1;
881     }
882 
883     //get the one CLOSEST AND PRECEDING the pronoun
884     if (diff1 > 0 && diff2 > 0) {
885       //we have [...antecedentA...AntecedentB....pronoun...] ==> choose B
886       if (diff1 < diff2)
887         return ant1;
888       else
889         return ant2;
890     }
891     else if (diff1 < 0 && diff2 < 0) {
892       //we have [...pronoun ...antecedentA...AntecedentB.......] ==> choose A
893       if (Math.abs(diff1) < Math.abs(diff2))
894         return ant1;
895       else
896           return ant2;
897     }
898     else {
899       Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2));
900       //we have [antecedentA...pronoun...AntecedentB] ==> choose A
901       if (diff1 > 0)
902         return ant1;
903       else
904         return ant2;
905     }
906   }
907 
908   /** --- */
909   private Annotation _chooseAntecedent$IT$ITS$ITSELF$(Annotation ant1, Annotation ant2, Annotation pronoun) {
910 
911     //0. preconditions
912     Assert.assertNotNull(ant1);
913     Assert.assertNotNull(ant2);
914     Assert.assertNotNull(pronoun);
915     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
916                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
917     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
918 
919     Assert.assertTrue(pronounString.equalsIgnoreCase("IT") ||
920                       pronounString.equalsIgnoreCase("ITS") ||
921                       pronounString.equalsIgnoreCase("ITSELF"));
922 
923     Long offset1 = ant1.getStartNode().getOffset();
924     Long offset2 = ant2.getStartNode().getOffset();
925     Long offsetPrn = pronoun.getStartNode().getOffset();
926     long diff1 = offsetPrn.longValue() - offset1.longValue();
927     long diff2 = offsetPrn.longValue() - offset2.longValue();
928 //    Assert.assertTrue(diff1 != 0 && diff2 != 0);
929     //reject candidates that overlap with the pronoun
930     if (diff1 == 0) {
931       return ant2;
932     }
933     else if (diff2 == 0) {
934       return ant1;
935     }
936 
937 
938     //get the one CLOSEST AND PRECEDING the pronoun
939     if (diff1 > 0 && diff2 > 0) {
940       //we have [...antecedentA...AntecedentB....pronoun...] ==> choose B
941       if (diff1 < diff2)
942         return ant1;
943       else
944         return ant2;
945     }
946     else if (diff1 > 0){
947       Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2));
948       //we have [antecedentA...pronoun...AntecedentB] ==> choose A
949       return ant1;
950     }
951     else if (diff2 > 0){
952       Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2));
953       //we have [antecedentA...pronoun...AntecedentB] ==> choose A
954       return ant2;
955     }
956     else {
957       //both possible antecedents are BEHIND the anaophoric pronoun - i.e. we have either
958       //cataphora, or nominal antecedent, or an antecedent that is further back in scope
959       //in any case - discard the antecedents
960       return null;
961     }
962   }
963 
964 
965   /** --- */
966   private class Quote {
967 
968     /** --- */
969     public static final int ANTEC_AFTER = 1;
970     /** --- */
971     public static final int ANTEC_BEFORE = 2;
972     /** --- */
973     public static final int ANTEC_BACK = 3;
974     /** --- */
975     private AnnotationSet antecedentsBefore;
976     /** --- */
977     private AnnotationSet antecedentsAfter;
978     /** --- */
979     private AnnotationSet antecedentsBackInContext;
980     /** --- */
981     private Annotation quoteAnnotation;
982     /** --- */
983     private int quoteIndex;
984 
985     /** --- */
986     public Quote(Annotation quoteAnnotation, int index) {
987 
988       this.quoteAnnotation = quoteAnnotation;
989       this.quoteIndex = index;
990       init();
991     }
992 
993     /** --- */
994     private void init() {
995 
996       //0.preconditions
997       Assert.assertNotNull(textSentences);
998 
999       //0.5 create a restriction for PRP pos tokens
1000      FeatureMap prpTokenRestriction = new SimpleFeatureMapImpl();
1001      prpTokenRestriction.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY);
1002
1003      //1. generate the precPersons set
1004
1005      //1.1 locate the sentece containing the opening quote marks
1006      int quoteStartPos = java.util.Arrays.binarySearch(textSentences,
1007                                                        this.quoteAnnotation.getStartNode(),
1008                                                        ANNOTATION_OFFSET_COMPARATOR);
1009
1010      //normalize index
1011      int startSentenceIndex = quoteStartPos >= 0 ? quoteStartPos
1012                                                  : -quoteStartPos -1 -1; // blame Sun, not me
1013      //still not good?
1014      if (startSentenceIndex < 0) {
1015        startSentenceIndex = 0;
1016      }
1017
1018      //1.2. get the persons and restrict to these that precede the quote (i.e. not contained
1019      //in the quote)
1020      this.antecedentsBefore = generateAntecedentCandidates(startSentenceIndex,
1021                                                            this.quoteIndex,
1022                                                            ANTEC_BEFORE);
1023
1024
1025      //2. generate the precPersonsInCOntext set
1026      //2.1. get the persons from the sentence precedeing the sentence containing the quote start
1027      if (startSentenceIndex > 0) {
1028        this.antecedentsBackInContext = generateAntecedentCandidates(startSentenceIndex -1,
1029                                                                    this.quoteIndex,
1030                                                                    ANTEC_BACK);
1031      }
1032
1033      //2. generate the succ  Persons set
1034      //2.1 locate the sentece containing the closing quote marks
1035      int quoteEndPos = java.util.Arrays.binarySearch(textSentences,
1036                                                        this.quoteAnnotation.getEndNode(),
1037                                                        ANNOTATION_OFFSET_COMPARATOR);
1038
1039      //normalize it
1040      int endSentenceIndex = quoteEndPos >= 0 ? quoteEndPos
1041                                              : -quoteEndPos -1 -1; // blame Sun, not me
1042      //still not good?
1043      if (endSentenceIndex < 0) {
1044        endSentenceIndex = 0;
1045      }
1046
1047      this.antecedentsAfter = generateAntecedentCandidates(endSentenceIndex,
1048                                                            this.quoteIndex,
1049                                                            ANTEC_AFTER);
1050      //generate t
1051    }
1052
1053
1054    /** --- */
1055    private AnnotationSet generateAntecedentCandidates(int sentenceNumber,
1056                                                        int quoteNumber ,
1057                                                        int mode) {
1058
1059      //0. preconditions
1060      Assert.assertTrue(sentenceNumber >=0);
1061      Assert.assertTrue(quoteNumber >=0);
1062      Assert.assertTrue(mode == Quote.ANTEC_AFTER ||
1063                        mode == Quote.ANTEC_BEFORE ||
1064                        mode == Quote.ANTEC_BACK);
1065
1066      //1. get sentence
1067     Sentence sentence = textSentences[sentenceNumber];
1068
1069      //2. get the persons
1070      AnnotationSet antecedents = new AnnotationSetImpl(sentence.getPersons());
1071
1072      //4. now get the he/she pronouns in the relevant context
1073      AnnotationSet annotations = null;
1074
1075      switch(mode) {
1076
1077        case ANTEC_BEFORE:
1078          annotations = defaultAnnotations.getContained(sentence.getStartOffset(),
1079                                                      this.getStartOffset());
1080          break;
1081
1082        case ANTEC_AFTER:
1083          annotations = defaultAnnotations.getContained(this.getEndOffset(),
1084                                                     sentence.getEndOffset());
1085          break;
1086
1087        case ANTEC_BACK:
1088          annotations = defaultAnnotations.getContained(sentence.getStartOffset(),
1089                                                     sentence.getEndOffset());
1090          break;
1091      }
1092
1093      //4. get the pronouns
1094      //restrict to he/she pronouns
1095      if (null != annotations) {
1096        AnnotationSet pronouns = annotations.get(TOKEN_ANNOTATION_TYPE,PRP_RESTRICTION);
1097
1098        if (null != pronouns) {
1099
1100          Iterator it = pronouns.iterator();
1101          while (it.hasNext()) {
1102            Annotation currPronoun = (Annotation)it.next();
1103            //add to succPersons only if HE/SHE
1104            String pronounString = (String)currPronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1105
1106            if (null != pronounString &&
1107                (pronounString.equalsIgnoreCase("he") || pronounString.equalsIgnoreCase("she"))
1108                )
1109              antecedents.add(currPronoun);
1110          }//while
1111        }//if
1112      }//if
1113
1114
1115      //3. depending on the mode, may have to restrict persons to these that precede/succeed
1116      //the quoted fragment
1117      //
1118      //for ANTEC_BEFORE, get the ones #preceding# the quote, contained in the sentence where
1119      //the quote *starts*
1120      //
1121      //for ANTEC_AFTER, get the ones #succeeding# the quote, contained in the sentence where
1122      //the quote *ends*
1123      //
1124      //for ANTEC_BACK, we are operating in the context of the sentence previous to the
1125      //sentence where the quote starts. I.e. we're resolbinf a case like
1126      // [sss "q1q1q1q1" s1s1s1s1]["q2q2q2q2"]
1127      //...and we want to get the entities from the s1s1 part - they *succeed* the #previous# quote
1128      //Note that the cirrent sentence is the first one, not the second
1129      //
1130      Iterator itPersons = antecedents.iterator();
1131
1132      while (itPersons.hasNext()) {
1133        Annotation currPerson = (Annotation)itPersons.next();
1134
1135        //cut
1136        if (Quote.ANTEC_BEFORE == mode &&
1137            currPerson.getStartNode().getOffset().intValue() > getStartOffset().intValue()) {
1138          //restrict only to persosn preceding
1139          itPersons.remove();
1140        }
1141        else if (Quote.ANTEC_AFTER == mode &&
1142                currPerson.getStartNode().getOffset().intValue() < getEndOffset().intValue()) {
1143          //restrict only to persons succeeding the quote
1144          itPersons.remove();
1145        }
1146        else if (Quote.ANTEC_BACK == mode) {
1147          //this one is tricky
1148          //locate the quote previous to the one we're resolving
1149          //(since we're operating in the sentence previous to the quote being resolved
1150          //wew try to find if any quote (prevQuote) exist in this sentence and get the
1151          //persons succeeding it)
1152
1153          //get prev quote
1154          //is the curr quote the first one?
1155          if (quoteNumber >0) {
1156            Quote prevQuote = PronominalCoref.this.quotedText[quoteNumber-1];
1157
1158            //restrict to the succeeding persons
1159            if (currPerson.getStartNode().getOffset().longValue() < prevQuote.getEndOffset().longValue()) {
1160              itPersons.remove();
1161            }
1162          }
1163        }
1164      }
1165
1166      return antecedents;
1167    }
1168
1169    /** --- */
1170    public Long getStartOffset() {
1171      return this.quoteAnnotation.getStartNode().getOffset();
1172    }
1173
1174    /** --- */
1175    public Long getEndOffset() {
1176      return this.quoteAnnotation.getEndNode().getOffset();
1177    }
1178
1179    /** --- */
1180    public AnnotationSet getAntecedentCandidates(int type) {
1181
1182      switch(type) {
1183
1184        case ANTEC_AFTER:
1185          return this.antecedentsAfter;
1186
1187        case ANTEC_BEFORE:
1188          return this.antecedentsBefore;
1189
1190        case ANTEC_BACK:
1191          return this.antecedentsBackInContext;
1192
1193        default:
1194          throw new IllegalArgumentException();
1195      }
1196    }
1197
1198  }
1199
1200
1201  /** --- */
1202  private class Sentence {
1203
1204    /** --- */
1205    private int sentNumber;
1206    /** --- */
1207    private int paraNumber;
1208    /** --- */
1209    private Long startOffset;
1210    /** --- */
1211    private Long endOffset;
1212    /** --- */
1213    private AnnotationSet persons;
1214    /** --- */
1215    private AnnotationSet organizations;
1216    /** --- */
1217    private AnnotationSet locations;
1218
1219    /** --- */
1220    public Sentence(int sentNumber,
1221                    int paraNumber,
1222                    Long startOffset,
1223                    Long endOffset,
1224                    AnnotationSet persons,
1225                    AnnotationSet organizations,
1226                    AnnotationSet locations) {
1227
1228      this.sentNumber = sentNumber;
1229      this.paraNumber = paraNumber;
1230      this.startOffset = startOffset;
1231      this.endOffset = endOffset;
1232      this.persons = persons;
1233      this.organizations = organizations;
1234      this.locations = locations;
1235    }
1236
1237    /** --- */
1238    public Long getStartOffset() {
1239      return this.startOffset;
1240    }
1241
1242    /** --- */
1243    public Long getEndOffset() {
1244      return this.endOffset;
1245    }
1246
1247    /** --- */
1248    public AnnotationSet getPersons() {
1249      return this.persons;
1250    }
1251
1252    /** --- */
1253    public AnnotationSet getOrganizations() {
1254      return this.organizations;
1255    }
1256
1257    /** --- */
1258    public AnnotationSet getLocations() {
1259      return this.locations;
1260    }
1261  }
1262
1263}
1264