1   /*
2    *  PronominalCoref.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Marin Dimitrov, 30/Dec/2001
12   *
13   *  $Id: PronominalCoref.java,v 1.28 2002/12/02 06:39:55 marin Exp $
14   */
15  
16  package gate.creole.coref;
17  
18  import java.util.*;
19  import java.net.*;
20  
21  import junit.framework.*;
22  
23  import gate.*;
24  import gate.creole.*;
25  import gate.util.*;
26  import gate.annotation.*;
27  
28  public class PronominalCoref extends AbstractLanguageAnalyser
29                                implements ProcessingResource, ANNIEConstants{
30  
31    public static final String COREF_DOCUMENT_PARAMETER_NAME = "document";
32  
33    public static final String COREF_ANN_SET_PARAMETER_NAME = "annotationSetName";
34  
35    /** --- */
36    private static final boolean DEBUG = false;
37  
38    //JAPE grammars
39    private static final String QT_GRAMMAR_URL = "gate://gate/creole/coref/quoted_text.jape";
40    private static final String PLEON_GRAMMAR_URL = "gate://gate/creole/coref/pleonasm.jape";
41  
42    //annotation types
43    private static final String QUOTED_TEXT_TYPE = "QuotedText";
44    private static final String PLEONASTIC_TYPE = "PleonasticIt";
45  
46    //annotation features
47    private static final String PRP_CATEGORY = "PRP";
48    private static final String PRP$_CATEGORY = "PRP$";
49  
50    //scope
51    private static final int SENTENCES_IN_SCOPE = 3;
52    /** --- */
53    private static AnnotationOffsetComparator ANNOTATION_OFFSET_COMPARATOR;
54    /** --- */
55    private String annotationSetName;
56    /** --- */
57    private Transducer qtTransducer;
58    /** --- */
59    private Transducer pleonTransducer;
60    /** --- */
61    private AnnotationSet defaultAnnotations;
62    /** --- */
63    private Sentence[] textSentences;
64    /** --- */
65    private Quote[] quotedText;
66    /** --- */
67    private Annotation[] pleonasticIt;
68    /** --- */
69    private HashMap personGender;
70    /** --- */
71    private HashMap anaphor2antecedent;
72    /** --- */
73    private static final FeatureMap PRP_RESTRICTION;
74  
75    private boolean resolveIt = true;
76  
77    /** --- */
78    static {
79      ANNOTATION_OFFSET_COMPARATOR = new AnnotationOffsetComparator();
80      PRP_RESTRICTION = new SimpleFeatureMapImpl();
81      PRP_RESTRICTION.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY);
82    }
83  
84    /** --- */
85    public PronominalCoref() {
86  
87      this.personGender = new HashMap();
88      this.anaphor2antecedent = new HashMap();
89      this.qtTransducer = new gate.creole.Transducer();
90      this.pleonTransducer = new gate.creole.Transducer();
91    }
92  
93    /** Initialise this resource, and return it. */
94    public Resource init() throws ResourceInstantiationException {
95  
96      //0. preconditions
97      Assert.assertNotNull(this.qtTransducer);
98  
99      //1. initialise quoted text transducer
100     URL qtGrammarURL = null;
101     try {
102       qtGrammarURL = new URL(QT_GRAMMAR_URL);
103     }
104     catch(MalformedURLException mue) {
105       throw new ResourceInstantiationException(mue);
106     }
107     this.qtTransducer.setGrammarURL(qtGrammarURL);
108     this.qtTransducer.setEncoding("UTF-8");
109     this.qtTransducer.init();
110 
111     //2. initialise pleonastic transducer
112     URL pleonGrammarURL = null;
113     try {
114       pleonGrammarURL = new URL(PLEON_GRAMMAR_URL);
115     }
116     catch(MalformedURLException mue) {
117       throw new ResourceInstantiationException(mue);
118     }
119     this.pleonTransducer.setGrammarURL(pleonGrammarURL);
120     this.pleonTransducer.setEncoding("UTF-8");
121     this.pleonTransducer.init();
122 
123 
124     //3. delegate
125     return super.init();
126   } // init()
127 
128   /**
129    * Reinitialises the processing resource. After calling this method the
130    * resource should be in the state it is after calling init.
131    * If the resource depends on external resources (such as rules files) then
132    * the resource will re-read those resources. If the data used to create
133    * the resource has changed since the resource has been created then the
134    * resource will change too after calling reInit().
135   */
136   public void reInit() throws ResourceInstantiationException {
137 
138     if (null != this.qtTransducer) {
139       this.qtTransducer.reInit();
140     }
141 
142     if (null != this.pleonTransducer) {
143       this.pleonTransducer.reInit();
144     }
145 
146     init();
147   } // reInit()
148 
149 
150   /** Set the document to run on. */
151   public void setDocument(Document newDocument) {
152 
153     //0. precondition
154 //    Assert.assertNotNull(newDocument);
155 
156     //1. set doc for aggregated components
157     this.qtTransducer.setDocument(newDocument);
158     this.pleonTransducer.setDocument(newDocument);
159 
160     //3. delegate
161     super.setDocument(newDocument);
162   }
163 
164   /** --- */
165   public void setAnnotationSetName(String annotationSetName) {
166     this.annotationSetName = annotationSetName;
167   }
168 
169 
170   /** --- */
171   public String getAnnotationSetName() {
172     return annotationSetName;
173   }
174 
175   /** --- */
176   public void setResolveIt(Boolean newValue) {
177     this.resolveIt = newValue.booleanValue();
178   }
179 
180   /** --- */
181   public Boolean getResolveIt() {
182     return new Boolean(this.resolveIt);
183   }
184 
185 
186   /**
187    * This method runs the coreferencer. It assumes that all the needed parameters
188    * are set. If they are not, an exception will be fired.
189    */
190   public void execute() throws ExecutionException{
191 
192     //0. preconditions
193     if(null == this.document) {
194       throw new ExecutionException("[coreference] Document is not set!");
195     }
196 
197     //1. preprocess
198     preprocess();
199 /*
200     //2. remove corefs from previous run
201     String annSetName = this.annotationSetName == null ? "COREF"
202                                                        : this.annotationSetName;
203 
204     AnnotationSet corefSet = this.document.getAnnotations(annSetName);
205     if (false == corefSet.isEmpty()) {
206       corefSet.clear();
207     }
208 */
209     //3.get personal pronouns
210     FeatureMap constraintPRP = new SimpleFeatureMapImpl();
211     constraintPRP.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY);
212     AnnotationSet personalPronouns = this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,constraintPRP);
213 
214     //4.get possesive pronouns
215     FeatureMap constraintPRP$ = new SimpleFeatureMapImpl();
216     constraintPRP$.put(TOKEN_CATEGORY_FEATURE_NAME,PRP$_CATEGORY);
217     AnnotationSet possesivePronouns = this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,constraintPRP$);
218 
219     //5.combine them
220     AnnotationSet pronouns = personalPronouns;
221     if (null == personalPronouns) {
222       pronouns = possesivePronouns;
223     }
224     else if (null != possesivePronouns) {
225       pronouns.addAll(possesivePronouns);
226     }
227 
228     //6.do we have pronouns at all?
229     if (null == pronouns) {
230       //do nothing
231       return;
232     }
233 
234     //7.sort them according to offset
235     Object[] arrPronouns = pronouns.toArray();
236     java.util.Arrays.sort(arrPronouns,ANNOTATION_OFFSET_COMPARATOR);
237 
238     //8.cleanup - ease the GC
239     pronouns = personalPronouns = possesivePronouns = null;
240 
241     int prnSentIndex = 0;
242 
243 
244     //10. process all pronouns
245     for (int i=0; i< arrPronouns.length; i++) {
246       Annotation currPronoun = (Annotation)arrPronouns[i];
247       while (this.textSentences[prnSentIndex].getEndOffset().longValue() <
248                                       currPronoun.getEndNode().getOffset().longValue()) {
249         prnSentIndex++;
250       }
251 
252       Sentence currSentence = this.textSentences[prnSentIndex];
253       Assert.assertTrue(currSentence.getStartOffset().longValue() <= currPronoun.getStartNode().getOffset().longValue());
254       Assert.assertTrue(currSentence.getEndOffset().longValue() >= currPronoun.getEndNode().getOffset().longValue());
255 
256       //11. find antecedent (if any) for pronoun
257       Annotation antc = findAntecedent(currPronoun,prnSentIndex);
258 
259       //12. add to the ana2ant hashtable
260       this.anaphor2antecedent.put(currPronoun,antc);
261     }
262 
263     //done
264   }
265 
266 
267   /** --- */
268   public HashMap getResolvedAnaphora() {
269     return this.anaphor2antecedent;
270   }
271 
272   /** --- */
273   private Annotation findAntecedent(Annotation currPronoun,int prnSentIndex) {
274 
275     //0. preconditions
276     Assert.assertNotNull(currPronoun);
277     Assert.assertTrue(prnSentIndex >= 0);
278     Assert.assertTrue(currPronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
279     Assert.assertTrue(currPronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
280                       currPronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
281 
282     //1.
283     String strPronoun = (String)currPronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
284 
285     Assert.assertNotNull(strPronoun);
286 
287     //2. delegate processing to the appropriate methods
288     if (strPronoun.equalsIgnoreCase("HE") ||
289         strPronoun.equalsIgnoreCase("HIM") ||
290         strPronoun.equalsIgnoreCase("HIS") ||
291         strPronoun.equalsIgnoreCase("HIMSELF")) {
292       return _resolve$HE$HIM$HIS$HIMSELF$(currPronoun,prnSentIndex);
293     }
294     else if (strPronoun.equalsIgnoreCase("SHE") ||
295               strPronoun.equalsIgnoreCase("HER")) {
296       return _resolve$SHE$HER$(currPronoun,prnSentIndex);
297     }
298     else if (strPronoun.equalsIgnoreCase("IT") ||
299               strPronoun.equalsIgnoreCase("ITS") ||
300               strPronoun.equalsIgnoreCase("ITSELF")) {
301       return _resolve$IT$ITS$ITSELF$(currPronoun,prnSentIndex);
302     }
303     else if (strPronoun.equalsIgnoreCase("I") ||
304               strPronoun.equalsIgnoreCase("ME") ||
305               strPronoun.equalsIgnoreCase("MY") ||
306               strPronoun.equalsIgnoreCase("MYSELF")) {
307       return _resolve$I$ME$MY$MYSELF$(currPronoun,prnSentIndex);
308     }
309     else {
310       if (DEBUG) {
311         gate.util.Err.println("["+strPronoun+"] is not handled yet...");
312       }
313       return null;
314     }
315   }
316 
317 
318   boolean isPleonastic(Annotation pronoun) {
319 
320     //0. preconditions
321     Assert.assertNotNull(pronoun);
322     String str = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
323     Assert.assertTrue(str.equalsIgnoreCase("IT"));
324 
325     //1. do we have pleonasms in this text?
326     if (this.pleonasticIt.length == 0) {
327       return false;
328     }
329 
330     //2. find closest pleonasm index
331     int closestPleonasmIndex = java.util.Arrays.binarySearch(this.pleonasticIt,
332                                                              pronoun,
333                                                              ANNOTATION_OFFSET_COMPARATOR);
334     //normalize index
335     if (closestPleonasmIndex < 0) {
336       closestPleonasmIndex = -closestPleonasmIndex -1 -1;
337     }
338 
339     //still not good?
340     if (closestPleonasmIndex < 0) {
341       closestPleonasmIndex = 0;
342     }
343 
344     //get closest pleonasm
345     Annotation pleonasm = this.pleonasticIt[closestPleonasmIndex];
346 
347 //System.out.println(pleonasm);
348 //System.out.println(pronoun);
349 
350     //3. return true only if the proboun is contained in pleonastic fragment
351     boolean result =  (pleonasm.getStartNode().getOffset().intValue() <= pronoun.getStartNode().getOffset().intValue()
352             &&
353             pleonasm.getEndNode().getOffset().intValue() >= pronoun.getEndNode().getOffset().intValue());
354 //System.out.println("is pleon=["+result+"]");
355     return result;
356   }
357 
358 
359   /** --- */
360   private Annotation _resolve$HE$HIM$HIS$HIMSELF$(Annotation pronoun, int sentenceIndex) {
361 
362     //0. preconditions
363     Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
364     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
365                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
366     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
367     Assert.assertTrue(pronounString.equalsIgnoreCase("HE") ||
368                       pronounString.equalsIgnoreCase("HIM") ||
369                       pronounString.equalsIgnoreCase("HIS") ||
370                       pronounString.equalsIgnoreCase("HIMSELF"));
371 
372     //1.
373     boolean antecedentFound = false;
374     int scopeFirstIndex = sentenceIndex - SENTENCES_IN_SCOPE;
375     if (scopeFirstIndex < 0 ) scopeFirstIndex = 0;
376 
377     int currSentenceIndex = sentenceIndex;
378     Annotation bestAntecedent = null;
379 
380     while (currSentenceIndex >= scopeFirstIndex || antecedentFound == false) {
381       Sentence currSentence = this.textSentences[currSentenceIndex];
382       AnnotationSet persons = currSentence.getPersons();
383 
384       Iterator it = persons.iterator();
385       while (it.hasNext()) {
386         Annotation currPerson = (Annotation)it.next();
387         String gender = (String)this.personGender.get(currPerson);
388 
389         if (null == gender ||
390             gender.equalsIgnoreCase("MALE") ||
391             gender.equalsIgnoreCase("UNKNOWN")) {
392           //hit
393           antecedentFound = true;
394 
395           if (null == bestAntecedent) {
396             bestAntecedent = currPerson;
397           }
398           else {
399             bestAntecedent = _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(bestAntecedent,currPerson,pronoun);
400           }
401         }
402       }
403 
404       if (0 == currSentenceIndex--)
405         break;
406 
407     }
408 
409     return bestAntecedent;
410   }
411 
412 
413   /** --- */
414   private Annotation _resolve$SHE$HER$(Annotation pronoun, int sentenceIndex) {
415 
416     //0. preconditions
417     Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
418     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
419                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
420     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
421     Assert.assertTrue(pronounString.equalsIgnoreCase("SHE") ||
422                       pronounString.equalsIgnoreCase("HER"));
423 
424     //1.
425     boolean antecedentFound = false;
426     int scopeFirstIndex = sentenceIndex - SENTENCES_IN_SCOPE;
427     if (scopeFirstIndex < 0 ) scopeFirstIndex = 0;
428     int currSentenceIndex = sentenceIndex;
429     Annotation bestAntecedent = null;
430 
431     while (currSentenceIndex >= scopeFirstIndex || antecedentFound == false) {
432       Sentence currSentence = this.textSentences[currSentenceIndex];
433       AnnotationSet persons = currSentence.getPersons();
434 
435       Iterator it = persons.iterator();
436       while (it.hasNext()) {
437         Annotation currPerson = (Annotation)it.next();
438         String gender = (String)this.personGender.get(currPerson);
439 
440         if (null == gender ||
441             gender.equalsIgnoreCase("FEMALE") ||
442             gender.equalsIgnoreCase("UNKNOWN")) {
443           //hit
444           antecedentFound = true;
445 
446           if (null == bestAntecedent) {
447             bestAntecedent = currPerson;
448           }
449           else {
450             bestAntecedent = _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(bestAntecedent,currPerson,pronoun);
451           }
452         }
453       }
454 
455       if (0 == currSentenceIndex--)
456         break;
457     }
458 
459     return bestAntecedent;
460   }
461 
462 
463   /** --- */
464   private Annotation _resolve$IT$ITS$ITSELF$(Annotation pronoun, int sentenceIndex) {
465     //do not resolve it pronouns if disabled by the user
466     if (! resolveIt)
467       return null;
468 
469     //0. preconditions
470     Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
471     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
472                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
473     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
474     Assert.assertTrue(pronounString.equalsIgnoreCase("IT") ||
475                       pronounString.equalsIgnoreCase("ITS") ||
476                       pronounString.equalsIgnoreCase("ITSELF"));
477 
478     //0.5 check if the IT is pleonastic
479     if (pronounString.equalsIgnoreCase("IT") &&
480         isPleonastic(pronoun)) {
481 //System.out.println("PLEONASM...");
482       return null;
483     }
484 
485     //1.
486     int scopeFirstIndex = sentenceIndex - 1;
487     if (scopeFirstIndex < 0 ) scopeFirstIndex = 0;
488 
489     int currSentenceIndex = sentenceIndex;
490     Annotation bestAntecedent = null;
491 
492     while (currSentenceIndex >= scopeFirstIndex) {
493 
494       Sentence currSentence = this.textSentences[currSentenceIndex];
495       AnnotationSet org = currSentence.getOrganizations();
496       AnnotationSet loc = currSentence.getLocations();
497       //combine them
498       AnnotationSet org_loc = org;
499       org_loc.addAll(loc);
500 
501       Iterator it = org_loc.iterator();
502       while (it.hasNext()) {
503         Annotation currOrgLoc = (Annotation)it.next();
504 
505         if (null == bestAntecedent) {
506           //discard cataphoric references
507           if (currOrgLoc.getStartNode().getOffset().longValue() <
508                                           pronoun.getStartNode().getOffset().longValue()) {
509             bestAntecedent = currOrgLoc;
510           }
511         }
512         else {
513           bestAntecedent = this._chooseAntecedent$IT$ITS$ITSELF$(bestAntecedent,currOrgLoc,pronoun);
514         }
515       }
516 
517       if (0 == currSentenceIndex--)
518         break;
519     }
520 
521     return bestAntecedent;
522   }
523 
524 
525   /** --- */
526   private Annotation _resolve$I$ME$MY$MYSELF$(Annotation pronoun, int sentenceIndex) {
527 
528     //0. preconditions
529     Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
530     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
531                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
532     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
533     Assert.assertTrue(pronounString.equalsIgnoreCase("I") ||
534                       pronounString.equalsIgnoreCase("MY") ||
535                       pronounString.equalsIgnoreCase("ME") ||
536                       pronounString.equalsIgnoreCase("MYSELF"));
537 
538     //0.5 sanity check
539     //if there are not quotes at all in the text then exit
540     if (0 == this.quotedText.length) {
541 //System.out.println("TEXT WITH NO QUOTES ENCOUNTERED...");
542       return null;
543     }
544 
545 
546     //1.
547     Annotation bestAntecedent = null;
548 
549     int closestQuoteIndex = java.util.Arrays.binarySearch(this.quotedText,pronoun,ANNOTATION_OFFSET_COMPARATOR);
550     //normalize index
551     if (closestQuoteIndex < 0) {
552       closestQuoteIndex = -closestQuoteIndex -1 -1;
553     }
554 
555     //still not good?
556     if (closestQuoteIndex < 0) {
557       closestQuoteIndex = 0;
558     }
559 
560     //get closest Quote
561     Quote quoteContext = this.quotedText[closestQuoteIndex];
562 
563     //assure that the pronoun is contained in the quoted text fragment
564     //otherwise exit
565 
566     if (pronoun.getStartNode().getOffset().intValue() > quoteContext.getEndOffset().intValue() ||
567         pronoun.getEndNode().getOffset().intValue() < quoteContext.getStartOffset().intValue()) {
568       //oops, probably incorrect text - I/My/Me is not part of quoted text fragment
569       //exit
570 //System.out.println("Oops! ["+pronounString+"] not part of quoted fragment...");
571       return null;
572     }
573 
574     //get the Persons that precede/succeed the quoted fragment
575     //the order is:
576     //
577     //[1]. if there exists a Person or pronoun in {he, she} following the quoted fragment but
578     //in the same sentence, then use it
579     //i.e.  ["PRN1(x)...", said X ...A, B, C ....]
580     //
581     //[2]. if there is a Person (NOT a pronoun) in the same sentence,
582     // preceding the quote, then use it
583     //i.e. . [A, B, C...X ..."PRN1(x) ..."...]
584     //
585 
586     //try [1]
587     //get the succeeding Persons/pronouns
588     AnnotationSet succCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_AFTER);
589     if (false == succCandidates.isEmpty()) {
590       //cool, we have candidates, pick up the one closest to the end quote
591       Iterator it = succCandidates.iterator();
592 
593       while (it.hasNext()) {
594         Annotation currCandidate = (Annotation)it.next();
595         if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) > 0) {
596           //wow, we have a candidate that is closer to the quote
597           bestAntecedent = currCandidate;
598         }
599       }
600     }
601 
602     //try [2]
603     //get the preceding Persons/pronouns
604     if (null == bestAntecedent) {
605       AnnotationSet precCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_BEFORE);
606       if (false == precCandidates.isEmpty()) {
607         //cool, we have candidates, pick up the one closest to the end quote
608         Iterator it = precCandidates.iterator();
609 
610         while (it.hasNext()) {
611           Annotation currCandidate = (Annotation)it.next();
612           if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) < 0) {
613             //wow, we have a candidate that is closer to the quote
614             bestAntecedent = currCandidate;
615           }
616         }
617       }
618     }
619 
620     //try [3]
621     //get the Persons/pronouns back in context
622     if (null == bestAntecedent) {
623       AnnotationSet precCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_BACK);
624       if (false == precCandidates.isEmpty()) {
625         //cool, we have candidates, pick up the one closest to the end quote
626         Iterator it = precCandidates.iterator();
627 
628         while (it.hasNext()) {
629           Annotation currCandidate = (Annotation)it.next();
630           if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) > 0) {
631             //wow, we have a candidate that is closer to the quote
632             bestAntecedent = currCandidate;
633           }
634         }
635       }
636     }
637 
638     return bestAntecedent;
639   }
640 
641 
642   /** --- */
643   private void preprocess() throws ExecutionException {
644 
645     //0.5 cleanup
646     this.personGender.clear();
647     this.anaphor2antecedent.clear();
648 
649     //1.get all annotation in the input set
650     if ( this.annotationSetName == null || this.annotationSetName.equals("")) {
651       this.defaultAnnotations = this.document.getAnnotations();
652     }
653     else {
654       this.defaultAnnotations = this.document.getAnnotations(annotationSetName);
655     }
656 
657     //if none found, print warning and exit
658     if (this.defaultAnnotations == null || this.defaultAnnotations.isEmpty()) {
659       Err.prln("Coref Warning: No annotations found for processing!");
660       return;
661     }
662 
663 
664 
665     //2.1 remove QT annotations if left from previous execution
666     AnnotationSet qtSet = this.defaultAnnotations.get(QUOTED_TEXT_TYPE);
667     if (null != qtSet) {
668       qtSet.clear();
669     }
670 
671     //2.2. run quoted text transducer to generate "Quoted Text" annotations
672     this.qtTransducer.execute();
673 
674     //3.1 remove pleonastic annotations if left from previous execution
675     AnnotationSet pleonSet = this.defaultAnnotations.get(PLEONASTIC_TYPE);
676     if (null != pleonSet) {
677       pleonSet.clear();
678     }
679 
680     //3.2 run quoted text transducer to generate "Pleonasm" annotations
681     this.pleonTransducer.execute();
682 
683     //4.get all SENTENCE annotations
684     AnnotationSet sentenceAnnotations = this.defaultAnnotations.get(SENTENCE_ANNOTATION_TYPE);
685 
686     this.textSentences = new Sentence[sentenceAnnotations.size()];
687     Object[]  sentenceArray = sentenceAnnotations.toArray();
688 
689     java.util.Arrays.sort(sentenceArray,ANNOTATION_OFFSET_COMPARATOR);
690 
691     for (int i=0; i< sentenceArray.length; i++) {
692 
693       Annotation currSentence = (Annotation)sentenceArray[i];
694       Long sentStartOffset = currSentence.getStartNode().getOffset();
695       Long sentEndOffset = currSentence.getEndNode().getOffset();
696 
697       //4.1. get PERSOSNS in this sentence
698       AnnotationSet sentPersons = this.defaultAnnotations.get(PERSON_ANNOTATION_TYPE,
699                                                               sentStartOffset,
700                                                               sentEndOffset);
701 
702       //4.2. get ORGANIZATIONS in this sentence
703       AnnotationSet sentOrgs = this.defaultAnnotations.get(ORGANIZATION_ANNOTATION_TYPE,
704                                                               sentStartOffset,
705                                                               sentEndOffset);
706 
707       //4.3. get LOCATION in this sentence
708       AnnotationSet sentLocs = this.defaultAnnotations.get(LOCATION_ANNOTATION_TYPE,
709                                                               sentStartOffset,
710                                                               sentEndOffset);
711 
712       //4.5. create a Sentence for thei SENTENCE annotation
713       this.textSentences[i] = new Sentence(i,
714                                             0,
715                                             sentStartOffset,
716                                             sentEndOffset,
717                                             sentPersons,
718                                             sentOrgs,
719                                             sentLocs
720                                   );
721 
722       //4.6. for all PERSONs in the sentence - find their gender using the
723       //orthographic coreferences if the gender of some entity is unknown
724       Iterator itPersons = sentPersons.iterator();
725       while (itPersons.hasNext()) {
726         Annotation currPerson = (Annotation)itPersons.next();
727         String gender = this.findPersonGender(currPerson);
728         this.personGender.put(currPerson,gender);
729       }
730     }
731 
732     //5. initialise the quoted text fragments
733     AnnotationSet sentQuotes = this.defaultAnnotations.get(QUOTED_TEXT_TYPE);
734 
735     //if none then return
736     if (null == sentQuotes) {
737       this.quotedText = new Quote[0];
738     }
739     else {
740       this.quotedText = new Quote[sentQuotes.size()];
741 
742       Object[] quotesArray = sentQuotes.toArray();
743       java.util.Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR);
744 
745       for (int i =0; i < quotesArray.length; i++) {
746         this.quotedText[i] = new Quote((Annotation)quotesArray[i],i);
747       }
748     }
749 
750     //6. initialuse the plonastic It annotations
751     AnnotationSet plaonasticSet = this.defaultAnnotations.get(PLEONASTIC_TYPE);
752 
753     if (null == plaonasticSet) {
754       this.pleonasticIt = new Annotation[0];
755     }
756     else {
757       this.pleonasticIt = new Annotation[plaonasticSet.size()];
758 
759       Object[] quotesArray = plaonasticSet.toArray();
760       java.util.Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR);
761 
762       for (int i=0; i< this.pleonasticIt.length; i++) {
763         this.pleonasticIt[i] = (Annotation)quotesArray[i];
764       }
765     }
766 
767   }
768 
769 
770   /** --- */
771   private String findPersonGender(Annotation person) {
772 
773     String result = (String)person.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
774 
775     if (null==result) {
776       //gender is unknown - try to find it from the ortho coreferences
777       List orthoMatches  = (List)person.getFeatures().get(ANNOTATION_COREF_FEATURE_NAME);
778 
779       if (null != orthoMatches) {
780         Iterator itMatches = orthoMatches.iterator();
781 
782         while (itMatches.hasNext()) {
783           Integer correferringID = (Integer)itMatches.next();
784           Annotation coreferringEntity = this.defaultAnnotations.get(correferringID);
785           Assert.assertTrue(coreferringEntity.getType().equalsIgnoreCase(PERSON_ANNOTATION_TYPE));
786           String correferringGender = (String)coreferringEntity.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
787 
788           if (null != correferringGender) {
789             result = correferringGender;
790             break;
791           }
792         }
793       }
794     }
795 
796     return result;
797   }
798 
799 
800   /** --- */
801   private static class AnnotationOffsetComparator implements Comparator {
802 
803     private int _getOffset(Object o) {
804 
805       if (o instanceof Annotation) {
806         return ((Annotation)o).getEndNode().getOffset().intValue();
807       }
808       else if (o instanceof Sentence) {
809         return ((Sentence)o).getStartOffset().intValue();
810       }
811       else if (o instanceof Quote) {
812         return ((Quote)o).getStartOffset().intValue();
813       }
814       else if (o instanceof Node) {
815         return ((Node)o).getOffset().intValue();
816       }
817       else {
818         throw new IllegalArgumentException();
819       }
820     }
821 
822     public int compare(Object o1,Object o2) {
823 
824       //0. preconditions
825       Assert.assertNotNull(o1);
826       Assert.assertNotNull(o2);
827       Assert.assertTrue(o1 instanceof Annotation ||
828                         o1 instanceof Sentence ||
829                         o1 instanceof Quote ||
830                         o1 instanceof Node);
831       Assert.assertTrue(o2 instanceof Annotation ||
832                         o2 instanceof Sentence ||
833                         o2 instanceof Quote ||
834                         o2 instanceof Node);
835 
836       int offset1 = _getOffset(o1);
837       int offset2 = _getOffset(o2);
838 
839       return offset1 - offset2;
840     }
841   }
842 
843 
844   /** --- */
845   private Annotation _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(Annotation ant1, Annotation ant2, Annotation pronoun) {
846 
847     //0. preconditions
848     Assert.assertNotNull(ant1);
849     Assert.assertNotNull(ant2);
850     Assert.assertNotNull(pronoun);
851     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
852                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
853     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
854     Assert.assertTrue(pronounString.equalsIgnoreCase("SHE") ||
855                       pronounString.equalsIgnoreCase("HER") ||
856                       pronounString.equalsIgnoreCase("HE") ||
857                       pronounString.equalsIgnoreCase("HIM") ||
858                       pronounString.equalsIgnoreCase("HIS") ||
859                       pronounString.equalsIgnoreCase("HIMSELF"));
860 
861     Long offset1 = ant1.getStartNode().getOffset();
862     Long offset2 = ant2.getStartNode().getOffset();
863     Long offsetPrn = pronoun.getStartNode().getOffset();
864 
865     long diff1 = offsetPrn.longValue() - offset1.longValue();
866     long diff2 = offsetPrn.longValue() - offset2.longValue();
867 //    Assert.assertTrue(diff1 != 0 && diff2 != 0);
868     //reject candidates that overlap with the pronoun
869     if (diff1 == 0) {
870       return ant2;
871     }
872     else if (diff2 == 0) {
873       return ant1;
874     }
875 
876     //get the one CLOSEST AND PRECEDING the pronoun
877     if (diff1 > 0 && diff2 > 0) {
878       //we have [...antecedentA...AntecedentB....pronoun...] ==> choose B
879       if (diff1 < diff2)
880         return ant1;
881       else
882         return ant2;
883     }
884     else if (diff1 < 0 && diff2 < 0) {
885       //we have [...pronoun ...antecedentA...AntecedentB.......] ==> choose A
886       if (Math.abs(diff1) < Math.abs(diff2))
887         return ant1;
888       else
889           return ant2;
890     }
891     else {
892       Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2));
893       //we have [antecedentA...pronoun...AntecedentB] ==> choose A
894       if (diff1 > 0)
895         return ant1;
896       else
897         return ant2;
898     }
899   }
900 
901   /** --- */
902   private Annotation _chooseAntecedent$IT$ITS$ITSELF$(Annotation ant1, Annotation ant2, Annotation pronoun) {
903 
904     //0. preconditions
905     Assert.assertNotNull(ant1);
906     Assert.assertNotNull(ant2);
907     Assert.assertNotNull(pronoun);
908     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
909                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
910     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
911 
912     Assert.assertTrue(pronounString.equalsIgnoreCase("IT") ||
913                       pronounString.equalsIgnoreCase("ITS") ||
914                       pronounString.equalsIgnoreCase("ITSELF"));
915 
916     Long offset1 = ant1.getStartNode().getOffset();
917     Long offset2 = ant2.getStartNode().getOffset();
918     Long offsetPrn = pronoun.getStartNode().getOffset();
919     long diff1 = offsetPrn.longValue() - offset1.longValue();
920     long diff2 = offsetPrn.longValue() - offset2.longValue();
921 //    Assert.assertTrue(diff1 != 0 && diff2 != 0);
922     //reject candidates that overlap with the pronoun
923     if (diff1 == 0) {
924       return ant2;
925     }
926     else if (diff2 == 0) {
927       return ant1;
928     }
929 
930 
931     //get the one CLOSEST AND PRECEDING the pronoun
932     if (diff1 > 0 && diff2 > 0) {
933       //we have [...antecedentA...AntecedentB....pronoun...] ==> choose B
934       if (diff1 < diff2)
935         return ant1;
936       else
937         return ant2;
938     }
939     else if (diff1 > 0){
940       Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2));
941       //we have [antecedentA...pronoun...AntecedentB] ==> choose A
942       return ant1;
943     }
944     else if (diff2 > 0){
945       Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2));
946       //we have [antecedentA...pronoun...AntecedentB] ==> choose A
947       return ant2;
948     }
949     else {
950       //both possible antecedents are BEHIND the anaophoric pronoun - i.e. we have either
951       //cataphora, or nominal antecedent, or an antecedent that is further back in scope
952       //in any case - discard the antecedents
953       return null;
954     }
955   }
956 
957 
958   /** --- */
959   private class Quote {
960 
961     /** --- */
962     public static final int ANTEC_AFTER = 1;
963     /** --- */
964     public static final int ANTEC_BEFORE = 2;
965     /** --- */
966     public static final int ANTEC_BACK = 3;
967     /** --- */
968     private AnnotationSet antecedentsBefore;
969     /** --- */
970     private AnnotationSet antecedentsAfter;
971     /** --- */
972     private AnnotationSet antecedentsBackInContext;
973     /** --- */
974     private Annotation quoteAnnotation;
975     /** --- */
976     private int quoteIndex;
977 
978     /** --- */
979     public Quote(Annotation quoteAnnotation, int index) {
980 
981       this.quoteAnnotation = quoteAnnotation;
982       this.quoteIndex = index;
983       init();
984     }
985 
986     /** --- */
987     private void init() {
988 
989       //0.preconditions
990       Assert.assertNotNull(textSentences);
991 
992       //0.5 create a restriction for PRP pos tokens
993       FeatureMap prpTokenRestriction = new SimpleFeatureMapImpl();
994       prpTokenRestriction.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY);
995 
996       //1. generate the precPersons set
997 
998       //1.1 locate the sentece containing the opening quote marks
999       int quoteStartPos = java.util.Arrays.binarySearch(textSentences,
1000                                                        this.quoteAnnotation.getStartNode(),
1001                                                        ANNOTATION_OFFSET_COMPARATOR);
1002
1003      //normalize index
1004      int startSentenceIndex = quoteStartPos >= 0 ? quoteStartPos
1005                                                  : -quoteStartPos -1 -1; // blame Sun, not me
1006      //still not good?
1007      if (startSentenceIndex < 0) {
1008        startSentenceIndex = 0;
1009      }
1010
1011      //1.2. get the persons and restrict to these that precede the quote (i.e. not contained
1012      //in the quote)
1013      this.antecedentsBefore = generateAntecedentCandidates(startSentenceIndex,
1014                                                            this.quoteIndex,
1015                                                            ANTEC_BEFORE);
1016
1017
1018      //2. generate the precPersonsInCOntext set
1019      //2.1. get the persons from the sentence precedeing the sentence containing the quote start
1020      if (startSentenceIndex > 0) {
1021        this.antecedentsBackInContext = generateAntecedentCandidates(startSentenceIndex -1,
1022                                                                    this.quoteIndex,
1023                                                                    ANTEC_BACK);
1024      }
1025
1026      //2. generate the succ  Persons set
1027      //2.1 locate the sentece containing the closing quote marks
1028      int quoteEndPos = java.util.Arrays.binarySearch(textSentences,
1029                                                        this.quoteAnnotation.getEndNode(),
1030                                                        ANNOTATION_OFFSET_COMPARATOR);
1031
1032      //normalize it
1033      int endSentenceIndex = quoteEndPos >= 0 ? quoteEndPos
1034                                              : -quoteEndPos -1 -1; // blame Sun, not me
1035      //still not good?
1036      if (endSentenceIndex < 0) {
1037        endSentenceIndex = 0;
1038      }
1039
1040      this.antecedentsAfter = generateAntecedentCandidates(endSentenceIndex,
1041                                                            this.quoteIndex,
1042                                                            ANTEC_AFTER);
1043      //generate t
1044    }
1045
1046
1047    /** --- */
1048    private AnnotationSet generateAntecedentCandidates(int sentenceNumber,
1049                                                        int quoteNumber ,
1050                                                        int mode) {
1051
1052      //0. preconditions
1053      Assert.assertTrue(sentenceNumber >=0);
1054      Assert.assertTrue(quoteNumber >=0);
1055      Assert.assertTrue(mode == Quote.ANTEC_AFTER ||
1056                        mode == Quote.ANTEC_BEFORE ||
1057                        mode == Quote.ANTEC_BACK);
1058
1059      //1. get sentence
1060     Sentence sentence = textSentences[sentenceNumber];
1061
1062      //2. get the persons
1063      AnnotationSet antecedents = new AnnotationSetImpl(sentence.getPersons());
1064
1065      //4. now get the he/she pronouns in the relevant context
1066      AnnotationSet annotations = null;
1067
1068      switch(mode) {
1069
1070        case ANTEC_BEFORE:
1071          annotations = defaultAnnotations.getContained(sentence.getStartOffset(),
1072                                                      this.getStartOffset());
1073          break;
1074
1075        case ANTEC_AFTER:
1076          annotations = defaultAnnotations.getContained(this.getEndOffset(),
1077                                                     sentence.getEndOffset());
1078          break;
1079
1080        case ANTEC_BACK:
1081          annotations = defaultAnnotations.getContained(sentence.getStartOffset(),
1082                                                     sentence.getEndOffset());
1083          break;
1084      }
1085
1086      //4. get the pronouns
1087      //restrict to he/she pronouns
1088      if (null != annotations) {
1089        AnnotationSet pronouns = annotations.get(TOKEN_ANNOTATION_TYPE,PRP_RESTRICTION);
1090
1091        if (null != pronouns) {
1092
1093          Iterator it = pronouns.iterator();
1094          while (it.hasNext()) {
1095            Annotation currPronoun = (Annotation)it.next();
1096            //add to succPersons only if HE/SHE
1097            String pronounString = (String)currPronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1098
1099            if (null != pronounString &&
1100                (pronounString.equalsIgnoreCase("he") || pronounString.equalsIgnoreCase("she"))
1101                )
1102              antecedents.add(currPronoun);
1103          }//while
1104        }//if
1105      }//if
1106
1107
1108      //3. depending on the mode, may have to restrict persons to these that precede/succeed
1109      //the quoted fragment
1110      //
1111      //for ANTEC_BEFORE, get the ones #preceding# the quote, contained in the sentence where
1112      //the quote *starts*
1113      //
1114      //for ANTEC_AFTER, get the ones #succeeding# the quote, contained in the sentence where
1115      //the quote *ends*
1116      //
1117      //for ANTEC_BACK, we are operating in the context of the sentence previous to the
1118      //sentence where the quote starts. I.e. we're resolbinf a case like
1119      // [sss "q1q1q1q1" s1s1s1s1]["q2q2q2q2"]
1120      //...and we want to get the entities from the s1s1 part - they *succeed* the #previous# quote
1121      //Note that the cirrent sentence is the first one, not the second
1122      //
1123      Iterator itPersons = antecedents.iterator();
1124
1125      while (itPersons.hasNext()) {
1126        Annotation currPerson = (Annotation)itPersons.next();
1127
1128        //cut
1129        if (Quote.ANTEC_BEFORE == mode &&
1130            currPerson.getStartNode().getOffset().intValue() > getStartOffset().intValue()) {
1131          //restrict only to persosn preceding
1132          itPersons.remove();
1133        }
1134        else if (Quote.ANTEC_AFTER == mode &&
1135                currPerson.getStartNode().getOffset().intValue() < getEndOffset().intValue()) {
1136          //restrict only to persons succeeding the quote
1137          itPersons.remove();
1138        }
1139        else if (Quote.ANTEC_BACK == mode) {
1140          //this one is tricky
1141          //locate the quote previous to the one we're resolving
1142          //(since we're operating in the sentence previous to the quote being resolved
1143          //wew try to find if any quote (prevQuote) exist in this sentence and get the
1144          //persons succeeding it)
1145
1146          //get prev quote
1147          //is the curr quote the first one?
1148          if (quoteNumber >0) {
1149            Quote prevQuote = PronominalCoref.this.quotedText[quoteNumber-1];
1150
1151            //restrict to the succeeding persons
1152            if (currPerson.getStartNode().getOffset().longValue() < prevQuote.getEndOffset().longValue()) {
1153              itPersons.remove();
1154            }
1155          }
1156        }
1157      }
1158
1159      return antecedents;
1160    }
1161
1162    /** --- */
1163    public Long getStartOffset() {
1164      return this.quoteAnnotation.getStartNode().getOffset();
1165    }
1166
1167    /** --- */
1168    public Long getEndOffset() {
1169      return this.quoteAnnotation.getEndNode().getOffset();
1170    }
1171
1172    /** --- */
1173    public AnnotationSet getAntecedentCandidates(int type) {
1174
1175      switch(type) {
1176
1177        case ANTEC_AFTER:
1178          return this.antecedentsAfter;
1179
1180        case ANTEC_BEFORE:
1181          return this.antecedentsBefore;
1182
1183        case ANTEC_BACK:
1184          return this.antecedentsBackInContext;
1185
1186        default:
1187          throw new IllegalArgumentException();
1188      }
1189    }
1190
1191  }
1192
1193
1194  /** --- */
1195  private class Sentence {
1196
1197    /** --- */
1198    private int sentNumber;
1199    /** --- */
1200    private int paraNumber;
1201    /** --- */
1202    private Long startOffset;
1203    /** --- */
1204    private Long endOffset;
1205    /** --- */
1206    private AnnotationSet persons;
1207    /** --- */
1208    private AnnotationSet organizations;
1209    /** --- */
1210    private AnnotationSet locations;
1211
1212    /** --- */
1213    public Sentence(int sentNumber,
1214                    int paraNumber,
1215                    Long startOffset,
1216                    Long endOffset,
1217                    AnnotationSet persons,
1218                    AnnotationSet organizations,
1219                    AnnotationSet locations) {
1220
1221      this.sentNumber = sentNumber;
1222      this.paraNumber = paraNumber;
1223      this.startOffset = startOffset;
1224      this.endOffset = endOffset;
1225      this.persons = persons;
1226      this.organizations = organizations;
1227      this.locations = locations;
1228    }
1229
1230    /** --- */
1231    public Long getStartOffset() {
1232      return this.startOffset;
1233    }
1234
1235    /** --- */
1236    public Long getEndOffset() {
1237      return this.endOffset;
1238    }
1239
1240    /** --- */
1241    public AnnotationSet getPersons() {
1242      return this.persons;
1243    }
1244
1245    /** --- */
1246    public AnnotationSet getOrganizations() {
1247      return this.organizations;
1248    }
1249
1250    /** --- */
1251    public AnnotationSet getLocations() {
1252      return this.locations;
1253    }
1254  }
1255
1256}
1257