1   /*
2    *  NominalCoref.java
3    *
4    *  Copyright (c) 1998-2002, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  $Id: NominalCoref.java,v 1.11 2003/01/27 17:12:31 diana Exp $
12   */
13  
14  package gate.creole.coref;
15  
16  import java.util.*;
17  import java.net.*;
18  
19  import junit.framework.*;
20  
21  import gate.*;
22  import gate.creole.*;
23  import gate.util.*;
24  import gate.annotation.*;
25  
26  public class NominalCoref extends AbstractCoreferencer
27      implements ProcessingResource, ANNIEConstants {
28  
29    public static final String COREF_DOCUMENT_PARAMETER_NAME = "document";
30  
31    public static final String COREF_ANN_SET_PARAMETER_NAME = "annotationSetName";
32  
33    /** --- */
34    private static final boolean DEBUG = false;
35  
36    //annotation features
37    private static final String PERSON_CATEGORY = "Person";
38    private static final String JOBTITLE_CATEGORY = "JobTitle";
39    private static final String ORGANIZATION_CATEGORY = "Organization";
40    private static final String LOOKUP_CATEGORY = "Lookup";
41    private static final String ORGANIZATION_NOUN_CATEGORY = "organization_noun";
42    
43  
44    //scope
45    /** --- */
46    //private static AnnotationOffsetComparator ANNOTATION_OFFSET_COMPARATOR;
47    /** --- */
48    private String annotationSetName;
49    /** --- */
50    private AnnotationSet defaultAnnotations;
51    /** --- */
52    private HashMap anaphor2antecedent;
53  
54      /*  static {
55      ANNOTATION_OFFSET_COMPARATOR = new AnnotationOffsetComparator();
56      }*/
57  
58    /** --- */
59    public NominalCoref() {
60      super("NOMINAL");
61      this.anaphor2antecedent = new HashMap();
62    }
63  
64    /** Initialise this resource, and return it. */
65    public Resource init() throws ResourceInstantiationException {
66      return super.init();
67    } // init()
68  
69    /**
70     * Reinitialises the processing resource. After calling this method the
71     * resource should be in the state it is after calling init.
72     * If the resource depends on external resources (such as rules files) then
73     * the resource will re-read those resources. If the data used to create
74     * the resource has changed since the resource has been created then the
75     * resource will change too after calling reInit().
76    */
77    public void reInit() throws ResourceInstantiationException {
78      this.anaphor2antecedent = new HashMap();
79      init();
80    } // reInit()
81  
82  
83    /** Set the document to run on. */
84    public void setDocument(Document newDocument) {
85  
86      //0. precondition
87  //    Assert.assertNotNull(newDocument);
88  
89      super.setDocument(newDocument);
90    }
91  
92    /** --- */
93    public void setAnnotationSetName(String annotationSetName) {
94      this.annotationSetName = annotationSetName;
95    }
96  
97    /** --- */
98    public String getAnnotationSetName() {
99      return annotationSetName;
100   }
101 
102   /**
103    * This method runs the coreferencer. It assumes that all the needed parameters
104    * are set. If they are not, an exception will be fired.
105    *
106    * The process goes like this:
107    * - Create a sorted list of Person and JobTitle annotations.
108    * - Loop through the annotations
109    *    If it is a Person, we add it to the top of a stack.
110    *    If it is a job title, we subject it to a series of tests. If it 
111    *      passes, we associate it with the Person annotation at the top
112    *      of the stack
113    */
114   public void execute() throws ExecutionException{
115 
116     HashMap anaphorToAntecedent = new HashMap();
117     Object[] nominalArray;
118 
119     //0. preconditions
120     if (null == this.document) {
121       throw new ExecutionException("[coreference] Document is not set!");
122     }
123 
124     //1. preprocess
125     preprocess();
126 
127     // Out.println("Total annotations: " + defaultAnnotations.size());
128 
129     // Get a sorted array of Tokens.
130     // The tests for job titles often require getting previous and subsequent
131     // tokens, so to save work, we create a single, sorted list of 
132     // tokens.
133     Object[] tokens = defaultAnnotations.get(TOKEN_ANNOTATION_TYPE).toArray();
134     java.util.Arrays.sort(tokens, new OffsetComparator());
135 
136     // The current token is the token at the start of the current annotation.
137     int currentToken = 0;
138 
139     // get Person entities
140     //FeatureMap personConstraint = new SimpleFeatureMapImpl();
141     //personConstraint.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME,
142     //                          PERSON_CATEGORY);
143     HashSet personConstraint = new HashSet();
144     personConstraint.add(PERSON_CATEGORY);
145     AnnotationSet people =
146       this.defaultAnnotations.get(personConstraint);
147 
148     // get all JobTitle entities
149     //FeatureMap constraintJobTitle = new SimpleFeatureMapImpl();
150     //constraintJobTitle.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, JOBTITLE_CATEGORY);
151     HashSet jobTitleConstraint = new HashSet();
152     jobTitleConstraint.add(JOBTITLE_CATEGORY);
153     
154     AnnotationSet jobTitles = 
155       this.defaultAnnotations.get(jobTitleConstraint);
156 
157     FeatureMap orgNounConstraint = new SimpleFeatureMapImpl();
158     orgNounConstraint.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME,
159                           ORGANIZATION_NOUN_CATEGORY);
160     AnnotationSet orgNouns =
161       this.defaultAnnotations.get(LOOKUP_CATEGORY, orgNounConstraint);
162 
163     HashSet orgConstraint = new HashSet();
164     orgConstraint.add(ORGANIZATION_CATEGORY);
165 
166     AnnotationSet organizations =
167       this.defaultAnnotations.get(orgConstraint);
168 
169     // combine them into a list of nominals
170     Set nominals = new HashSet();
171     if (people != null) {
172       nominals.addAll(people);
173     }
174     if (jobTitles != null) {
175       nominals.addAll(jobTitles);
176     }
177     if (orgNouns != null) {
178       nominals.addAll(orgNouns);
179     }
180     if (organizations != null) {
181       nominals.addAll(organizations);
182     }
183 
184     //  Out.println("total nominals: " + nominals.size());
185 
186     // sort them according to offset
187     nominalArray = nominals.toArray();
188     java.util.Arrays.sort(nominalArray, new OffsetComparator());
189     
190     ArrayList previousPeople = new ArrayList();
191     ArrayList previousOrgs = new ArrayList();
192     
193         
194     // process all nominals
195     for (int i=0; i<nominalArray.length; i++) {
196       Annotation nominal = (Annotation)nominalArray[i];
197       
198       // Find the current place in the tokens array
199       currentToken = advanceTokenPosition(nominal, currentToken, tokens);
200       
201       //Out.print("processing nominal [" + stringValue(nominal) + "] ");
202       
203       if (nominal.getType().equals(PERSON_CATEGORY)) {
204   // Add each Person entity to the beginning of the people list
205   // but don't add pronouns
206   Object[] personTokens = getSortedTokens(nominal);
207     
208   if (personTokens.length == 1) {
209     Annotation personToken = (Annotation) personTokens[0];
210     
211     String personCategory = (String) 
212       personToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME);
213     if (personCategory.equals("PP") ||
214         personCategory.equals("PRP") ||
215         personCategory.equals("PRP$") ||
216         personCategory.equals("PRPR$")) {
217         //Out.println("ignoring personal pronoun");
218         continue;
219     }
220   }
221   
222   previousPeople.add(0, nominal);
223   //Out.println("added person");
224       }
225       else if (nominal.getType().equals(JOBTITLE_CATEGORY)) {
226     
227   // Look into the tokens to get some info about POS.
228   Object[] jobTitleTokens = getSortedTokens(nominal);
229   
230   Annotation lastToken = (Annotation)
231     jobTitleTokens[jobTitleTokens.length - 1];
232   
233   // Don't associate if the job title is not a singular noun
234   String tokenCategory = (String) 
235     lastToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME);
236   // UNCOMMENT FOR SINGULAR PROPER NOUNS (The President, the Pope)
237   //if (! tokenCategory.equals("NN") &&
238   //! tokenCategory.equals("NNP")) {
239   if (! tokenCategory.equals("NN")) {
240       // Out.println("Not a singular noun");
241     continue;
242   }
243   
244   // Don't associate it if it's part of a Person (eg President Bush)
245   if (overlapsAnnotations(nominal, people)) {
246       //Out.println("overlapping annotation");
247     continue;
248   }
249 
250   Annotation previousToken;
251         String previousValue;
252 
253   // Don't associate it if it's proceeded by a generic marker
254         if (currentToken != 0) {
255           previousToken = (Annotation) tokens[currentToken - 1];
256           previousValue = (String) 
257       previousToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
258           if (previousValue.equalsIgnoreCase("a") ||
259               previousValue.equalsIgnoreCase("an") ||
260               previousValue.equalsIgnoreCase("other") ||
261               previousValue.equalsIgnoreCase("another")) {
262               //Out.println("indefinite");
263       continue;
264           }
265         }
266 
267   // nominals immediately followed by Person annotations:
268   // BAD:
269   //   Chairman Bill Gates               (title)
270   // GOOD:
271   //   secretary of state, Colin Powell  (inverted appositive)
272   //   the home secretary David Blunkett (same but no comma, 
273   //                                      possible in transcriptions)
274   // "the" is a good indicator for apposition
275   
276   // Luckily we have an array of all Person annotations in order...
277   if (i < nominalArray.length - 1) {
278     Annotation nextAnnotation = (Annotation) nominalArray[i+1];
279     if (nextAnnotation.getType().equals(PERSON_CATEGORY)) {
280       // is it preceded by a definite article?
281       previousToken = (Annotation) tokens[currentToken - 1];
282       previousValue = (String) 
283         previousToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
284       
285       // Get all tokens between this and the next person
286       int interveningTokens =
287         countInterveningTokens(nominal, nextAnnotation,
288              currentToken, tokens);
289       if (interveningTokens == 0 && 
290         ! previousValue.equalsIgnoreCase("the")) {
291       
292         // There is nothing between the job title and the person,
293         // like "Chairman Gates" -- do nothing.
294         //Out.println("immediately followed by Person");
295         continue;
296       }
297       else if (interveningTokens == 1) {
298         String tokenString =
299           (String) getFollowingToken(nominal,
300              currentToken, tokens)
301       .getFeatures().get(TOKEN_STRING_FEATURE_NAME);
302         //Out.print("STRING VALUE [" + tokenString + "] ");
303         if (! tokenString.equals(",") &&
304     ! tokenString.equals("-")) {
305     //Out.println("nominal and person separated by NOT [,-]");
306     continue;
307         }
308       }
309       
310       // Did we get through all that? Then we must have an 
311       // apposition.
312       
313       anaphor2antecedent.put(nominal, nextAnnotation);
314       //Out.println("associating with " +
315       //  stringValue(nextAnnotation));
316       continue;
317       
318     }
319   }
320   
321   // If we have no possible antecedents, create a new Person
322   // annotation.
323   if (previousPeople.size() == 0) {
324     FeatureMap personFeatures = new SimpleFeatureMapImpl();
325     personFeatures.put("ENTITY_MENTION_TYPE", "NOMINAL");
326     this.defaultAnnotations.add(nominal.getStartNode(),
327               nominal.getEndNode(),
328               PERSON_CATEGORY,
329               personFeatures);
330     //Out.println("creating as new Person");
331     continue;
332   }
333 
334   // Associate this entity with the most recent Person
335   int personIndex = 0;
336   
337   Annotation previousPerson =
338     (Annotation) previousPeople.get(personIndex);
339   
340   // Don't associate if the two nominals are not the same gender
341   String personGender = (String) 
342     previousPerson.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
343   String jobTitleGender = (String) 
344           nominal.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
345   if (personGender != null && jobTitleGender != null) {
346           if (! personGender.equals(jobTitleGender)) {
347             //Out.println("wrong gender: " + personGender + " " +
348             //            jobTitleGender);
349       continue;
350     }
351   }
352   
353   //Out.println("associating with " +
354   //  previousPerson.getFeatures()
355   //  .get(TOKEN_STRING_FEATURE_NAME));
356   
357   anaphor2antecedent.put(nominal, previousPerson);
358       }
359       else if (nominal.getType().equals(ORGANIZATION_CATEGORY)) {
360         // Add each organization entity to the beginning of
361   // the organization list
362   previousOrgs.add(0, nominal);
363   //Out.println("added organization");
364       }
365       else if (nominal.getType().equals(LOOKUP_CATEGORY)) {
366   // Don't associate it if we have no organizations
367   if (previousOrgs.size() == 0) {
368     //Out.println("no orgs");
369     continue;
370   }
371     
372   // Look into the tokens to get some info about POS.
373   Object[] orgNounTokens =
374     this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,
375               nominal.getStartNode().getOffset(),
376               nominal.getEndNode().getOffset()).toArray();
377   java.util.Arrays.sort(orgNounTokens, new OffsetComparator());
378   Annotation lastToken = (Annotation)
379     orgNounTokens[orgNounTokens.length - 1];
380   
381   // Don't associate if the org noun is not a singular noun
382   if (! lastToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME)
383       .equals("NN")) {
384       //Out.println("Not a singular noun");
385       continue;
386   }
387   
388   //Out.println("organization noun");
389   // Associate this entity with the most recent Person
390   anaphor2antecedent.put(nominal, previousOrgs.get(0));
391       }
392     }
393 
394     // This method does the dirty work of actually adding new annotations and
395     // coreferring.
396     generateCorefChains(anaphor2antecedent);
397   }
398 
399   /**
400    * This method specifies whether a given annotation overlaps any of a 
401    * set of annotations. For instance, JobTitles occasionally are
402    * part of Person annotations.
403    * 
404    */
405   private boolean overlapsAnnotations(Annotation a,
406                                       AnnotationSet annotations) {
407     Iterator iter = annotations.iterator();
408     while (iter.hasNext()) {
409       Annotation current = (Annotation) iter.next();
410       if (a.overlaps(current)) {
411         return true;
412       }
413     }
414       
415     return false;
416   }
417 
418   /** Use this method to keep the current token pointer at the right point
419    * in the token list */
420   private int advanceTokenPosition(Annotation target, int currentPosition,
421            Object[] tokens) {
422     long targetOffset = target.getStartNode().getOffset().longValue();
423     long currentOffset = ((Annotation) tokens[currentPosition])
424       .getStartNode().getOffset().longValue();
425     
426     if (targetOffset > currentOffset) {
427       while (targetOffset > currentOffset) {
428   currentPosition++;
429   currentOffset = ((Annotation) tokens[currentPosition])
430           .getStartNode().getOffset().longValue();
431       }
432     }
433     else if (targetOffset < currentOffset) {
434       while (targetOffset < currentOffset) {
435   currentPosition--;
436   currentOffset = ((Annotation) tokens[currentPosition])
437           .getStartNode().getOffset().longValue();
438       }
439     }
440     
441     return currentPosition;
442   }
443 
444   /** Return the number of tokens between the end of annotation 1 and the
445    * beginning of annotation 2. Will return 0 if they are not in order */
446   private int countInterveningTokens(Annotation first, Annotation second,
447              int currentPosition, Object[] tokens) {
448     int interveningTokens = 0;
449 
450     long startOffset = first.getEndNode().getOffset().longValue();
451     long endOffset = second.getStartNode().getOffset().longValue();
452     
453     long currentOffset = ((Annotation) tokens[currentPosition])
454       .getStartNode().getOffset().longValue();
455     
456     while (currentOffset < endOffset) {
457       if (currentOffset >= startOffset) {
458         interveningTokens++;
459       }
460       currentPosition++;
461       currentOffset = ((Annotation) tokens[currentPosition])
462   .getStartNode().getOffset().longValue();
463     }
464     return interveningTokens;
465   }
466 
467   /** Get the next token after an annotation */
468   private Annotation getFollowingToken(Annotation current, int currentPosition,
469                Object[] tokens) {
470     long endOffset = current.getEndNode().getOffset().longValue();
471     long currentOffset = ((Annotation) tokens[currentPosition])
472       .getStartNode().getOffset().longValue();
473     while (currentOffset < endOffset) {
474       currentPosition++;
475       currentOffset = ((Annotation) tokens[currentPosition])
476   .getStartNode().getOffset().longValue();
477     }
478     return (Annotation) tokens[currentPosition];
479   }
480   
481   /** Get the text of an annotation */
482   private String stringValue(Annotation ann) {
483     Object[] tokens = getSortedTokens(ann);
484   
485     StringBuffer output = new StringBuffer();
486     for (int i=0;i<tokens.length;i++) {
487       Annotation token = (Annotation) tokens[i];
488       output.append(token.getFeatures().get(TOKEN_STRING_FEATURE_NAME));
489       if (i < tokens.length - 1) {
490         output.append(" ");
491       }
492     }
493     return output.toString();
494   }
495     
496   /** Get a sorted array of the tokens that make up a given annotation. */
497   private Object[] getSortedTokens(Annotation a) {
498     Object[] annotationTokens =
499       this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,
500           a.getStartNode().getOffset(),
501           a.getEndNode().getOffset()).toArray();
502     java.util.Arrays.sort(annotationTokens, new OffsetComparator());
503     
504     return annotationTokens;
505   }
506   
507   /** --- */
508   public HashMap getResolvedAnaphora() {
509     return this.anaphor2antecedent;
510   }
511 
512   /** --- */
513   private void preprocess() throws ExecutionException {
514 
515     //0.5 cleanup
516     this.anaphor2antecedent.clear();
517 
518     //1.get all annotation in the input set
519     if ( this.annotationSetName == null || this.annotationSetName.equals("")) {
520       this.defaultAnnotations = this.document.getAnnotations();
521     }
522     else {
523       this.defaultAnnotations = this.document.getAnnotations(annotationSetName);
524     }
525 
526     //if none found, print warning and exit
527     if (this.defaultAnnotations == null || this.defaultAnnotations.isEmpty()) {
528       Err.prln("Coref Warning: No annotations found for processing!");
529       return;
530     }
531 
532     /*
533     // initialise the quoted text fragments
534     AnnotationSet sentQuotes = this.defaultAnnotations.get(QUOTED_TEXT_TYPE);
535 
536     //if none then return
537     if (null == sentQuotes) {
538       this.quotedText = new Quote[0];
539     }
540     else {
541       this.quotedText = new Quote[sentQuotes.size()];
542 
543       Object[] quotesArray = sentQuotes.toArray();
544       java.util.Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR);
545 
546       for (int i =0; i < quotesArray.length; i++) {
547         this.quotedText[i] = new Quote((Annotation)quotesArray[i],i);
548       }
549     }
550     */
551   }
552 
553 }
554