1   /*
2    *  NominalCoref.java
3    *
4    *  Copyright (c) 1998-2004, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  $Id: NominalCoref.java,v 1.13 2004/07/21 17:10:04 akshay Exp $
12   */
13  
14  package gate.creole.coref;
15  
16  import java.util.*;
17  
18  import gate.*;
19  import gate.creole.*;
20  import gate.util.*;
21  
22  public class NominalCoref extends AbstractCoreferencer
23      implements ProcessingResource, ANNIEConstants {
24  
  /**
   * Value "document"; presumably the name of the runtime parameter that
   * carries the document to process (not referenced in the visible code).
   */
  public static final String COREF_DOCUMENT_PARAMETER_NAME = "document";

  /**
   * Value "annotationSetName"; presumably the name of the runtime parameter
   * that selects the input annotation set (not referenced in the visible code).
   */
  public static final String COREF_ANN_SET_PARAMETER_NAME = "annotationSetName";

  /** Debug switch; not referenced anywhere in the visible code. */
  private static final boolean DEBUG = false;

  // Annotation types compared against Annotation.getType() in execute().
  // ORGANIZATION_NOUN_CATEGORY is different: it is the value required of the
  // LOOKUP_MAJOR_TYPE_FEATURE_NAME feature on Lookup annotations.
  private static final String PERSON_CATEGORY = "Person";
  private static final String JOBTITLE_CATEGORY = "JobTitle";
  private static final String ORGANIZATION_CATEGORY = "Organization";
  private static final String LOOKUP_CATEGORY = "Lookup";
  private static final String ORGANIZATION_NOUN_CATEGORY = "organization_noun";
  

  // per-instance state
  // (disabled) shared offset comparator:
  //private static AnnotationOffsetComparator ANNOTATION_OFFSET_COMPARATOR;
  /**
   * Name of the annotation set to read from; when null or empty,
   * preprocess() falls back to the document's default annotation set.
   */
  private String annotationSetName;
  /**
   * The annotation set selected by preprocess() for the current run (the
   * default set or the named one, despite the field name).
   */
  private AnnotationSet defaultAnnotations;
  /**
   * Maps an anaphor annotation to the antecedent annotation it was resolved
   * to. Cleared by preprocess(), filled by execute() and exposed through
   * getResolvedAnaphora().
   */
  private HashMap anaphor2antecedent;
49  
50      /*  static {
51      ANNOTATION_OFFSET_COMPARATOR = new AnnotationOffsetComparator();
52      }*/
53  
54    /** --- */
55    public NominalCoref() {
56      super("NOMINAL");
57      this.anaphor2antecedent = new HashMap();
58    }
59  
60    /** Initialise this resource, and return it. */
61    public Resource init() throws ResourceInstantiationException {
62      return super.init();
63    } // init()
64  
65    /**
66     * Reinitialises the processing resource. After calling this method the
67     * resource should be in the state it is after calling init.
68     * If the resource depends on external resources (such as rules files) then
69     * the resource will re-read those resources. If the data used to create
70     * the resource has changed since the resource has been created then the
71     * resource will change too after calling reInit().
72    */
73    public void reInit() throws ResourceInstantiationException {
74      this.anaphor2antecedent = new HashMap();
75      init();
76    } // reInit()
77  
78  
79    /** Set the document to run on. */
80    public void setDocument(Document newDocument) {
81  
82      //0. precondition
83  //    Assert.assertNotNull(newDocument);
84  
85      super.setDocument(newDocument);
86    }
87  
88    /** --- */
89    public void setAnnotationSetName(String annotationSetName) {
90      this.annotationSetName = annotationSetName;
91    }
92  
93    /** --- */
94    public String getAnnotationSetName() {
95      return annotationSetName;
96    }
97  
98    /**
99     * This method runs the coreferencer. It assumes that all the needed parameters
100    * are set. If they are not, an exception will be fired.
101    *
102    * The process goes like this:
103    * - Create a sorted list of Person and JobTitle annotations.
104    * - Loop through the annotations
105    *    If it is a Person, we add it to the top of a stack.
106    *    If it is a job title, we subject it to a series of tests. If it 
107    *      passes, we associate it with the Person annotation at the top
108    *      of the stack
109    */
110   public void execute() throws ExecutionException{
111 
112     HashMap anaphorToAntecedent = new HashMap();
113     Object[] nominalArray;
114 
115     //0. preconditions
116     if (null == this.document) {
117       throw new ExecutionException("[coreference] Document is not set!");
118     }
119 
120     //1. preprocess
121     preprocess();
122 
123     // Out.println("Total annotations: " + defaultAnnotations.size());
124 
125     // Get a sorted array of Tokens.
126     // The tests for job titles often require getting previous and subsequent
127     // tokens, so to save work, we create a single, sorted list of 
128     // tokens.
129     Object[] tokens = defaultAnnotations.get(TOKEN_ANNOTATION_TYPE).toArray();
130     java.util.Arrays.sort(tokens, new OffsetComparator());
131 
132     // The current token is the token at the start of the current annotation.
133     int currentToken = 0;
134 
135     // get Person entities
136     //FeatureMap personConstraint = new SimpleFeatureMapImpl();
137     //personConstraint.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME,
138     //                          PERSON_CATEGORY);
139     HashSet personConstraint = new HashSet();
140     personConstraint.add(PERSON_CATEGORY);
141     AnnotationSet people =
142       this.defaultAnnotations.get(personConstraint);
143 
144     // get all JobTitle entities
145     //FeatureMap constraintJobTitle = new SimpleFeatureMapImpl();
146     //constraintJobTitle.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, JOBTITLE_CATEGORY);
147     HashSet jobTitleConstraint = new HashSet();
148     jobTitleConstraint.add(JOBTITLE_CATEGORY);
149     
150     AnnotationSet jobTitles = 
151       this.defaultAnnotations.get(jobTitleConstraint);
152 
153     FeatureMap orgNounConstraint = new SimpleFeatureMapImpl();
154     orgNounConstraint.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME,
155                           ORGANIZATION_NOUN_CATEGORY);
156     AnnotationSet orgNouns =
157       this.defaultAnnotations.get(LOOKUP_CATEGORY, orgNounConstraint);
158 
159     HashSet orgConstraint = new HashSet();
160     orgConstraint.add(ORGANIZATION_CATEGORY);
161 
162     AnnotationSet organizations =
163       this.defaultAnnotations.get(orgConstraint);
164 
165     // combine them into a list of nominals
166     Set nominals = new HashSet();
167     if (people != null) {
168       nominals.addAll(people);
169     }
170     if (jobTitles != null) {
171       nominals.addAll(jobTitles);
172     }
173     if (orgNouns != null) {
174       nominals.addAll(orgNouns);
175     }
176     if (organizations != null) {
177       nominals.addAll(organizations);
178     }
179 
180     //  Out.println("total nominals: " + nominals.size());
181 
182     // sort them according to offset
183     nominalArray = nominals.toArray();
184     java.util.Arrays.sort(nominalArray, new OffsetComparator());
185     
186     ArrayList previousPeople = new ArrayList();
187     ArrayList previousOrgs = new ArrayList();
188     
189         
190     // process all nominals
191     for (int i=0; i<nominalArray.length; i++) {
192       Annotation nominal = (Annotation)nominalArray[i];
193       
194       // Find the current place in the tokens array
195       currentToken = advanceTokenPosition(nominal, currentToken, tokens);
196       
197       //Out.print("processing nominal [" + stringValue(nominal) + "] ");
198       
199       if (nominal.getType().equals(PERSON_CATEGORY)) {
200   // Add each Person entity to the beginning of the people list
201   // but don't add pronouns
202   Object[] personTokens = getSortedTokens(nominal);
203     
204   if (personTokens.length == 1) {
205     Annotation personToken = (Annotation) personTokens[0];
206     
207     String personCategory = (String) 
208       personToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME);
209     if (personCategory.equals("PP") ||
210         personCategory.equals("PRP") ||
211         personCategory.equals("PRP$") ||
212         personCategory.equals("PRPR$")) {
213         //Out.println("ignoring personal pronoun");
214         continue;
215     }
216   }
217   
218   previousPeople.add(0, nominal);
219   //Out.println("added person");
220       }
221       else if (nominal.getType().equals(JOBTITLE_CATEGORY)) {
222     
223   // Look into the tokens to get some info about POS.
224   Object[] jobTitleTokens = getSortedTokens(nominal);
225   
226   Annotation lastToken = (Annotation)
227     jobTitleTokens[jobTitleTokens.length - 1];
228   
229   // Don't associate if the job title is not a singular noun
230   String tokenCategory = (String) 
231     lastToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME);
232   // UNCOMMENT FOR SINGULAR PROPER NOUNS (The President, the Pope)
233   //if (! tokenCategory.equals("NN") &&
234   //! tokenCategory.equals("NNP")) {
235   if (! tokenCategory.equals("NN")) {
236       // Out.println("Not a singular noun");
237     continue;
238   }
239   
240   // Don't associate it if it's part of a Person (eg President Bush)
241   if (overlapsAnnotations(nominal, people)) {
242       //Out.println("overlapping annotation");
243     continue;
244   }
245 
246   Annotation previousToken;
247         String previousValue;
248 
249   // Don't associate it if it's proceeded by a generic marker
250         if (currentToken != 0) {
251           previousToken = (Annotation) tokens[currentToken - 1];
252           previousValue = (String) 
253       previousToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
254           if (previousValue.equalsIgnoreCase("a") ||
255               previousValue.equalsIgnoreCase("an") ||
256               previousValue.equalsIgnoreCase("other") ||
257               previousValue.equalsIgnoreCase("another")) {
258               //Out.println("indefinite");
259       continue;
260           }
261         }
262 
263   // nominals immediately followed by Person annotations:
264   // BAD:
265   //   Chairman Bill Gates               (title)
266   // GOOD:
267   //   secretary of state, Colin Powell  (inverted appositive)
268   //   the home secretary David Blunkett (same but no comma, 
269   //                                      possible in transcriptions)
270   // "the" is a good indicator for apposition
271   
272   // Luckily we have an array of all Person annotations in order...
273   if (i < nominalArray.length - 1) {
274     Annotation nextAnnotation = (Annotation) nominalArray[i+1];
275     if (nextAnnotation.getType().equals(PERSON_CATEGORY)) {
276       // is it preceded by a definite article?
277       previousToken = (Annotation) tokens[currentToken - 1];
278       previousValue = (String) 
279         previousToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
280       
281       // Get all tokens between this and the next person
282       int interveningTokens =
283         countInterveningTokens(nominal, nextAnnotation,
284              currentToken, tokens);
285       if (interveningTokens == 0 && 
286         ! previousValue.equalsIgnoreCase("the")) {
287       
288         // There is nothing between the job title and the person,
289         // like "Chairman Gates" -- do nothing.
290         //Out.println("immediately followed by Person");
291         continue;
292       }
293       else if (interveningTokens == 1) {
294         String tokenString =
295           (String) getFollowingToken(nominal,
296              currentToken, tokens)
297       .getFeatures().get(TOKEN_STRING_FEATURE_NAME);
298         //Out.print("STRING VALUE [" + tokenString + "] ");
299         if (! tokenString.equals(",") &&
300     ! tokenString.equals("-")) {
301     //Out.println("nominal and person separated by NOT [,-]");
302     continue;
303         }
304       }
305       
306       // Did we get through all that? Then we must have an 
307       // apposition.
308       
309       anaphor2antecedent.put(nominal, nextAnnotation);
310       //Out.println("associating with " +
311       //  stringValue(nextAnnotation));
312       continue;
313       
314     }
315   }
316   
317   // If we have no possible antecedents, create a new Person
318   // annotation.
319   if (previousPeople.size() == 0) {
320     FeatureMap personFeatures = new SimpleFeatureMapImpl();
321     personFeatures.put("ENTITY_MENTION_TYPE", "NOMINAL");
322     this.defaultAnnotations.add(nominal.getStartNode(),
323               nominal.getEndNode(),
324               PERSON_CATEGORY,
325               personFeatures);
326     //Out.println("creating as new Person");
327     continue;
328   }
329 
330   // Associate this entity with the most recent Person
331   int personIndex = 0;
332   
333   Annotation previousPerson =
334     (Annotation) previousPeople.get(personIndex);
335   
336   // Don't associate if the two nominals are not the same gender
337   String personGender = (String) 
338     previousPerson.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
339   String jobTitleGender = (String) 
340           nominal.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
341   if (personGender != null && jobTitleGender != null) {
342           if (! personGender.equals(jobTitleGender)) {
343             //Out.println("wrong gender: " + personGender + " " +
344             //            jobTitleGender);
345       continue;
346     }
347   }
348   
349   //Out.println("associating with " +
350   //  previousPerson.getFeatures()
351   //  .get(TOKEN_STRING_FEATURE_NAME));
352   
353   anaphor2antecedent.put(nominal, previousPerson);
354       }
355       else if (nominal.getType().equals(ORGANIZATION_CATEGORY)) {
356         // Add each organization entity to the beginning of
357   // the organization list
358   previousOrgs.add(0, nominal);
359   //Out.println("added organization");
360       }
361       else if (nominal.getType().equals(LOOKUP_CATEGORY)) {
362   // Don't associate it if we have no organizations
363   if (previousOrgs.size() == 0) {
364     //Out.println("no orgs");
365     continue;
366   }
367     
368   // Look into the tokens to get some info about POS.
369   Object[] orgNounTokens =
370     this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,
371               nominal.getStartNode().getOffset(),
372               nominal.getEndNode().getOffset()).toArray();
373   java.util.Arrays.sort(orgNounTokens, new OffsetComparator());
374   Annotation lastToken = (Annotation)
375     orgNounTokens[orgNounTokens.length - 1];
376   
377   // Don't associate if the org noun is not a singular noun
378   if (! lastToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME)
379       .equals("NN")) {
380       //Out.println("Not a singular noun");
381       continue;
382   }
383   
384   //Out.println("organization noun");
385   // Associate this entity with the most recent Person
386   anaphor2antecedent.put(nominal, previousOrgs.get(0));
387       }
388     }
389 
390     // This method does the dirty work of actually adding new annotations and
391     // coreferring.
392     generateCorefChains(anaphor2antecedent);
393   }
394 
395   /**
396    * This method specifies whether a given annotation overlaps any of a 
397    * set of annotations. For instance, JobTitles occasionally are
398    * part of Person annotations.
399    * 
400    */
401   private boolean overlapsAnnotations(Annotation a,
402                                       AnnotationSet annotations) {
403     Iterator iter = annotations.iterator();
404     while (iter.hasNext()) {
405       Annotation current = (Annotation) iter.next();
406       if (a.overlaps(current)) {
407         return true;
408       }
409     }
410       
411     return false;
412   }
413 
414   /** Use this method to keep the current token pointer at the right point
415    * in the token list */
416   private int advanceTokenPosition(Annotation target, int currentPosition,
417            Object[] tokens) {
418     long targetOffset = target.getStartNode().getOffset().longValue();
419     long currentOffset = ((Annotation) tokens[currentPosition])
420       .getStartNode().getOffset().longValue();
421     
422     if (targetOffset > currentOffset) {
423       while (targetOffset > currentOffset) {
424   currentPosition++;
425   currentOffset = ((Annotation) tokens[currentPosition])
426           .getStartNode().getOffset().longValue();
427       }
428     }
429     else if (targetOffset < currentOffset) {
430       while (targetOffset < currentOffset) {
431   currentPosition--;
432   currentOffset = ((Annotation) tokens[currentPosition])
433           .getStartNode().getOffset().longValue();
434       }
435     }
436     
437     return currentPosition;
438   }
439 
440   /** Return the number of tokens between the end of annotation 1 and the
441    * beginning of annotation 2. Will return 0 if they are not in order */
442   private int countInterveningTokens(Annotation first, Annotation second,
443              int currentPosition, Object[] tokens) {
444     int interveningTokens = 0;
445 
446     long startOffset = first.getEndNode().getOffset().longValue();
447     long endOffset = second.getStartNode().getOffset().longValue();
448     
449     long currentOffset = ((Annotation) tokens[currentPosition])
450       .getStartNode().getOffset().longValue();
451     
452     while (currentOffset < endOffset) {
453       if (currentOffset >= startOffset) {
454         interveningTokens++;
455       }
456       currentPosition++;
457       currentOffset = ((Annotation) tokens[currentPosition])
458   .getStartNode().getOffset().longValue();
459     }
460     return interveningTokens;
461   }
462 
463   /** Get the next token after an annotation */
464   private Annotation getFollowingToken(Annotation current, int currentPosition,
465                Object[] tokens) {
466     long endOffset = current.getEndNode().getOffset().longValue();
467     long currentOffset = ((Annotation) tokens[currentPosition])
468       .getStartNode().getOffset().longValue();
469     while (currentOffset < endOffset) {
470       currentPosition++;
471       currentOffset = ((Annotation) tokens[currentPosition])
472   .getStartNode().getOffset().longValue();
473     }
474     return (Annotation) tokens[currentPosition];
475   }
476   
477   /** Get the text of an annotation */
478   private String stringValue(Annotation ann) {
479     Object[] tokens = getSortedTokens(ann);
480   
481     StringBuffer output = new StringBuffer();
482     for (int i=0;i<tokens.length;i++) {
483       Annotation token = (Annotation) tokens[i];
484       output.append(token.getFeatures().get(TOKEN_STRING_FEATURE_NAME));
485       if (i < tokens.length - 1) {
486         output.append(" ");
487       }
488     }
489     return output.toString();
490   }
491     
492   /** Get a sorted array of the tokens that make up a given annotation. */
493   private Object[] getSortedTokens(Annotation a) {
494     Object[] annotationTokens =
495       this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,
496           a.getStartNode().getOffset(),
497           a.getEndNode().getOffset()).toArray();
498     java.util.Arrays.sort(annotationTokens, new OffsetComparator());
499     
500     return annotationTokens;
501   }
502   
503   /** --- */
504   public HashMap getResolvedAnaphora() {
505     return this.anaphor2antecedent;
506   }
507 
508   /** --- */
509   private void preprocess() throws ExecutionException {
510 
511     //0.5 cleanup
512     this.anaphor2antecedent.clear();
513 
514     //1.get all annotation in the input set
515     if ( this.annotationSetName == null || this.annotationSetName.equals("")) {
516       this.defaultAnnotations = this.document.getAnnotations();
517     }
518     else {
519       this.defaultAnnotations = this.document.getAnnotations(annotationSetName);
520     }
521 
522     //if none found, print warning and exit
523     if (this.defaultAnnotations == null || this.defaultAnnotations.isEmpty()) {
524       Err.prln("Coref Warning: No annotations found for processing!");
525       return;
526     }
527 
528     /*
529     // initialise the quoted text fragments
530     AnnotationSet sentQuotes = this.defaultAnnotations.get(QUOTED_TEXT_TYPE);
531 
532     //if none then return
533     if (null == sentQuotes) {
534       this.quotedText = new Quote[0];
535     }
536     else {
537       this.quotedText = new Quote[sentQuotes.size()];
538 
539       Object[] quotesArray = sentQuotes.toArray();
540       java.util.Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR);
541 
542       for (int i =0; i < quotesArray.length; i++) {
543         this.quotedText[i] = new Quote((Annotation)quotesArray[i],i);
544       }
545     }
546     */
547   }
548 
549 }
550