|
NominalCoref |
|
1 /* 2 * NominalCoref.java 3 * 4 * Copyright (c) 1998-2002, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * $Id: NominalCoref.java,v 1.11 2003/01/27 17:12:31 diana Exp $ 12 */ 13 14 package gate.creole.coref; 15 16 import java.util.*; 17 import java.net.*; 18 19 import junit.framework.*; 20 21 import gate.*; 22 import gate.creole.*; 23 import gate.util.*; 24 import gate.annotation.*; 25 26 public class NominalCoref extends AbstractCoreferencer 27 implements ProcessingResource, ANNIEConstants { 28 29 public static final String COREF_DOCUMENT_PARAMETER_NAME = "document"; 30 31 public static final String COREF_ANN_SET_PARAMETER_NAME = "annotationSetName"; 32 33 /** --- */ 34 private static final boolean DEBUG = false; 35 36 //annotation features 37 private static final String PERSON_CATEGORY = "Person"; 38 private static final String JOBTITLE_CATEGORY = "JobTitle"; 39 private static final String ORGANIZATION_CATEGORY = "Organization"; 40 private static final String LOOKUP_CATEGORY = "Lookup"; 41 private static final String ORGANIZATION_NOUN_CATEGORY = "organization_noun"; 42 43 44 //scope 45 /** --- */ 46 //private static AnnotationOffsetComparator ANNOTATION_OFFSET_COMPARATOR; 47 /** --- */ 48 private String annotationSetName; 49 /** --- */ 50 private AnnotationSet defaultAnnotations; 51 /** --- */ 52 private HashMap anaphor2antecedent; 53 54 /* static { 55 ANNOTATION_OFFSET_COMPARATOR = new AnnotationOffsetComparator(); 56 }*/ 57 58 /** --- */ 59 public NominalCoref() { 60 super("NOMINAL"); 61 this.anaphor2antecedent = new HashMap(); 62 } 63 64 /** Initialise this resource, and return it. */ 65 public Resource init() throws ResourceInstantiationException { 66 return super.init(); 67 } // init() 68 69 /** 70 * Reinitialises the processing resource. After calling this method the 71 * resource should be in the state it is after calling init. 72 * If the resource depends on external resources (such as rules files) then 73 * the resource will re-read those resources. If the data used to create 74 * the resource has changed since the resource has been created then the 75 * resource will change too after calling reInit(). 76 */ 77 public void reInit() throws ResourceInstantiationException { 78 this.anaphor2antecedent = new HashMap(); 79 init(); 80 } // reInit() 81 82 83 /** Set the document to run on. */ 84 public void setDocument(Document newDocument) { 85 86 //0. precondition 87 // Assert.assertNotNull(newDocument); 88 89 super.setDocument(newDocument); 90 } 91 92 /** --- */ 93 public void setAnnotationSetName(String annotationSetName) { 94 this.annotationSetName = annotationSetName; 95 } 96 97 /** --- */ 98 public String getAnnotationSetName() { 99 return annotationSetName; 100 } 101 102 /** 103 * This method runs the coreferencer. It assumes that all the needed parameters 104 * are set. If they are not, an exception will be fired. 105 * 106 * The process goes like this: 107 * - Create a sorted list of Person and JobTitle annotations. 108 * - Loop through the annotations 109 * If it is a Person, we add it to the top of a stack. 110 * If it is a job title, we subject it to a series of tests. If it 111 * passes, we associate it with the Person annotation at the top 112 * of the stack 113 */ 114 public void execute() throws ExecutionException{ 115 116 HashMap anaphorToAntecedent = new HashMap(); 117 Object[] nominalArray; 118 119 //0. preconditions 120 if (null == this.document) { 121 throw new ExecutionException("[coreference] Document is not set!"); 122 } 123 124 //1. preprocess 125 preprocess(); 126 127 // Out.println("Total annotations: " + defaultAnnotations.size()); 128 129 // Get a sorted array of Tokens. 130 // The tests for job titles often require getting previous and subsequent 131 // tokens, so to save work, we create a single, sorted list of 132 // tokens. 133 Object[] tokens = defaultAnnotations.get(TOKEN_ANNOTATION_TYPE).toArray(); 134 java.util.Arrays.sort(tokens, new OffsetComparator()); 135 136 // The current token is the token at the start of the current annotation. 137 int currentToken = 0; 138 139 // get Person entities 140 //FeatureMap personConstraint = new SimpleFeatureMapImpl(); 141 //personConstraint.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, 142 // PERSON_CATEGORY); 143 HashSet personConstraint = new HashSet(); 144 personConstraint.add(PERSON_CATEGORY); 145 AnnotationSet people = 146 this.defaultAnnotations.get(personConstraint); 147 148 // get all JobTitle entities 149 //FeatureMap constraintJobTitle = new SimpleFeatureMapImpl(); 150 //constraintJobTitle.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, JOBTITLE_CATEGORY); 151 HashSet jobTitleConstraint = new HashSet(); 152 jobTitleConstraint.add(JOBTITLE_CATEGORY); 153 154 AnnotationSet jobTitles = 155 this.defaultAnnotations.get(jobTitleConstraint); 156 157 FeatureMap orgNounConstraint = new SimpleFeatureMapImpl(); 158 orgNounConstraint.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, 159 ORGANIZATION_NOUN_CATEGORY); 160 AnnotationSet orgNouns = 161 this.defaultAnnotations.get(LOOKUP_CATEGORY, orgNounConstraint); 162 163 HashSet orgConstraint = new HashSet(); 164 orgConstraint.add(ORGANIZATION_CATEGORY); 165 166 AnnotationSet organizations = 167 this.defaultAnnotations.get(orgConstraint); 168 169 // combine them into a list of nominals 170 Set nominals = new HashSet(); 171 if (people != null) { 172 nominals.addAll(people); 173 } 174 if (jobTitles != null) { 175 nominals.addAll(jobTitles); 176 } 177 if (orgNouns != null) { 178 nominals.addAll(orgNouns); 179 } 180 if (organizations != null) { 181 nominals.addAll(organizations); 182 } 183 184 // Out.println("total nominals: " + nominals.size()); 185 186 // sort them according to offset 187 nominalArray = nominals.toArray(); 188 java.util.Arrays.sort(nominalArray, new OffsetComparator()); 189 190 ArrayList previousPeople = new ArrayList(); 191 ArrayList previousOrgs = new ArrayList(); 192 193 194 // process all nominals 195 for (int i=0; i<nominalArray.length; i++) { 196 Annotation nominal = (Annotation)nominalArray[i]; 197 198 // Find the current place in the tokens array 199 currentToken = advanceTokenPosition(nominal, currentToken, tokens); 200 201 //Out.print("processing nominal [" + stringValue(nominal) + "] "); 202 203 if (nominal.getType().equals(PERSON_CATEGORY)) { 204 // Add each Person entity to the beginning of the people list 205 // but don't add pronouns 206 Object[] personTokens = getSortedTokens(nominal); 207 208 if (personTokens.length == 1) { 209 Annotation personToken = (Annotation) personTokens[0]; 210 211 String personCategory = (String) 212 personToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME); 213 if (personCategory.equals("PP") || 214 personCategory.equals("PRP") || 215 personCategory.equals("PRP$") || 216 personCategory.equals("PRPR$")) { 217 //Out.println("ignoring personal pronoun"); 218 continue; 219 } 220 } 221 222 previousPeople.add(0, nominal); 223 //Out.println("added person"); 224 } 225 else if (nominal.getType().equals(JOBTITLE_CATEGORY)) { 226 227 // Look into the tokens to get some info about POS. 228 Object[] jobTitleTokens = getSortedTokens(nominal); 229 230 Annotation lastToken = (Annotation) 231 jobTitleTokens[jobTitleTokens.length - 1]; 232 233 // Don't associate if the job title is not a singular noun 234 String tokenCategory = (String) 235 lastToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME); 236 // UNCOMMENT FOR SINGULAR PROPER NOUNS (The President, the Pope) 237 //if (! tokenCategory.equals("NN") && 238 //! tokenCategory.equals("NNP")) { 239 if (! tokenCategory.equals("NN")) { 240 // Out.println("Not a singular noun"); 241 continue; 242 } 243 244 // Don't associate it if it's part of a Person (eg President Bush) 245 if (overlapsAnnotations(nominal, people)) { 246 //Out.println("overlapping annotation"); 247 continue; 248 } 249 250 Annotation previousToken; 251 String previousValue; 252 253 // Don't associate it if it's proceeded by a generic marker 254 if (currentToken != 0) { 255 previousToken = (Annotation) tokens[currentToken - 1]; 256 previousValue = (String) 257 previousToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 258 if (previousValue.equalsIgnoreCase("a") || 259 previousValue.equalsIgnoreCase("an") || 260 previousValue.equalsIgnoreCase("other") || 261 previousValue.equalsIgnoreCase("another")) { 262 //Out.println("indefinite"); 263 continue; 264 } 265 } 266 267 // nominals immediately followed by Person annotations: 268 // BAD: 269 // Chairman Bill Gates (title) 270 // GOOD: 271 // secretary of state, Colin Powell (inverted appositive) 272 // the home secretary David Blunkett (same but no comma, 273 // possible in transcriptions) 274 // "the" is a good indicator for apposition 275 276 // Luckily we have an array of all Person annotations in order... 277 if (i < nominalArray.length - 1) { 278 Annotation nextAnnotation = (Annotation) nominalArray[i+1]; 279 if (nextAnnotation.getType().equals(PERSON_CATEGORY)) { 280 // is it preceded by a definite article? 281 previousToken = (Annotation) tokens[currentToken - 1]; 282 previousValue = (String) 283 previousToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 284 285 // Get all tokens between this and the next person 286 int interveningTokens = 287 countInterveningTokens(nominal, nextAnnotation, 288 currentToken, tokens); 289 if (interveningTokens == 0 && 290 ! previousValue.equalsIgnoreCase("the")) { 291 292 // There is nothing between the job title and the person, 293 // like "Chairman Gates" -- do nothing. 294 //Out.println("immediately followed by Person"); 295 continue; 296 } 297 else if (interveningTokens == 1) { 298 String tokenString = 299 (String) getFollowingToken(nominal, 300 currentToken, tokens) 301 .getFeatures().get(TOKEN_STRING_FEATURE_NAME); 302 //Out.print("STRING VALUE [" + tokenString + "] "); 303 if (! tokenString.equals(",") && 304 ! tokenString.equals("-")) { 305 //Out.println("nominal and person separated by NOT [,-]"); 306 continue; 307 } 308 } 309 310 // Did we get through all that? Then we must have an 311 // apposition. 312 313 anaphor2antecedent.put(nominal, nextAnnotation); 314 //Out.println("associating with " + 315 // stringValue(nextAnnotation)); 316 continue; 317 318 } 319 } 320 321 // If we have no possible antecedents, create a new Person 322 // annotation. 323 if (previousPeople.size() == 0) { 324 FeatureMap personFeatures = new SimpleFeatureMapImpl(); 325 personFeatures.put("ENTITY_MENTION_TYPE", "NOMINAL"); 326 this.defaultAnnotations.add(nominal.getStartNode(), 327 nominal.getEndNode(), 328 PERSON_CATEGORY, 329 personFeatures); 330 //Out.println("creating as new Person"); 331 continue; 332 } 333 334 // Associate this entity with the most recent Person 335 int personIndex = 0; 336 337 Annotation previousPerson = 338 (Annotation) previousPeople.get(personIndex); 339 340 // Don't associate if the two nominals are not the same gender 341 String personGender = (String) 342 previousPerson.getFeatures().get(PERSON_GENDER_FEATURE_NAME); 343 String jobTitleGender = (String) 344 nominal.getFeatures().get(PERSON_GENDER_FEATURE_NAME); 345 if (personGender != null && jobTitleGender != null) { 346 if (! personGender.equals(jobTitleGender)) { 347 //Out.println("wrong gender: " + personGender + " " + 348 // jobTitleGender); 349 continue; 350 } 351 } 352 353 //Out.println("associating with " + 354 // previousPerson.getFeatures() 355 // .get(TOKEN_STRING_FEATURE_NAME)); 356 357 anaphor2antecedent.put(nominal, previousPerson); 358 } 359 else if (nominal.getType().equals(ORGANIZATION_CATEGORY)) { 360 // Add each organization entity to the beginning of 361 // the organization list 362 previousOrgs.add(0, nominal); 363 //Out.println("added organization"); 364 } 365 else if (nominal.getType().equals(LOOKUP_CATEGORY)) { 366 // Don't associate it if we have no organizations 367 if (previousOrgs.size() == 0) { 368 //Out.println("no orgs"); 369 continue; 370 } 371 372 // Look into the tokens to get some info about POS. 373 Object[] orgNounTokens = 374 this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE, 375 nominal.getStartNode().getOffset(), 376 nominal.getEndNode().getOffset()).toArray(); 377 java.util.Arrays.sort(orgNounTokens, new OffsetComparator()); 378 Annotation lastToken = (Annotation) 379 orgNounTokens[orgNounTokens.length - 1]; 380 381 // Don't associate if the org noun is not a singular noun 382 if (! lastToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME) 383 .equals("NN")) { 384 //Out.println("Not a singular noun"); 385 continue; 386 } 387 388 //Out.println("organization noun"); 389 // Associate this entity with the most recent Person 390 anaphor2antecedent.put(nominal, previousOrgs.get(0)); 391 } 392 } 393 394 // This method does the dirty work of actually adding new annotations and 395 // coreferring. 396 generateCorefChains(anaphor2antecedent); 397 } 398 399 /** 400 * This method specifies whether a given annotation overlaps any of a 401 * set of annotations. For instance, JobTitles occasionally are 402 * part of Person annotations. 403 * 404 */ 405 private boolean overlapsAnnotations(Annotation a, 406 AnnotationSet annotations) { 407 Iterator iter = annotations.iterator(); 408 while (iter.hasNext()) { 409 Annotation current = (Annotation) iter.next(); 410 if (a.overlaps(current)) { 411 return true; 412 } 413 } 414 415 return false; 416 } 417 418 /** Use this method to keep the current token pointer at the right point 419 * in the token list */ 420 private int advanceTokenPosition(Annotation target, int currentPosition, 421 Object[] tokens) { 422 long targetOffset = target.getStartNode().getOffset().longValue(); 423 long currentOffset = ((Annotation) tokens[currentPosition]) 424 .getStartNode().getOffset().longValue(); 425 426 if (targetOffset > currentOffset) { 427 while (targetOffset > currentOffset) { 428 currentPosition++; 429 currentOffset = ((Annotation) tokens[currentPosition]) 430 .getStartNode().getOffset().longValue(); 431 } 432 } 433 else if (targetOffset < currentOffset) { 434 while (targetOffset < currentOffset) { 435 currentPosition--; 436 currentOffset = ((Annotation) tokens[currentPosition]) 437 .getStartNode().getOffset().longValue(); 438 } 439 } 440 441 return currentPosition; 442 } 443 444 /** Return the number of tokens between the end of annotation 1 and the 445 * beginning of annotation 2. Will return 0 if they are not in order */ 446 private int countInterveningTokens(Annotation first, Annotation second, 447 int currentPosition, Object[] tokens) { 448 int interveningTokens = 0; 449 450 long startOffset = first.getEndNode().getOffset().longValue(); 451 long endOffset = second.getStartNode().getOffset().longValue(); 452 453 long currentOffset = ((Annotation) tokens[currentPosition]) 454 .getStartNode().getOffset().longValue(); 455 456 while (currentOffset < endOffset) { 457 if (currentOffset >= startOffset) { 458 interveningTokens++; 459 } 460 currentPosition++; 461 currentOffset = ((Annotation) tokens[currentPosition]) 462 .getStartNode().getOffset().longValue(); 463 } 464 return interveningTokens; 465 } 466 467 /** Get the next token after an annotation */ 468 private Annotation getFollowingToken(Annotation current, int currentPosition, 469 Object[] tokens) { 470 long endOffset = current.getEndNode().getOffset().longValue(); 471 long currentOffset = ((Annotation) tokens[currentPosition]) 472 .getStartNode().getOffset().longValue(); 473 while (currentOffset < endOffset) { 474 currentPosition++; 475 currentOffset = ((Annotation) tokens[currentPosition]) 476 .getStartNode().getOffset().longValue(); 477 } 478 return (Annotation) tokens[currentPosition]; 479 } 480 481 /** Get the text of an annotation */ 482 private String stringValue(Annotation ann) { 483 Object[] tokens = getSortedTokens(ann); 484 485 StringBuffer output = new StringBuffer(); 486 for (int i=0;i<tokens.length;i++) { 487 Annotation token = (Annotation) tokens[i]; 488 output.append(token.getFeatures().get(TOKEN_STRING_FEATURE_NAME)); 489 if (i < tokens.length - 1) { 490 output.append(" "); 491 } 492 } 493 return output.toString(); 494 } 495 496 /** Get a sorted array of the tokens that make up a given annotation. */ 497 private Object[] getSortedTokens(Annotation a) { 498 Object[] annotationTokens = 499 this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE, 500 a.getStartNode().getOffset(), 501 a.getEndNode().getOffset()).toArray(); 502 java.util.Arrays.sort(annotationTokens, new OffsetComparator()); 503 504 return annotationTokens; 505 } 506 507 /** --- */ 508 public HashMap getResolvedAnaphora() { 509 return this.anaphor2antecedent; 510 } 511 512 /** --- */ 513 private void preprocess() throws ExecutionException { 514 515 //0.5 cleanup 516 this.anaphor2antecedent.clear(); 517 518 //1.get all annotation in the input set 519 if ( this.annotationSetName == null || this.annotationSetName.equals("")) { 520 this.defaultAnnotations = this.document.getAnnotations(); 521 } 522 else { 523 this.defaultAnnotations = this.document.getAnnotations(annotationSetName); 524 } 525 526 //if none found, print warning and exit 527 if (this.defaultAnnotations == null || this.defaultAnnotations.isEmpty()) { 528 Err.prln("Coref Warning: No annotations found for processing!"); 529 return; 530 } 531 532 /* 533 // initialise the quoted text fragments 534 AnnotationSet sentQuotes = this.defaultAnnotations.get(QUOTED_TEXT_TYPE); 535 536 //if none then return 537 if (null == sentQuotes) { 538 this.quotedText = new Quote[0]; 539 } 540 else { 541 this.quotedText = new Quote[sentQuotes.size()]; 542 543 Object[] quotesArray = sentQuotes.toArray(); 544 java.util.Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR); 545 546 for (int i =0; i < quotesArray.length; i++) { 547 this.quotedText[i] = new Quote((Annotation)quotesArray[i],i); 548 } 549 } 550 */ 551 } 552 553 } 554
|
NominalCoref |
|