|
CorpusBenchmarkTool |
|
1 /* 2 * CorpusBenchmarkTool.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Kalina Bontcheva, 24/Oct/2001 12 * 13 * $Id: CorpusBenchmarkTool.java,v 1.27 2002/07/02 17:50:44 kalina Exp $ 14 */ 15 16 package gate.util; 17 18 import java.util.*; 19 import java.io.*; 20 21 import gate.*; 22 import gate.creole.*; 23 import gate.util.*; 24 import gate.persist.*; 25 import gate.creole.tokeniser.*; 26 import gate.creole.gazetteer.*; 27 import gate.creole.splitter.*; 28 import gate.creole.orthomatcher.*; 29 import gate.creole.annotransfer.*; 30 import gate.annotation.*; 31 32 public class CorpusBenchmarkTool { 33 private static final String MARKED_DIR_NAME = "marked"; 34 private static final String CLEAN_DIR_NAME = "clean"; 35 private static final String CVS_DIR_NAME = "Cvs"; 36 private static final String PROCESSED_DIR_NAME = "processed"; 37 38 private static final boolean DEBUG = true; 39 40 public CorpusBenchmarkTool() {} 41 42 public void initPRs() { 43 try { 44 if (applicationFile == null) 45 Out.prln("Application not set!"); 46 Out.prln("App file is: " + applicationFile.getAbsolutePath()); 47 application = (Controller) gate.util.persistence.PersistenceManager 48 .loadObjectFromFile(applicationFile); 49 } catch (Exception ex) { 50 throw new GateRuntimeException("Corpus Benchmark Tool:"+ex.getMessage()); 51 } 52 }//initPRs 53 54 public void unloadPRs() { 55 //we have nothing to unload if no PRs are loaded 56 if (isMarkedStored) 57 return; 58 59 } 60 61 public void execute() { 62 execute(startDir); 63 if (application != null) { 64 Iterator iter = new ArrayList(application.getPRs()).iterator(); 65 while (iter.hasNext()) 66 Factory.deleteResource((Resource) iter.next()); 67 Factory.deleteResource(application); 68 } 69 } 70 71 public void init() { 72 //first read the corpus_tool.properties file 73 File propFile = new File("corpus_tool.properties"); 74 Out.prln(propFile.getAbsolutePath()); 75 if (propFile.exists()) { 76 try { 77 InputStream inputStream = new FileInputStream(propFile); 78 this.configs.load(inputStream); 79 String thresholdString = this.configs.getProperty("threshold"); 80 if (thresholdString != null && !thresholdString.equals("")) { 81 this.threshold = (new Double(thresholdString)).doubleValue(); 82 Out.prln("New threshold is: " + this.threshold + "<P>\n"); 83 } 84 String setName = this.configs.getProperty("annotSetName"); 85 if (setName != null && !setName.equals("")) { 86 Out.prln("Annotation set in marked docs is: " + setName + " <P>\n"); 87 this.annotSetName = setName; 88 } 89 setName = this.configs.getProperty("outputSetName"); 90 if (setName != null && !setName.equals("")) { 91 Out.prln("Annotation set in processed docs is: " + setName + " <P>\n"); 92 this.outputSetName = setName; 93 } 94 String types = this.configs.getProperty("annotTypes"); 95 if (types != null && !types.equals("")) { 96 Out.prln("Using annotation types from the properties file. <P>\n"); 97 StringTokenizer strTok = new StringTokenizer(types, ";"); 98 annotTypes = new ArrayList(); 99 while (strTok.hasMoreTokens()) 100 annotTypes.add(strTok.nextToken()); 101 } else { 102 annotTypes = new ArrayList(); 103 annotTypes.add("Organization"); 104 annotTypes.add("Person"); 105 annotTypes.add("Date"); 106 annotTypes.add("Location"); 107 annotTypes.add("Address"); 108 annotTypes.add("Money"); 109 annotTypes.add("Percent"); 110 annotTypes.add("GPE"); 111 annotTypes.add("Facility"); 112 } 113 114 } catch (IOException ex) { 115 //just ignore the file and go on with the defaults 116 this.configs = new Properties(); 117 } 118 } else 119 this.configs = new Properties(); 120 121 122 //we only initialise the PRs if they are going to be used 123 //for processing unprocessed documents 124 if (!this.isMarkedStored) 125 initPRs(); 126 127 } 128 129 public void execute(File dir) { 130 if (dir == null) 131 return; 132 //first set the current directory to be the given one 133 currDir = dir; 134 Out.prln("Processing directory: " + currDir + "<P>"); 135 136 File processedDir = null; 137 File cleanDir = null; 138 File markedDir = null; 139 140 ArrayList subDirs = new ArrayList(); 141 File[] dirArray = currDir.listFiles(); 142 for (int i = 0; i < dirArray.length; i++) { 143 if (dirArray[i].isFile() || dirArray[i].getName().equals(CVS_DIR_NAME)) 144 continue; 145 if (dirArray[i].getName().equals(CLEAN_DIR_NAME)) 146 cleanDir = dirArray[i]; 147 else if (dirArray[i].getName().equals(MARKED_DIR_NAME)) 148 markedDir = dirArray[i]; 149 else if (dirArray[i].getName().equals(PROCESSED_DIR_NAME)) 150 processedDir = dirArray[i]; 151 else 152 subDirs.add(dirArray[i]); 153 } 154 155 if (this.isGenerateMode) 156 generateCorpus(cleanDir, processedDir); 157 else 158 evaluateCorpus(cleanDir, processedDir, markedDir); 159 160 //if no more subdirs left, return 161 if (subDirs.isEmpty()) 162 return; 163 164 //there are more subdirectories to traverse, so iterate through 165 for (int j = 0; j < subDirs.size(); j++) 166 execute((File) subDirs.get(j)); 167 168 }//execute(dir) 169 170 171 public static void main(String[] args) throws GateException { 172 Out.prln("<HTML>"); 173 Out.prln("<HEAD>"); 174 Out.prln("<TITLE> Corpus benchmark tool: ran with args " + 175 args.toString() + " on " + 176 new Date() + "</TITLE> </HEAD>"); 177 Out.prln("<BODY>"); 178 Out.prln("Please wait while GATE tools are initialised. <P>"); 179 // initialise GATE 180 Gate.init(); 181 182 CorpusBenchmarkTool corpusTool = new CorpusBenchmarkTool(); 183 184 List inputFiles = null; 185 if(args.length < 1) throw new GateException(usage); 186 int i = 0; 187 while (i < args.length && args[i].startsWith("-")) { 188 if(args[i].equals("-generate")) { 189 Out.prln("Generating the corpus... <P>"); 190 corpusTool.setGenerateMode(true); 191 } else if (args[i].equals("-marked_clean")) { 192 Out.prln("Evaluating current grammars against human-annotated...<P>"); 193 corpusTool.setMarkedClean(true); 194 } else if (args[i].equals("-marked_stored")) { 195 Out.prln("Evaluating stored documents against human-annotated...<P>"); 196 corpusTool.setMarkedStored(true); 197 } else if (args[i].equals("-marked_ds")) { 198 Out.prln("Looking for marked docs in a datastore...<P>"); 199 corpusTool.setMarkedDS(true); 200 } else if (args[i].equals("-verbose")) { 201 Out.prln("Running in verbose mode. Will generate annotation " + 202 "information when precision/recall are lower than " + 203 corpusTool.getThreshold() +"<P>"); 204 corpusTool.setVerboseMode(true); 205 } 206 i++; //just ignore the option, which we do not recognise 207 }//while 208 209 String dirName = args[i]; 210 File dir = new File(dirName); 211 if (!dir.isDirectory()) 212 throw new GateException(usage); 213 214 //get the last argument which is the application 215 i++; 216 String appName = args[i]; 217 File appFile = new File(appName); 218 if (!appFile.isFile()) 219 throw new GateException(usage); 220 else 221 corpusTool.setApplicationFile(appFile); 222 223 corpusTool.init(); 224 225 Out.prln("Measuring annotaitions of types: " + corpusTool.annotTypes + "<P>"); 226 227 corpusTool.setStartDirectory(dir); 228 corpusTool.execute(); 229 230 //if we're not generating the corpus, then print the precision and recall 231 //statistics for the processed corpus 232 if (! corpusTool.getGenerateMode()) 233 corpusTool.printStatistics(); 234 235 Out.prln("Finished! <P>"); 236 Out.prln("</BODY>"); 237 Out.prln("</HTML>"); 238 239 System.exit(0); 240 241 }//main 242 243 public void setGenerateMode(boolean mode) { 244 isGenerateMode = mode; 245 }//setGenerateMode 246 247 public boolean getGenerateMode() { 248 return isGenerateMode; 249 }//getGenerateMode 250 251 public boolean getVerboseMode() { 252 return isVerboseMode; 253 }//getVerboseMode 254 255 public void setVerboseMode(boolean mode) { 256 isVerboseMode = mode; 257 }//setVerboseMode 258 259 public void setMarkedStored(boolean mode) { 260 isMarkedStored = mode; 261 }// 262 263 public boolean getMarkedStored() { 264 return isMarkedStored; 265 }// 266 267 public void setMarkedClean(boolean mode) { 268 isMarkedClean = mode; 269 }// 270 271 public boolean getMarkedClean() { 272 return isMarkedClean; 273 }// 274 275 public void setMarkedDS(boolean mode) { 276 isMarkedDS = mode; 277 }// 278 279 public boolean getMarkedDS() { 280 return isMarkedDS; 281 }// 282 283 public void setApplicationFile(File newAppFile) { 284 applicationFile = newAppFile; 285 } 286 287 /** 288 * Returns the average precision over the entire set of processed documents. 289 * <P> 290 * If the tool has been evaluating the original documents against the 291 * previously-stored automatically annotated ones, then the precision 292 * will be the average precision on those two sets. <P> 293 * If the tool was run in -marked mode, i.e., was evaluating the stored 294 * automatically processed ones against the human-annotated ones, then 295 * the precision will be the average precision on those two sets of documents. 296 */ 297 public double getPrecisionAverage() { 298 return precisionSum/docNumber; 299 } 300 301 /** 302 * Returns the average recall over the entire set of processed documents. 303 * <P> 304 * If the tool has been evaluating the original documents against the 305 * previously-stored automatically annotated ones, then the recall 306 * will be the average recall on those two sets. <P> 307 * If the tool was run in -marked mode, i.e., was evaluating the stored 308 * automatically processed ones against the human-annotated ones, then 309 * the recall will be the average recall on those two sets of documents. 310 */ 311 public double getRecallAverage() { 312 return recallSum/docNumber; 313 } 314 315 public boolean isGenerateMode() { 316 return isGenerateMode == true; 317 }//isGenerateMode 318 319 public double getThreshold() { 320 return threshold; 321 } 322 323 public void setThreshold(double newValue) { 324 threshold = newValue; 325 } 326 327 public File getStartDirectory() { 328 return startDir; 329 }//getStartDirectory 330 331 public void setStartDirectory(File dir) { 332 startDir = dir; 333 }//setStartDirectory 334 335 protected void generateCorpus(File fileDir, File outputDir) { 336 //1. check if we have input files 337 if (fileDir == null) 338 return; 339 //2. create the output directory or clean it up if needed 340 File outDir = outputDir; 341 if (outputDir == null) { 342 outDir = new File(currDir, PROCESSED_DIR_NAME); 343 } else { 344 // get rid of the directory, coz datastore wants it clean 345 if (!Files.rmdir(outDir)) 346 Out.prln("cannot delete old output directory: " + outDir); 347 } 348 outDir.mkdir(); 349 350 //create the datastore and process each document 351 try { 352 SerialDataStore sds = new SerialDataStore(outDir.toURL().toString()); 353 sds.create(); 354 sds.open(); 355 356 File[] files = fileDir.listFiles(); 357 for (int i=0; i < files.length; i++) { 358 if (!files[i].isFile()) 359 continue; 360 // create a document 361 Out.prln("Processing and storing document: " + files[i].toURL() +"<P>"); 362 363 FeatureMap params = Factory.newFeatureMap(); 364 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, files[i].toURL()); 365 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, ""); 366 367 // create the document 368 Document doc = (Document) Factory.createResource( 369 "gate.corpora.DocumentImpl", params 370 ); 371 372 doc.setName(files[i].getName()); 373 if (doc == null) 374 continue; 375 processDocument(doc); 376 LanguageResource lr = sds.adopt(doc, null); 377 sds.sync(lr); 378 Factory.deleteResource(doc); 379 Factory.deleteResource(lr); 380 }//for 381 sds.close(); 382 } catch (java.net.MalformedURLException ex) { 383 throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage()); 384 } catch (PersistenceException ex1) { 385 throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage()); 386 } catch (ResourceInstantiationException ex2) { 387 throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage()); 388 } catch (gate.security.SecurityException ex3) { 389 throw new GateRuntimeException("CorpusBenchmark: " + ex3.getMessage()); 390 } 391 392 }//generateCorpus 393 394 protected void evaluateCorpus(File fileDir, 395 File processedDir, File markedDir) { 396 //1. check if we have input files and the processed Dir 397 if (fileDir == null || !fileDir.exists()) 398 return; 399 if (processedDir == null || !processedDir.exists()) 400 //if the user wants evaluation of marked and stored that's not possible 401 if (isMarkedStored) { 402 Out.prln("Cannot evaluate because no processed documents exist."); 403 return; 404 } 405 else 406 isMarkedClean = true; 407 408 //looked for marked texts only if the directory exists 409 boolean processMarked = markedDir != null && markedDir.exists(); 410 if (!processMarked && (isMarkedStored || isMarkedClean)) { 411 Out.prln("Cannot evaluate because no human-annotated documents exist."); 412 return; 413 } 414 415 if (isMarkedStored) { 416 evaluateMarkedStored(markedDir, processedDir); 417 return; 418 } else if (isMarkedClean) { 419 evaluateMarkedClean(markedDir, fileDir); 420 return; 421 } 422 423 Document persDoc = null; 424 Document cleanDoc = null; 425 Document markedDoc = null; 426 427 //open the datastore and process each document 428 try { 429 //open the data store 430 DataStore sds = Factory.openDataStore 431 ("gate.persist.SerialDataStore", 432 processedDir.toURL().toExternalForm()); 433 434 List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl"); 435 for (int i=0; i < lrIDs.size(); i++) { 436 String docID = (String) lrIDs.get(i); 437 438 //read the stored document 439 FeatureMap features = Factory.newFeatureMap(); 440 features.put(DataStore.DATASTORE_FEATURE_NAME, sds); 441 features.put(DataStore.LR_ID_FEATURE_NAME, docID); 442 persDoc = (Document) Factory.createResource( 443 "gate.corpora.DocumentImpl", 444 features); 445 446 Out.prln("<H2>" + persDoc.getName() + "</H2>"); 447 448 File cleanDocFile = new File(fileDir, persDoc.getName()); 449 //try reading the original document from clean 450 if (! cleanDocFile.exists()) { 451 Out.prln("Warning: Cannot find original document " + 452 persDoc.getName() + " in " + fileDir); 453 } else { 454 FeatureMap params = Factory.newFeatureMap(); 455 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocFile.toURL()); 456 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, ""); 457 458 // create the document 459 cleanDoc = (Document) Factory.createResource( 460 "gate.corpora.DocumentImpl", params); 461 cleanDoc.setName(persDoc.getName()); 462 } 463 464 //try finding the marked document 465 StringBuffer docName = new StringBuffer(persDoc.getName()); 466 if (! isMarkedDS) { 467 docName.replace( 468 persDoc.getName().lastIndexOf("."), 469 docName.length(), 470 ".xml"); 471 File markedDocFile = new File(markedDir, docName.toString()); 472 if (! processMarked || ! markedDocFile.exists()) { 473 Out.prln("Warning: Cannot find human-annotated document " + 474 markedDocFile + " in " + markedDir); 475 } else { 476 FeatureMap params = Factory.newFeatureMap(); 477 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL()); 478 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, ""); 479 480 // create the document 481 markedDoc = (Document) Factory.createResource( 482 "gate.corpora.DocumentImpl", params); 483 markedDoc.setName(persDoc.getName()); 484 } 485 } else { 486 //open marked from a DS 487 //open the data store 488 DataStore sds1 = Factory.openDataStore 489 ("gate.persist.SerialDataStore", 490 markedDir.toURL().toExternalForm()); 491 492 List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl"); 493 boolean found = false; 494 int k = 0; 495 //search for the marked doc with the same name 496 while (k < lrIDs1.size() && !found) { 497 String docID1 = (String) lrIDs1.get(k); 498 499 //read the stored document 500 FeatureMap features1 = Factory.newFeatureMap(); 501 features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1); 502 features1.put(DataStore.LR_ID_FEATURE_NAME, docID1); 503 Document tempDoc = (Document) Factory.createResource( 504 "gate.corpora.DocumentImpl", 505 features1); 506 //check whether this is our doc 507 if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")). 508 endsWith(persDoc.getName())) { 509 found = true; 510 markedDoc = tempDoc; 511 } else k++; 512 } 513 } 514 515 evaluateDocuments(persDoc, cleanDoc, markedDoc); 516 if (persDoc != null) 517 Factory.deleteResource(persDoc); 518 if (cleanDoc != null) 519 Factory.deleteResource(cleanDoc); 520 if (markedDoc != null) 521 Factory.deleteResource(markedDoc); 522 523 }//for loop through saved docs 524 sds.close(); 525 } catch (java.net.MalformedURLException ex) { 526 throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage()); 527 } catch (PersistenceException ex1) { 528 throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage()); 529 } catch (ResourceInstantiationException ex2) { 530 throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage()); 531 } 532 533 }//evaluateCorpus 534 535 protected void evaluateMarkedStored(File markedDir, File storedDir) { 536 Document persDoc = null; 537 Document cleanDoc = null; 538 Document markedDoc = null; 539 540 //open the datastore and process each document 541 try { 542 //open the data store 543 DataStore sds = Factory.openDataStore 544 ("gate.persist.SerialDataStore", 545 storedDir.toURL().toExternalForm()); 546 547 List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl"); 548 for (int i=0; i < lrIDs.size(); i++) { 549 String docID = (String) lrIDs.get(i); 550 551 //read the stored document 552 FeatureMap features = Factory.newFeatureMap(); 553 features.put(DataStore.DATASTORE_FEATURE_NAME, sds); 554 features.put(DataStore.LR_ID_FEATURE_NAME, docID); 555 persDoc = (Document) Factory.createResource( 556 "gate.corpora.DocumentImpl", 557 features); 558 559 Out.prln("<H2>" + persDoc.getName() + "</H2>"); 560 561 if (! this.isMarkedDS) { //try finding the marked document as file 562 StringBuffer docName = new StringBuffer(persDoc.getName()); 563 docName.replace( 564 persDoc.getName().lastIndexOf("."), 565 docName.length(), 566 ".xml"); 567 File markedDocFile = new File(markedDir, docName.toString()); 568 if (! markedDocFile.exists()) { 569 Out.prln("Warning: Cannot find human-annotated document " + 570 markedDocFile + " in " + markedDir); 571 } else { 572 FeatureMap params = Factory.newFeatureMap(); 573 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL()); 574 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, ""); 575 576 // create the document 577 markedDoc = (Document) Factory.createResource( 578 "gate.corpora.DocumentImpl", params); 579 markedDoc.setName(persDoc.getName()); 580 }//find marked as file 581 } else { 582 try { 583 //open marked from a DS 584 //open the data store 585 DataStore sds1 = Factory.openDataStore 586 ("gate.persist.SerialDataStore", 587 markedDir.toURL().toExternalForm()); 588 589 List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl"); 590 boolean found = false; 591 int k = 0; 592 //search for the marked doc with the same name 593 while (k < lrIDs1.size() && !found) { 594 String docID1 = (String) lrIDs1.get(k); 595 596 //read the stored document 597 FeatureMap features1 = Factory.newFeatureMap(); 598 features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1); 599 features1.put(DataStore.LR_ID_FEATURE_NAME, docID1); 600 Document tempDoc = (Document) Factory.createResource( 601 "gate.corpora.DocumentImpl", 602 features1); 603 //check whether this is our doc 604 if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")). 605 endsWith(persDoc.getName())) { 606 found = true; 607 markedDoc = tempDoc; 608 } else k++; 609 } 610 } catch (java.net.MalformedURLException ex) { 611 Out.prln("Error finding marked directory " + markedDir.getAbsolutePath()); 612 } catch (gate.persist.PersistenceException ex1) { 613 Out.prln("Error opening marked as a datastore (-marked_ds specified)"); 614 } catch (gate.creole.ResourceInstantiationException ex2) { 615 Out.prln("Error opening marked as a datastore (-marked_ds specified)"); 616 } 617 } 618 619 evaluateDocuments(persDoc, cleanDoc, markedDoc); 620 if (persDoc != null) 621 Factory.deleteResource(persDoc); 622 if (markedDoc != null) 623 Factory.deleteResource(markedDoc); 624 625 }//for loop through saved docs 626 sds.close(); 627 628 } catch (java.net.MalformedURLException ex) { 629 throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage()); 630 } catch (PersistenceException ex1) { 631 throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage()); 632 } catch (ResourceInstantiationException ex2) { 633 throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage()); 634 } 635 636 }//evaluateMarkedStored 637 638 639 protected void evaluateMarkedClean(File markedDir, File cleanDir) { 640 Document persDoc = null; 641 Document cleanDoc = null; 642 Document markedDoc = null; 643 644 File[] cleanDocs = cleanDir.listFiles(); 645 for (int i = 0; i< cleanDocs.length; i++) { 646 if (!cleanDocs[i].isFile()) 647 continue; 648 649 //try reading the original document from clean 650 FeatureMap params = Factory.newFeatureMap(); 651 try { 652 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocs[i].toURL()); 653 } catch (java.net.MalformedURLException ex) { 654 Out.prln("Cannot create document from file: " + 655 cleanDocs[i].getAbsolutePath()); 656 continue; 657 } 658 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, ""); 659 660 // create the document 661 try { 662 cleanDoc = (Document) Factory.createResource( 663 "gate.corpora.DocumentImpl", params, 664 null, cleanDocs[i].getName()); 665 } catch (gate.creole.ResourceInstantiationException ex) { 666 Out.prln("Cannot create document from file: " + 667 cleanDocs[i].getAbsolutePath()); 668 continue; 669 } 670 671 Out.prln("<TD>" + cleanDocs[i].getName() + "</TD>"); 672 673 //try finding the marked document 674 if (! isMarkedDS) { 675 StringBuffer docName = new StringBuffer(cleanDoc.getName()); 676 docName.replace( 677 cleanDoc.getName().lastIndexOf("."), 678 docName.length(), 679 ".xml"); 680 File markedDocFile = new File(markedDir, docName.toString()); 681 if (! markedDocFile.exists()) { 682 Out.prln("Warning: Cannot find human-annotated document " + 683 markedDocFile + " in " + markedDir); 684 continue; 685 } else { 686 params = Factory.newFeatureMap(); 687 try { 688 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL()); 689 } catch (java.net.MalformedURLException ex) { 690 Out.prln("Cannot create document from file: " + 691 markedDocFile.getAbsolutePath()); 692 continue; 693 } 694 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, ""); 695 696 // create the document 697 try { 698 markedDoc = (Document) Factory.createResource( 699 "gate.corpora.DocumentImpl", params, 700 null, cleanDoc.getName()); 701 } catch (gate.creole.ResourceInstantiationException ex) { 702 Out.prln("Cannot create document from file: " + 703 markedDocFile.getAbsolutePath()); 704 continue; 705 } 706 707 }//if markedDoc exists 708 } else { 709 try { 710 //open marked from a DS 711 //open the data store 712 DataStore sds1 = Factory.openDataStore 713 ("gate.persist.SerialDataStore", 714 markedDir.toURL().toExternalForm()); 715 716 List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl"); 717 boolean found = false; 718 int k = 0; 719 //search for the marked doc with the same name 720 while (k < lrIDs1.size() && !found) { 721 String docID1 = (String) lrIDs1.get(k); 722 723 //read the stored document 724 FeatureMap features1 = Factory.newFeatureMap(); 725 features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1); 726 features1.put(DataStore.LR_ID_FEATURE_NAME, docID1); 727 Document tempDoc = (Document) Factory.createResource( 728 "gate.corpora.DocumentImpl", 729 features1); 730 //check whether this is our doc 731 if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")). 732 endsWith(cleanDoc.getName())) { 733 found = true; 734 markedDoc = tempDoc; 735 } else k++; 736 } 737 } catch (java.net.MalformedURLException ex) { 738 Out.prln("Error finding marked directory " + markedDir.getAbsolutePath()); 739 } catch (gate.persist.PersistenceException ex1) { 740 Out.prln("Error opening marked as a datastore (-marked_ds specified)"); 741 } catch (gate.creole.ResourceInstantiationException ex2) { 742 Out.prln("Error opening marked as a datastore (-marked_ds specified)"); 743 } 744 } //if using a DS for marked 745 746 try { 747 evaluateDocuments(persDoc, cleanDoc, markedDoc); 748 } catch (gate.creole.ResourceInstantiationException ex) { 749 ex.printStackTrace(); 750 Out.prln("Evaluate failed on document: " + cleanDoc.getName()); 751 } 752 if (persDoc != null) 753 Factory.deleteResource(persDoc); 754 if (cleanDoc != null) 755 Factory.deleteResource(cleanDoc); 756 if (markedDoc != null) 757 Factory.deleteResource(markedDoc); 758 759 }//for loop through clean docs 760 761 762 }//evaluateMarkedClean 763 764 protected void processDocument(Document doc) { 765 try { 766 if (application instanceof CorpusController) { 767 Corpus tempCorpus = Factory.newCorpus("temp"); 768 tempCorpus.add(doc); 769 ((CorpusController)application).setCorpus(tempCorpus); 770 application.execute(); 771 Factory.deleteResource(tempCorpus); 772 tempCorpus = null; 773 } else { 774 Iterator iter = application.getPRs().iterator(); 775 while (iter.hasNext()) 776 ((ProcessingResource) iter.next()).setParameterValue("document", doc); 777 application.execute(); 778 } 779 } catch (ResourceInstantiationException ex) { 780 throw new RuntimeException("Error executing application: " 781 + ex.getMessage()); 782 } catch (ExecutionException ex) { 783 throw new RuntimeException("Error executing application: " 784 + ex.getMessage()); 785 } 786 } 787 788 protected void evaluateDocuments(Document persDoc, 789 Document cleanDoc, Document markedDoc) 790 throws ResourceInstantiationException { 791 if (cleanDoc == null && markedDoc == null) 792 return; 793 794 //we've got no types to compare 795 if (annotTypes == null || annotTypes.isEmpty()) 796 return; 797 798 if (cleanDoc != null && !isMarkedStored) { 799 800 processDocument(cleanDoc); 801 802 if(!isMarkedClean) 803 evaluateAllThree(persDoc, cleanDoc, markedDoc); 804 else 805 evaluateTwoDocs(markedDoc, cleanDoc); 806 807 } else 808 evaluateTwoDocs(markedDoc, persDoc); 809 810 } 811 812 protected void evaluateAllThree(Document persDoc, 813 Document cleanDoc, Document markedDoc) 814 throws ResourceInstantiationException { 815 //first start the table and its header 816 printTableHeader(); 817 for (int jj= 0; jj< annotTypes.size(); jj++) { 818 String annotType = (String) annotTypes.get(jj); 819 820 AnnotationDiff annotDiff=measureDocs(markedDoc, cleanDoc, annotType); 821 //we don't have this annotation type in this document 822 if (annotDiff == null) 823 continue; 824 Out.prln("<TR>"); 825 826 //increase the number of processed documents 827 docNumber++; 828 //add precison and recall to the sums 829 updateStatistics(annotDiff, annotType); 830 831 Out.prln("<TD> Annotation type: " + annotType + "</TD>"); 832 833 AnnotationDiff annotDiff1 = 834 measureDocs(markedDoc, persDoc, annotType); 835 836 Out.prln("<TD>" + annotDiff.getPrecisionAverage()); 837 //check the precision first 838 if (annotDiff1 != null && 839 annotDiff!= null && 840 annotDiff1.getPrecisionAverage()<annotDiff.getPrecisionAverage() 841 ) 842 Out.prln("<P> Precision increase on human-marked from " + 843 annotDiff1.getPrecisionAverage() + " to " + 844 annotDiff.getPrecisionAverage() + "</P>"); 845 else if (annotDiff1 != null 846 && annotDiff != null 847 && annotDiff1.getPrecisionAverage() 848 > annotDiff.getPrecisionAverage()) 849 Out.prln("<P> Precision decrease on human-marked from " + 850 annotDiff1.getPrecisionAverage() + " to " + 851 annotDiff.getPrecisionAverage() + "</P>"); 852 Out.prln("</TD>"); 853 854 Out.prln("<TD>" + annotDiff.getRecallAverage()); 855 //check the recall now 856 if (annotDiff1 != null && 857 annotDiff!= null && 858 annotDiff1.getRecallAverage()<annotDiff.getRecallAverage() 859 ) 860 Out.prln("<P> Recall increase on human-marked from " + 861 annotDiff1.getRecallAverage() + " to " + 862 annotDiff.getRecallAverage() + "</P>"); 863 else if (annotDiff1 != null 864 && annotDiff != null 865 && annotDiff1.getRecallAverage() 866 > annotDiff.getRecallAverage()) 867 Out.prln("<P> Recall decrease on human-marked from " + 868 annotDiff1.getRecallAverage() + " to " + 869 annotDiff.getRecallAverage() + "</P>"); 870 871 Out.prln("</TD>"); 872 873 //check the recall now 874 if ( isVerboseMode 875 && 876 ((annotDiff.getRecallAverage() < threshold 877 || 878 annotDiff.getRecallAverage() < threshold) 879 ) 880 ) 881 printAnnotations(annotDiff, markedDoc, cleanDoc); 882 883 884 Out.prln("</TR>"); 885 }//for loop through annotation types 886 Out.prln("</TABLE>"); 887 888 }//evaluateAllThree 889 890 protected void evaluateTwoDocs(Document keyDoc, Document respDoc) 891 throws ResourceInstantiationException { 892 893 //first start the table and its header 894 printTableHeader(); 895 for (int jj= 0; jj< annotTypes.size(); jj++) { 896 String annotType = (String) annotTypes.get(jj); 897 898 AnnotationDiff annotDiff=measureDocs(keyDoc, respDoc, annotType); 899 //we don't have this annotation type in this document 900 if (annotDiff == null) 901 continue; 902 Out.prln("<TR>"); 903 904 //increase the number of processed documents 905 docNumber++; 906 //add precison and recall to the sums 907 updateStatistics(annotDiff, annotType); 908 909 Out.prln("<TD>" + annotType + "</TD>"); 910 911 Out.prln("<TD>" + annotDiff.getPrecisionAverage() + "</TD>"); 912 Out.prln("<TD>" + annotDiff.getRecallAverage() + "</TD>"); 913 //check the recall now 914 if ( isVerboseMode 915 && 916 ((annotDiff.getRecallAverage() < threshold 917 || 918 annotDiff.getRecallAverage() < threshold) 919 ) 920 ) 921 printAnnotations(annotDiff, keyDoc, respDoc); 922 923 Out.prln("</TR>"); 924 }//for loop through annotation types 925 Out.prln("</TABLE>"); 926 927 }//evaluateTwoDocs 928 929 protected void printTableHeader() { 930 Out.prln("<TABLE BORDER=1"); 931 if (isVerboseMode) 932 Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Precision</B></TD> " 933 + "<TD><B>Recall</B></TD> <TD><B>Annotations<B></TD>"); 934 else 935 Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Precision</B></TD> " 936 + "<TD><B>Recall</B></TD>"); 937 } 938 939 protected void updateStatistics(AnnotationDiff annotDiff, String annotType){ 940 precisionSum += annotDiff.getPrecisionAverage(); 941 recallSum += annotDiff.getRecallAverage(); 942 fMeasureSum += annotDiff.getFMeasureAverage(); 943 Double oldPrecision = (Double) precisionByType.get(annotType); 944 if (oldPrecision == null) 945 precisionByType.put(annotType, 946 new Double(annotDiff.getPrecisionAverage())); 947 else 948 precisionByType.put(annotType, 949 new Double(oldPrecision.doubleValue() + 950 annotDiff.getPrecisionAverage())); 951 Integer precCount = (Integer) prCountByType.get(annotType); 952 if (precCount == null) 953 prCountByType.put(annotType, new Integer(1)); 954 else 955 prCountByType.put(annotType, new Integer(precCount.intValue() + 1)); 956 957 958 Double oldFMeasure = (Double) fMeasureByType.get(annotType); 959 if (oldFMeasure == null) 960 fMeasureByType.put(annotType, 961 new Double(annotDiff.getFMeasureAverage())); 962 else 963 fMeasureByType.put(annotType, 964 new Double(oldFMeasure.doubleValue() + 965 annotDiff.getFMeasureAverage())); 966 Integer fCount = (Integer) fMeasureCountByType.get(annotType); 967 if (fCount == null) 968 fMeasureCountByType.put(annotType, new Integer(1)); 969 else 970 fMeasureCountByType.put(annotType, new Integer(fCount.intValue() + 1)); 971 972 Double oldRecall = (Double) recallByType.get(annotType); 973 if (oldRecall == null) 974 recallByType.put(annotType, 975 new Double(annotDiff.getRecallAverage())); 976 else 977 recallByType.put(annotType, 978 new Double(oldRecall.doubleValue() + 979 annotDiff.getRecallAverage())); 980 Integer recCount = (Integer) recCountByType.get(annotType); 981 if (recCount == null) 982 recCountByType.put(annotType, new Integer(1)); 983 else 984 recCountByType.put(annotType, new Integer(recCount.intValue() + 1)); 985 } 986 987 protected void printStatistics() { 988 989 Out.prln("<H2> Statistics </H2>"); 990 Out.prln("<H3> Precision </H3>"); 991 if (precisionByType != null && !precisionByType.isEmpty()) { 992 Iterator iter = precisionByType.keySet().iterator(); 993 while (iter.hasNext()) { 994 String annotType = (String) iter.next(); 995 Out.prln(annotType + ": " 996 + ((Double)precisionByType.get(annotType)).doubleValue() 997 / 998 ((Integer)prCountByType.get(annotType)).intValue() 999 + "<P>"); 1000 }//while 1001 } 1002 Out.prln("Overall precision: " + getPrecisionAverage() + "<P>"); 1003 1004 Out.prln("<H3> Recall </H3>"); 1005 if (recallByType != null && !recallByType.isEmpty()) { 1006 Iterator iter = recallByType.keySet().iterator(); 1007 while (iter.hasNext()) { 1008 String annotType = (String) iter.next(); 1009 Out.prln(annotType + ": " 1010 + ((Double)recallByType.get(annotType)).doubleValue() 1011 / 1012 ((Integer)recCountByType.get(annotType)).intValue() 1013 + "<P>"); 1014 }//while 1015 } 1016 1017 Out.prln("Overall recall: " + getRecallAverage() 1018 + "<P>"); 1019 1020 Out.prln("<H3> F-Measure </H3>"); 1021 if (fMeasureByType != null && !fMeasureByType.isEmpty()) { 1022 Iterator iter = fMeasureByType.keySet().iterator(); 1023 while (iter.hasNext()) { 1024 String annotType = (String) iter.next(); 1025 Out.prln(annotType + ": " 1026 + ((Double)fMeasureByType.get(annotType)).doubleValue() 1027 / 1028 ((Integer)fMeasureCountByType.get(annotType)).intValue() 1029 + "<P>"); 1030 }//while 1031 } 1032 1033 Out.prln("Overall average fMeasure: " + fMeasureSum/docNumber 1034 + "<P>"); 1035 1036 } 1037 1038 protected AnnotationDiff measureDocs( 1039 Document keyDoc, Document respDoc, String annotType) 1040 throws ResourceInstantiationException { 1041 1042 if (keyDoc == null || respDoc == null) 1043 return null; 1044 1045 if (annotSetName != null 1046 && keyDoc.getAnnotations(annotSetName).get(annotType) == null) 1047 return null; 1048 else if ((annotSetName == null || annotSetName.equals("")) 1049 && keyDoc.getAnnotations().get(annotType) == null) 1050 return null; 1051 1052 // create the annotation schema needed for AnnotationDiff 1053 AnnotationSchema annotationSchema = new AnnotationSchema(); 1054 1055 // set annotation type 1056 annotationSchema.setAnnotationName(annotType); 1057 // create an annotation diff 1058 AnnotationDiff annotDiff = new AnnotationDiff(); 1059 annotDiff.setAnnotationSchema(annotationSchema); 1060 annotDiff.setKeyDocument(keyDoc); 1061 annotDiff.setResponseDocument(respDoc); 1062 annotDiff.setKeyAnnotationSetName(annotSetName); 1063 annotDiff.setResponseAnnotationSetName(outputSetName); 1064 annotDiff.setKeyFeatureNamesSet(new HashSet()); 1065 annotDiff.setTextMode(new Boolean(true)); 1066 annotDiff.init(); 1067 1068 return annotDiff; 1069 } 1070 1071 protected void printAnnotations(AnnotationDiff annotDiff, 1072 Document keyDoc, Document respDoc) { 1073 Out.prln("<TD>"); 1074 Out.pr("MISSING ANNOTATIONS in the automatic texts: "); 1075 Set missingSet = 1076 annotDiff.getAnnotationsOfType(AnnotationDiff.MISSING_TYPE); 1077 printAnnotations(missingSet, keyDoc); 1078 Out.prln("<BR>"); 1079 1080 Out.pr("SPURIOUS ANNOTATIONS in the automatic texts: "); 1081 Set spuriousSet = 1082 annotDiff.getAnnotationsOfType(AnnotationDiff.SPURIOUS_TYPE); 1083 printAnnotations(spuriousSet, respDoc); 1084 Out.prln("</BR>"); 1085 1086 Out.pr("PARTIALLY CORRECT ANNOTATIONS in the automatic texts: "); 1087 Set partialSet = 1088 annotDiff.getAnnotationsOfType(AnnotationDiff.PARTIALLY_CORRECT_TYPE); 1089 printAnnotations(partialSet, respDoc); 1090 Out.prln("</TD>"); 1091 1092 } 1093 1094 protected void printAnnotations(Set set, Document doc) { 1095 if (set == null || set.isEmpty()) 1096 return; 1097 1098 Iterator iter = set.iterator(); 1099 while (iter.hasNext()) { 1100 Annotation ann = (Annotation) iter.next(); 1101 Out.prln( 1102 "<B>" + 1103 doc.getContent().toString().substring( 1104 ann.getStartNode().getOffset().intValue(), 1105 ann.getEndNode().getOffset().intValue()) + 1106 "</B>: <I>[" + ann.getStartNode().getOffset() + 1107 "," + ann.getEndNode().getOffset() + "]</I>" 1108// + "; features" + ann.getFeatures() 1109 ); 1110 }//while 1111 } 1112 1113 /** 1114 * The directory from which we should generate/evaluate the corpus 1115 */ 1116 private File startDir; 1117 private File currDir; 1118 private static List annotTypes; 1119 1120 private Controller application = null; 1121 private File applicationFile = null; 1122 1123 //collect the sum of all precisions and recalls of all docs 1124 //and the number of docs, so I can calculate the average for 1125 //the corpus at the end 1126 private double precisionSum = 0; 1127 private double recallSum = 0; 1128 private double fMeasureSum = 0; 1129 private HashMap precisionByType = new HashMap(); 1130 private HashMap prCountByType = new HashMap(); 1131 private HashMap recallByType = new HashMap(); 1132 private HashMap recCountByType = new HashMap(); 1133 private HashMap fMeasureByType = new HashMap(); 1134 private HashMap fMeasureCountByType = new HashMap(); 1135 private int docNumber = 0; 1136 1137 /** 1138 * If true, the corpus tool will generate the corpus, otherwise it'll 1139 * run in evaluate mode 1140 */ 1141 private boolean isGenerateMode = false; 1142 private boolean isVerboseMode = false; 1143 1144 /** 1145 * If true, the corpus tool will evaluate stored against the human-marked 1146 * documents 1147 */ 1148 private boolean isMarkedStored = false; 1149 private boolean isMarkedClean = false; 1150 //whether marked are in a DS, not xml 1151 private boolean isMarkedDS = false; 1152 1153 private String annotSetName = "Key"; 1154 private String outputSetName = null; 1155 1156 private double threshold = 0.5; 1157 private Properties configs = new Properties(); 1158 1159 /** String to print when wrong command-line args */ 1160 private static String usage = 1161 "usage: CorpusBenchmarkTool [-generate|-marked_stored|-marked_clean] [-verbose] directory-name application"; 1162 1163}
|
CorpusBenchmarkTool |
|