|
CorpusBenchmarkTool |
|
1 /* 2 * CorpusBenchmarkTool.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Kalina Bontcheva, 24/Oct/2001 12 * 13 * $Id: CorpusBenchmarkTool.java,v 1.41 2003/06/24 22:03:04 kalina Exp $ 14 */ 15 16 package gate.util; 17 18 import java.util.*; 19 import java.io.*; 20 21 import gate.*; 22 import gate.creole.*; 23 import gate.util.*; 24 import gate.persist.*; 25 import gate.creole.tokeniser.*; 26 import gate.creole.gazetteer.*; 27 import gate.creole.splitter.*; 28 import gate.creole.orthomatcher.*; 29 import gate.creole.annotransfer.*; 30 import gate.annotation.*; 31 32 public class CorpusBenchmarkTool { 33 private static final String MARKED_DIR_NAME = "marked"; 34 private static final String CLEAN_DIR_NAME = "clean"; 35 private static final String CVS_DIR_NAME = "Cvs"; 36 private static final String PROCESSED_DIR_NAME = "processed"; 37 private static final String ERROR_DIR_NAME = "err"; 38 39 private static final boolean DEBUG = true; 40 41 public CorpusBenchmarkTool() {} 42 43 public void initPRs() { 44 try { 45 if (applicationFile == null) 46 Out.prln("Application not set!"); 47 Out.prln("App file is: " + applicationFile.getAbsolutePath()); 48 application = (Controller) gate.util.persistence.PersistenceManager 49 .loadObjectFromFile(applicationFile); 50 } catch (Exception ex) { 51 throw new GateRuntimeException("Corpus Benchmark Tool:"+ex.getMessage()); 52 } 53 }//initPRs 54 55 public void unloadPRs() { 56 //we have nothing to unload if no PRs are loaded 57 if (isMarkedStored) 58 return; 59 60 } 61 62 public void execute() { 63 /* 64 Out.prln("Flags Gen Cln Str Vrb Minf: " 65 + isGenerateMode +" "+ isMarkedClean +" "+ isMarkedStored 66 +" "+ isVerboseMode +" "+ isMoreInfoMode); 67 */ 68 execute(startDir); 69 if (application != null) { 70 Iterator iter = new ArrayList(application.getPRs()).iterator(); 71 while (iter.hasNext()) 72 Factory.deleteResource((Resource) iter.next()); 73 Factory.deleteResource(application); 74 } 75 } 76 77 public void init() { 78 //first read the corpus_tool.properties file 79 File propFile = new File("corpus_tool.properties"); 80 Out.prln(propFile.getAbsolutePath()); 81 if (propFile.exists()) { 82 try { 83 InputStream inputStream = new FileInputStream(propFile); 84 this.configs.load(inputStream); 85 String thresholdString = this.configs.getProperty("threshold"); 86 if (thresholdString != null && !thresholdString.equals("")) { 87 this.threshold = (new Double(thresholdString)).doubleValue(); 88 Out.prln("New threshold is: " + this.threshold + "<P>\n"); 89 } 90 String setName = this.configs.getProperty("annotSetName"); 91 if (setName != null && !setName.equals("")) { 92 Out.prln("Annotation set in marked docs is: " + setName + " <P>\n"); 93 this.annotSetName = setName; 94 } 95 setName = this.configs.getProperty("outputSetName"); 96 if (setName != null && !setName.equals("")) { 97 Out.prln("Annotation set in processed docs is: " + setName + " <P>\n"); 98 this.outputSetName = setName; 99 } 100 String encodingString = this.configs.getProperty("encoding"); 101 if (encodingString != null && !encodingString.equals("")) { 102 this.documentEncoding = encodingString; 103 Out.prln("New encoding is: " + this.documentEncoding + "<P>\n"); 104 } 105 String types = this.configs.getProperty("annotTypes"); 106 if (types != null && !types.equals("")) { 107 Out.prln("Using annotation types from the properties file. <P>\n"); 108 StringTokenizer strTok = new StringTokenizer(types, ";"); 109 annotTypes = new ArrayList(); 110 while (strTok.hasMoreTokens()) 111 annotTypes.add(strTok.nextToken()); 112 } else { 113 annotTypes = new ArrayList(); 114 annotTypes.add("Organization"); 115 annotTypes.add("Person"); 116 annotTypes.add("Date"); 117 annotTypes.add("Location"); 118 annotTypes.add("Address"); 119 annotTypes.add("Money"); 120 annotTypes.add("Percent"); 121 annotTypes.add("GPE"); 122 annotTypes.add("Facility"); 123 } 124 String features = this.configs.getProperty("annotFeatures"); 125 HashSet result = new HashSet(); 126 if (features != null && !features.equals("")) { 127 Out.pr("Using annotation features from the properties file. \n"); 128 java.util.StringTokenizer tok = 129 new java.util.StringTokenizer(features, ";"); 130 String current; 131 while(tok.hasMoreTokens()) { 132 current = tok.nextToken(); 133 result.add(current); 134 } // while 135 } 136 diffFeaturesSet = result; 137 Out.prln("Features: "+diffFeaturesSet+" <P>\n"); 138 139 } catch (IOException ex) { 140 //just ignore the file and go on with the defaults 141 this.configs = new Properties(); 142 } 143 } else 144 this.configs = new Properties(); 145 146 147 //we only initialise the PRs if they are going to be used 148 //for processing unprocessed documents 149 if (!this.isMarkedStored) 150 initPRs(); 151 152 } 153 154 public void execute(File dir) { 155 if (dir == null) 156 return; 157 //first set the current directory to be the given one 158 currDir = dir; 159 160 File processedDir = null; 161 File cleanDir = null; 162 File markedDir = null; 163 File errorDir = null; 164 165 ArrayList subDirs = new ArrayList(); 166 File[] dirArray = currDir.listFiles(); 167 if(dirArray == null) return; 168 for (int i = 0; i < dirArray.length; i++) { 169 if (dirArray[i].isFile() || dirArray[i].getName().equals(CVS_DIR_NAME)) 170 continue; 171 if (dirArray[i].getName().equals(CLEAN_DIR_NAME)) 172 cleanDir = dirArray[i]; 173 else if (dirArray[i].getName().equals(MARKED_DIR_NAME)) 174 markedDir = dirArray[i]; 175 else if (dirArray[i].getName().equals(PROCESSED_DIR_NAME)) 176 processedDir = dirArray[i]; 177 else if (dirArray[i].getName().equals(ERROR_DIR_NAME)) 178 errorDir = dirArray[i]; 179 else 180 subDirs.add(dirArray[i]); 181 } 182 183 if(cleanDir == null) return; 184 Out.prln("Processing directory: " + currDir + "<P>"); 185 186 if (this.isGenerateMode) 187 generateCorpus(cleanDir, processedDir); 188 else 189 evaluateCorpus(cleanDir, processedDir, markedDir, errorDir); 190 191 //if no more subdirs left, return 192 if (subDirs.isEmpty()) 193 return; 194 195 //there are more subdirectories to traverse, so iterate through 196 for (int j = 0; j < subDirs.size(); j++) 197 execute((File) subDirs.get(j)); 198 199 }//execute(dir) 200 201 202 public static void main(String[] args) throws GateException { 203 Out.prln("<HTML>"); 204 Out.prln("<HEAD>"); 205 Out.prln("<TITLE> Corpus benchmark tool: ran with args "); 206 for(int argC=0; argC < args.length; ++argC) 207 Out.pr(args[argC]+" "); 208 Out.pr(" on " + new Date() + "</TITLE> </HEAD>"); 209 Out.prln("<BODY>"); 210 Out.prln("Please wait while GATE tools are initialised. <P>"); 211 // initialise GATE 212 Gate.init(); 213 214 CorpusBenchmarkTool corpusTool = new CorpusBenchmarkTool(); 215 216 List inputFiles = null; 217 if(args.length < 1) throw new GateException(usage); 218 int i = 0; 219 while (i < args.length && args[i].startsWith("-")) { 220 if(args[i].equals("-generate")) { 221 Out.prln("Generating the corpus... <P>"); 222 corpusTool.setGenerateMode(true); 223 } else if (args[i].equals("-marked_clean")) { 224 Out.prln("Evaluating current grammars against human-annotated...<P>"); 225 corpusTool.setMarkedClean(true); 226 } else if (args[i].equals("-marked_stored")) { 227 Out.prln("Evaluating stored documents against human-annotated...<P>"); 228 corpusTool.setMarkedStored(true); 229 } else if (args[i].equals("-marked_ds")) { 230 Out.prln("Looking for marked docs in a datastore...<P>"); 231 corpusTool.setMarkedDS(true); 232 } else if (args[i].equals("-verbose")) { 233 Out.prln("Running in verbose mode. Will generate annotation " + 234 "information when precision/recall are lower than " + 235 corpusTool.getThreshold() +"<P>"); 236 corpusTool.setVerboseMode(true); 237 } else if (args[i].equals("-moreinfo")) { 238 Out.prln("Show more details in document table...<P>"); 239 corpusTool.setMoreInfo(true); 240 } 241 i++; //just ignore the option, which we do not recognise 242 }//while 243 244 String dirName = args[i]; 245 File dir = new File(dirName); 246 if (!dir.isDirectory()) 247 throw new GateException(usage); 248 249 //get the last argument which is the application 250 i++; 251 String appName = args[i]; 252 File appFile = new File(appName); 253 if (!appFile.isFile()) 254 throw new GateException(usage); 255 else 256 corpusTool.setApplicationFile(appFile); 257 258 corpusTool.init(); 259 corpusWordCount = 0; 260 261 Out.prln("Measuring annotaitions of types: " + corpusTool.annotTypes + "<P>"); 262 263 corpusTool.setStartDirectory(dir); 264 corpusTool.execute(); 265 266 //if we're not generating the corpus, then print the precision and recall 267 //statistics for the processed corpus 268 if (! corpusTool.getGenerateMode()) 269 corpusTool.printStatistics(); 270 271 Out.prln("<BR>Overall average precision: " + corpusTool.getPrecisionAverage()); 272 Out.prln("<BR>Overall average recall: " + corpusTool.getRecallAverage()); 273 if(corpusWordCount == 0) 274 Out.prln("<BR>No Token annotations to count words in the corpus."); 275 else 276 Out.prln("<BR>Overall word count: " + corpusWordCount); 277 278 279 if(hasProcessed) { 280 Out.prln("<P>Old Processed: "); 281 Out.prln("<BR>Overall average precision: " 282 + corpusTool.getPrecisionAverageProc()); 283 Out.prln("<BR>Overall average recall: " 284 + corpusTool.getRecallAverageProc()); 285 } 286 Out.prln("<BR>Finished! <P>"); 287 Out.prln("</BODY>"); 288 Out.prln("</HTML>"); 289 290 System.exit(0); 291 292 }//main 293 294 public void setGenerateMode(boolean mode) { 295 isGenerateMode = mode; 296 }//setGenerateMode 297 298 public boolean getGenerateMode() { 299 return isGenerateMode; 300 }//getGenerateMode 301 302 public boolean getVerboseMode() { 303 return isVerboseMode; 304 }//getVerboseMode 305 306 public void setVerboseMode(boolean mode) { 307 isVerboseMode = mode; 308 }//setVerboseMode 309 310 public void setMoreInfo(boolean mode) { 311 isMoreInfoMode = mode; 312 } // setMoreInfo 313 314 public boolean getMoreInfo() { 315 return isMoreInfoMode; 316 } // getMoreInfo 317 318 public void setDiffFeaturesList(Set features) { 319 diffFeaturesSet = features; 320 } // setDiffFeaturesList 321 322 public Set getDiffFeaturesList() { 323 return diffFeaturesSet; 324 } // getDiffFeaturesList 325 326 public void setMarkedStored(boolean mode) { 327 isMarkedStored = mode; 328 }// setMarkedStored 329 330 331 public boolean getMarkedStored() { 332 return isMarkedStored; 333 }// getMarkedStored 334 335 public void setMarkedClean(boolean mode) { 336 isMarkedClean = mode; 337 }// 338 339 public boolean getMarkedClean() { 340 return isMarkedClean; 341 }// 342 343 public void setMarkedDS(boolean mode) { 344 isMarkedDS = mode; 345 }// 346 347 public boolean getMarkedDS() { 348 return isMarkedDS; 349 }// 350 351 public void setApplicationFile(File newAppFile) { 352 applicationFile = newAppFile; 353 } 354 355 /** 356 * Returns the average precision over the entire set of processed documents. 357 * <P> 358 * If the tool has been evaluating the original documents against the 359 * previously-stored automatically annotated ones, then the precision 360 * will be the average precision on those two sets. <P> 361 * If the tool was run in -marked mode, i.e., was evaluating the stored 362 * automatically processed ones against the human-annotated ones, then 363 * the precision will be the average precision on those two sets of documents. 364 */ 365 public double getPrecisionAverage() { 366 return precisionSum/docNumber; 367 } 368 369 /** 370 * Returns the average recall over the entire set of processed documents. 371 * <P> 372 * If the tool has been evaluating the original documents against the 373 * previously-stored automatically annotated ones, then the recall 374 * will be the average recall on those two sets. <P> 375 * If the tool was run in -marked mode, i.e., was evaluating the stored 376 * automatically processed ones against the human-annotated ones, then 377 * the recall will be the average recall on those two sets of documents. 378 */ 379 public double getRecallAverage() { 380 return recallSum/docNumber; 381 } 382 383 /** For processed documents */ 384 public double getPrecisionAverageProc() { 385 return proc_precisionSum/docNumber; 386 } 387 public double getRecallAverageProc() { 388 return proc_recallSum/docNumber; 389 } 390 391 392 public boolean isGenerateMode() { 393 return isGenerateMode == true; 394 }//isGenerateMode 395 396 public double getThreshold() { 397 return threshold; 398 } 399 400 public void setThreshold(double newValue) { 401 threshold = newValue; 402 } 403 404 public File getStartDirectory() { 405 return startDir; 406 }//getStartDirectory 407 408 public void setStartDirectory(File dir) { 409 startDir = dir; 410 }//setStartDirectory 411 412 protected void generateCorpus(File fileDir, File outputDir) { 413 //1. check if we have input files 414 if (fileDir == null) 415 return; 416 //2. create the output directory or clean it up if needed 417 File outDir = outputDir; 418 if (outputDir == null) { 419 outDir = new File(currDir, PROCESSED_DIR_NAME); 420 } else { 421 // get rid of the directory, coz datastore wants it clean 422 if (!Files.rmdir(outDir)) 423 Out.prln("cannot delete old output directory: " + outDir); 424 } 425 outDir.mkdir(); 426 427 //create the datastore and process each document 428 try { 429 SerialDataStore sds = new SerialDataStore(outDir.toURL().toString()); 430 sds.create(); 431 sds.open(); 432 433 File[] files = fileDir.listFiles(); 434 for (int i=0; i < files.length; i++) { 435 if (!files[i].isFile()) 436 continue; 437 // create a document 438 Out.prln("Processing and storing document: " + files[i].toURL() +"<P>"); 439 440 FeatureMap params = Factory.newFeatureMap(); 441 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, files[i].toURL()); 442 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding); 443 444 // create the document 445 Document doc = (Document) Factory.createResource( 446 "gate.corpora.DocumentImpl", params 447 ); 448 449 doc.setName(files[i].getName()); 450 if (doc == null) 451 continue; 452 processDocument(doc); 453 LanguageResource lr = sds.adopt(doc, null); 454 sds.sync(lr); 455 Factory.deleteResource(doc); 456 Factory.deleteResource(lr); 457 }//for 458 sds.close(); 459 } catch (java.net.MalformedURLException ex) { 460 throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage()); 461 } catch (PersistenceException ex1) { 462 throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage()); 463 } catch (ResourceInstantiationException ex2) { 464 throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage()); 465 } catch (gate.security.SecurityException ex3) { 466 throw new GateRuntimeException("CorpusBenchmark: " + ex3.getMessage()); 467 } 468 469 }//generateCorpus 470 471 protected void evaluateCorpus(File fileDir, 472 File processedDir, File markedDir, 473 File errorDir) { 474 //1. check if we have input files and the processed Dir 475 if (fileDir == null || !fileDir.exists()) 476 return; 477 if (processedDir == null || !processedDir.exists()) 478 //if the user wants evaluation of marked and stored that's not possible 479 if (isMarkedStored) { 480 Out.prln("Cannot evaluate because no processed documents exist."); 481 return; 482 } 483 else 484 isMarkedClean = true; 485 486 // create the error directory or clean it up if needed 487 File errDir = null; 488 if(isMoreInfoMode) { 489 errDir = errorDir; 490 if (errDir == null) { 491 errDir = new File(currDir, ERROR_DIR_NAME); 492 } 493 else { 494 // get rid of the directory, coz we wants it clean 495 if (!Files.rmdir(errDir)) 496 Out.prln("cannot delete old error directory: " + errDir); 497 } 498 Out.prln("Create error directory: " + errDir + "<BR><BR>"); 499 errDir.mkdir(); 500 } 501 502 //looked for marked texts only if the directory exists 503 boolean processMarked = markedDir != null && markedDir.exists(); 504 if (!processMarked && (isMarkedStored || isMarkedClean)) { 505 Out.prln("Cannot evaluate because no human-annotated documents exist."); 506 return; 507 } 508 509 if (isMarkedStored) { 510 evaluateMarkedStored(markedDir, processedDir, errDir); 511 return; 512 } else if (isMarkedClean) { 513 evaluateMarkedClean(markedDir, fileDir, errDir); 514 return; 515 } 516 517 Document persDoc = null; 518 Document cleanDoc = null; 519 Document markedDoc = null; 520 521 //open the datastore and process each document 522 try { 523 //open the data store 524 DataStore sds = Factory.openDataStore 525 ("gate.persist.SerialDataStore", 526 processedDir.toURL().toExternalForm()); 527 528 List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl"); 529 for (int i=0; i < lrIDs.size(); i++) { 530 String docID = (String) lrIDs.get(i); 531 532 //read the stored document 533 FeatureMap features = Factory.newFeatureMap(); 534 features.put(DataStore.DATASTORE_FEATURE_NAME, sds); 535 features.put(DataStore.LR_ID_FEATURE_NAME, docID); 536 persDoc = (Document) Factory.createResource( 537 "gate.corpora.DocumentImpl", 538 features); 539 540 if(isMoreInfoMode) { 541 StringBuffer errName = new StringBuffer(persDoc.getName()); 542 errName.replace( 543 persDoc.getName().lastIndexOf("."), 544 persDoc.getName().length(), 545 ".err"); 546 Out.prln("<H2>" + 547 "<a href=err/" + errName.toString() + ">" 548 + persDoc.getName() + "</a>" + "</H2>"); 549 } else 550 Out.prln("<H2>" + persDoc.getName() + "</H2>"); 551 552 File cleanDocFile = new File(fileDir, persDoc.getName()); 553 //try reading the original document from clean 554 if (! cleanDocFile.exists()) { 555 Out.prln("Warning: Cannot find original document " + 556 persDoc.getName() + " in " + fileDir); 557 } else { 558 FeatureMap params = Factory.newFeatureMap(); 559 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocFile.toURL()); 560 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding); 561 562 // create the document 563 cleanDoc = (Document) Factory.createResource( 564 "gate.corpora.DocumentImpl", params); 565 cleanDoc.setName(persDoc.getName()); 566 } 567 568 //try finding the marked document 569 StringBuffer docName = new StringBuffer(persDoc.getName()); 570 if (! isMarkedDS) { 571 docName.replace( 572 persDoc.getName().lastIndexOf("."), 573 docName.length(), 574 ".xml"); 575 File markedDocFile = new File(markedDir, docName.toString()); 576 if (! processMarked || ! markedDocFile.exists()) { 577 Out.prln("Warning: Cannot find human-annotated document " + 578 markedDocFile + " in " + markedDir); 579 } else { 580 FeatureMap params = Factory.newFeatureMap(); 581 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL()); 582 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding); 583 584 // create the document 585 markedDoc = (Document) Factory.createResource( 586 "gate.corpora.DocumentImpl", params); 587 markedDoc.setName(persDoc.getName()); 588 } 589 } else { 590 //open marked from a DS 591 //open the data store 592 DataStore sds1 = Factory.openDataStore 593 ("gate.persist.SerialDataStore", 594 markedDir.toURL().toExternalForm()); 595 596 List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl"); 597 boolean found = false; 598 int k = 0; 599 //search for the marked doc with the same name 600 while (k < lrIDs1.size() && !found) { 601 String docID1 = (String) lrIDs1.get(k); 602 603 //read the stored document 604 FeatureMap features1 = Factory.newFeatureMap(); 605 features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1); 606 features1.put(DataStore.LR_ID_FEATURE_NAME, docID1); 607 Document tempDoc = (Document) Factory.createResource( 608 "gate.corpora.DocumentImpl", 609 features1); 610 //check whether this is our doc 611 if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")). 612 endsWith(persDoc.getName())) { 613 found = true; 614 markedDoc = tempDoc; 615 } else k++; 616 } 617 } 618 619 evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir); 620 if (persDoc != null) 621 Factory.deleteResource(persDoc); 622 if (cleanDoc != null) 623 Factory.deleteResource(cleanDoc); 624 if (markedDoc != null) 625 Factory.deleteResource(markedDoc); 626 627 }//for loop through saved docs 628 sds.close(); 629 } catch (java.net.MalformedURLException ex) { 630 throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage()); 631 } catch (PersistenceException ex1) { 632 throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage()); 633 } catch (ResourceInstantiationException ex2) { 634 throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage()); 635 } 636 637 }//evaluateCorpus 638 639 protected void evaluateMarkedStored(File markedDir, File storedDir, File errDir) { 640 Document persDoc = null; 641 Document cleanDoc = null; 642 Document markedDoc = null; 643 644 //open the datastore and process each document 645 try { 646 //open the data store 647 DataStore sds = Factory.openDataStore 648 ("gate.persist.SerialDataStore", 649 storedDir.toURL().toExternalForm()); 650 651 List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl"); 652 for (int i=0; i < lrIDs.size(); i++) { 653 String docID = (String) lrIDs.get(i); 654 655 //read the stored document 656 FeatureMap features = Factory.newFeatureMap(); 657 features.put(DataStore.DATASTORE_FEATURE_NAME, sds); 658 features.put(DataStore.LR_ID_FEATURE_NAME, docID); 659 persDoc = (Document) Factory.createResource( 660 "gate.corpora.DocumentImpl", 661 features); 662 663 if(isMoreInfoMode) { 664 StringBuffer errName = new StringBuffer(persDoc.getName()); 665 errName.replace( 666 persDoc.getName().lastIndexOf("."), 667 persDoc.getName().length(), 668 ".err"); 669 Out.prln("<H2>" + 670 "<a href=err/" + errName.toString() + ">" 671 + persDoc.getName() + "</a>" + "</H2>"); 672 } else 673 Out.prln("<H2>" + persDoc.getName() + "</H2>"); 674 675 if (! this.isMarkedDS) { //try finding the marked document as file 676 StringBuffer docName = new StringBuffer(persDoc.getName()); 677 docName.replace( 678 persDoc.getName().lastIndexOf("."), 679 docName.length(), 680 ".xml"); 681 File markedDocFile = new File(markedDir, docName.toString()); 682 if (! markedDocFile.exists()) { 683 Out.prln("Warning: Cannot find human-annotated document " + 684 markedDocFile + " in " + markedDir); 685 } else { 686 FeatureMap params = Factory.newFeatureMap(); 687 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL()); 688 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding); 689 690 // create the document 691 markedDoc = (Document) Factory.createResource( 692 "gate.corpora.DocumentImpl", params); 693 markedDoc.setName(persDoc.getName()); 694 }//find marked as file 695 } else { 696 try { 697 //open marked from a DS 698 //open the data store 699 DataStore sds1 = Factory.openDataStore 700 ("gate.persist.SerialDataStore", 701 markedDir.toURL().toExternalForm()); 702 703 List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl"); 704 boolean found = false; 705 int k = 0; 706 //search for the marked doc with the same name 707 while (k < lrIDs1.size() && !found) { 708 String docID1 = (String) lrIDs1.get(k); 709 710 //read the stored document 711 FeatureMap features1 = Factory.newFeatureMap(); 712 features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1); 713 features1.put(DataStore.LR_ID_FEATURE_NAME, docID1); 714 Document tempDoc = (Document) Factory.createResource( 715 "gate.corpora.DocumentImpl", 716 features1); 717 //check whether this is our doc 718 if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")). 719 endsWith(persDoc.getName())) { 720 found = true; 721 markedDoc = tempDoc; 722 } else k++; 723 } 724 } catch (java.net.MalformedURLException ex) { 725 Out.prln("Error finding marked directory " + markedDir.getAbsolutePath()); 726 } catch (gate.persist.PersistenceException ex1) { 727 Out.prln("Error opening marked as a datastore (-marked_ds specified)"); 728 } catch (gate.creole.ResourceInstantiationException ex2) { 729 Out.prln("Error opening marked as a datastore (-marked_ds specified)"); 730 } 731 } 732 733 evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir); 734 if (persDoc != null) 735 Factory.deleteResource(persDoc); 736 if (markedDoc != null) 737 Factory.deleteResource(markedDoc); 738 739 }//for loop through saved docs 740 sds.close(); 741 742 } catch (java.net.MalformedURLException ex) { 743 throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage()); 744 } catch (PersistenceException ex1) { 745 throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage()); 746 } catch (ResourceInstantiationException ex2) { 747 throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage()); 748 } 749 750 }//evaluateMarkedStored 751 752 753 protected void evaluateMarkedClean(File markedDir, File cleanDir, File errDir) { 754 Document persDoc = null; 755 Document cleanDoc = null; 756 Document markedDoc = null; 757 758 File[] cleanDocs = cleanDir.listFiles(); 759 for (int i = 0; i< cleanDocs.length; i++) { 760 if (!cleanDocs[i].isFile()) 761 continue; 762 763 //try reading the original document from clean 764 FeatureMap params = Factory.newFeatureMap(); 765 try { 766 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocs[i].toURL()); 767 } catch (java.net.MalformedURLException ex) { 768 Out.prln("Cannot create document from file: " + 769 cleanDocs[i].getAbsolutePath()); 770 continue; 771 } 772 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, ""); 773 774 // create the document 775 try { 776 cleanDoc = (Document) Factory.createResource( 777 "gate.corpora.DocumentImpl", params, 778 null, cleanDocs[i].getName()); 779 } catch (gate.creole.ResourceInstantiationException ex) { 780 Out.prln("Cannot create document from file: " + 781 cleanDocs[i].getAbsolutePath()); 782 continue; 783 } 784 785 if(isMoreInfoMode) { 786 StringBuffer errName = new StringBuffer(cleanDocs[i].getName()); 787 errName.replace( 788 cleanDocs[i].getName().lastIndexOf("."), 789 cleanDocs[i].getName().length(), 790 ".err"); 791 Out.prln("<H2>" + 792 "<a href=err/" + errName.toString() + ">" 793 + cleanDocs[i].getName() + "</a>" + "</H2>"); 794 } else 795 Out.prln("<H2>" + cleanDocs[i].getName() + "</H2>"); 796 797 //try finding the marked document 798 if (! isMarkedDS) { 799 StringBuffer docName = new StringBuffer(cleanDoc.getName()); 800 docName.replace( 801 cleanDoc.getName().lastIndexOf("."), 802 docName.length(), 803 ".xml"); 804 File markedDocFile = new File(markedDir, docName.toString()); 805 if (! markedDocFile.exists()) { 806 Out.prln("Warning: Cannot find human-annotated document " + 807 markedDocFile + " in " + markedDir); 808 continue; 809 } else { 810 params = Factory.newFeatureMap(); 811 try { 812 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL()); 813 } catch (java.net.MalformedURLException ex) { 814 Out.prln("Cannot create document from file: " + 815 markedDocFile.getAbsolutePath()); 816 continue; 817 } 818 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, ""); 819 820 // create the document 821 try { 822 markedDoc = (Document) Factory.createResource( 823 "gate.corpora.DocumentImpl", params, 824 null, cleanDoc.getName()); 825 } catch (gate.creole.ResourceInstantiationException ex) { 826 Out.prln("Cannot create document from file: " + 827 markedDocFile.getAbsolutePath()); 828 continue; 829 } 830 831 }//if markedDoc exists 832 } else { 833 try { 834 //open marked from a DS 835 //open the data store 836 DataStore sds1 = Factory.openDataStore 837 ("gate.persist.SerialDataStore", 838 markedDir.toURL().toExternalForm()); 839 840 List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl"); 841 boolean found = false; 842 int k = 0; 843 //search for the marked doc with the same name 844 while (k < lrIDs1.size() && !found) { 845 String docID1 = (String) lrIDs1.get(k); 846 847 //read the stored document 848 FeatureMap features1 = Factory.newFeatureMap(); 849 features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1); 850 features1.put(DataStore.LR_ID_FEATURE_NAME, docID1); 851 Document tempDoc = (Document) Factory.createResource( 852 "gate.corpora.DocumentImpl", 853 features1); 854 //check whether this is our doc 855 if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")). 856 endsWith(cleanDoc.getName())) { 857 found = true; 858 markedDoc = tempDoc; 859 } else k++; 860 } 861 } catch (java.net.MalformedURLException ex) { 862 Out.prln("Error finding marked directory " + markedDir.getAbsolutePath()); 863 } catch (gate.persist.PersistenceException ex1) { 864 Out.prln("Error opening marked as a datastore (-marked_ds specified)"); 865 } catch (gate.creole.ResourceInstantiationException ex2) { 866 Out.prln("Error opening marked as a datastore (-marked_ds specified)"); 867 } 868 } //if using a DS for marked 869 870 try { 871 evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir); 872 } catch (gate.creole.ResourceInstantiationException ex) { 873 ex.printStackTrace(); 874 Out.prln("Evaluate failed on document: " + cleanDoc.getName()); 875 } 876 if (persDoc != null) 877 Factory.deleteResource(persDoc); 878 if (cleanDoc != null) 879 Factory.deleteResource(cleanDoc); 880 if (markedDoc != null) 881 Factory.deleteResource(markedDoc); 882 883 }//for loop through clean docs 884 885 886 }//evaluateMarkedClean 887 888 protected void processDocument(Document doc) { 889 try { 890 if (application instanceof CorpusController) { 891 Corpus tempCorpus = Factory.newCorpus("temp"); 892 tempCorpus.add(doc); 893 ((CorpusController)application).setCorpus(tempCorpus); 894 application.execute(); 895 Factory.deleteResource(tempCorpus); 896 tempCorpus = null; 897 } else { 898 Iterator iter = application.getPRs().iterator(); 899 while (iter.hasNext()) 900 ((ProcessingResource) iter.next()).setParameterValue("document", doc); 901 application.execute(); 902 } 903 } catch (ResourceInstantiationException ex) { 904 throw new RuntimeException("Error executing application: " 905 + ex.getMessage()); 906 } catch (ExecutionException ex) { 907 throw new RuntimeException("Error executing application: " 908 + ex.getMessage()); 909 } 910 } 911 912 protected void evaluateDocuments(Document persDoc, 913 Document cleanDoc, Document markedDoc, 914 File errDir) 915 throws ResourceInstantiationException { 916 if (cleanDoc == null && markedDoc == null) 917 return; 918 919 //we've got no types to compare 920 if (annotTypes == null || annotTypes.isEmpty()) 921 return; 922 923 if (cleanDoc != null && !isMarkedStored) { 924 925 processDocument(cleanDoc); 926 927 928 int wordCount = countWords(cleanDoc); 929 if(wordCount == 0) 930 Out.prln("<BR>No Token annotations to count words in the document."); 931 else 932 Out.prln("<BR>Word count: " + wordCount); 933 corpusWordCount += wordCount; 934 935 if(!isMarkedClean) 936 evaluateAllThree(persDoc, cleanDoc, markedDoc, errDir); 937 else 938 evaluateTwoDocs(markedDoc, cleanDoc, errDir); 939 940 } else 941 evaluateTwoDocs(markedDoc, persDoc, errDir); 942 943 } 944 945 /** 946 * Count all Token.kind=word annotations in the document 947 */ 948 protected int countWords(Document annotDoc) { 949 int count = 0; 950 951 if (annotDoc == null) return 0; 952 // check for Token in outputSetName 953 AnnotationSet tokens = annotDoc.getAnnotations(outputSetName).get("Token"); 954 if (tokens == null) return 0; 955 956 Iterator it = tokens.iterator(); 957 Annotation currAnnotation; 958 while (it.hasNext()) { 959 currAnnotation = (Annotation) it.next(); 960 Object feature = currAnnotation.getFeatures().get("kind"); 961 if(feature != null && "word".equalsIgnoreCase((String)feature)) ++count; 962 } // while 963 964 return count; 965 } 966 967 protected void evaluateAllThree(Document persDoc, 968 Document cleanDoc, Document markedDoc, 969 File errDir) 970 throws ResourceInstantiationException { 971 //first start the table and its header 972 printTableHeader(); 973 974 // store annotation diff in .err file 975 FileWriter errFileWriter = null; 976 if (isMoreInfoMode && errDir != null) { 977 StringBuffer docName = new StringBuffer(cleanDoc.getName()); 978 docName.replace( 979 cleanDoc.getName().lastIndexOf("."), 980 docName.length(), 981 ".err"); 982 File errFile = new File(errDir, docName.toString()); 983 try { 984 errFileWriter = new FileWriter(errFile, false); 985 } 986 catch (Exception ex) { 987 Out.prln("Exception when creating the error file " + errFile + ": " 988 + ex.getMessage()); 989 errFileWriter = null; 990 } 991 } 992 993 for (int jj= 0; jj< annotTypes.size(); jj++) { 994 String annotType = (String) annotTypes.get(jj); 995 996 AnnotationDiff annotDiff=measureDocs(markedDoc, cleanDoc, annotType); 997 //we don't have this annotation type in this document 998 if (annotDiff == null) 999 continue; 1000 1001 //increase the number of processed documents 1002 docNumber++; 1003 //add precison and recall to the sums 1004 updateStatistics(annotDiff, annotType); 1005 1006 AnnotationDiff annotDiff1 = 1007 measureDocs(markedDoc, persDoc, annotType); 1008 1009 Out.prln("<TR>"); 1010 1011 if(isMoreInfoMode && annotDiff1 != null 1012 && (annotDiff1.getPrecisionAverage() != annotDiff.getPrecisionAverage() 1013 || annotDiff1.getRecallAverage() != annotDiff.getRecallAverage()) 1014 ) 1015 Out.prln("<TD> " + annotType + "_new"+ "</TD>"); 1016 else 1017 Out.prln("<TD> " + annotType + "</TD>"); 1018 1019 if (isMoreInfoMode) { 1020 if(annotDiff1 != null) updateStatisticsProc(annotDiff1, annotType); 1021 1022 Out.prln("<TD>" + annotDiff.getCorrectCount() + "</TD>"); 1023 Out.prln("<TD>" + annotDiff.getPartiallyCorrectCount() + "</TD>"); 1024 Out.prln("<TD>" + annotDiff.getMissingCount() + "</TD>"); 1025 Out.prln("<TD>" + annotDiff.getSpuriousCount() + "</TD>"); 1026 } 1027 1028 Out.prln("<TD>"); 1029 1030 //check the precision first 1031 if (annotDiff1 != null) { 1032 1033 if (annotDiff1.getPrecisionAverage() 1034 < annotDiff.getPrecisionAverage()) { 1035 Out.prln("<P><Font color=blue> "); 1036 Out.prln(annotDiff.getPrecisionAverage()); 1037 1038 if(!isMoreInfoMode) { 1039 Out.pr("<BR>Precision increase on human-marked from "); 1040 Out.pr(annotDiff1.getPrecisionAverage() + " to "); 1041 Out.prln(annotDiff.getPrecisionAverage()); 1042 } 1043 Out.prln(" </Font></P>"); 1044 } 1045 else if (annotDiff1.getPrecisionAverage() 1046 > annotDiff.getPrecisionAverage()) { 1047 Out.prln("<P><Font color=red> "); 1048 Out.prln(annotDiff.getPrecisionAverage()); 1049 1050 if(!isMoreInfoMode) { 1051 Out.pr("<BR>Precision decrease on human-marked from "); 1052 Out.pr(annotDiff1.getPrecisionAverage() + " to "); 1053 Out.prln(annotDiff.getPrecisionAverage()); 1054 } 1055 Out.prln(" </Font></P>"); 1056 } 1057 else 1058 Out.prln("<P> " + annotDiff.getPrecisionAverage() + " </P>"); 1059 } 1060 else 1061 Out.prln("<P> " + annotDiff.getPrecisionAverage() + " </P>"); 1062 1063 Out.prln("</TD>"); 1064 1065 Out.prln("<TD>"); 1066 1067 //check the recall now 1068 if (annotDiff1 != null) { 1069 1070 if (annotDiff1.getRecallAverage() < annotDiff.getRecallAverage()) { 1071 Out.prln("<P><Font color=blue> "); 1072 Out.prln(annotDiff.getRecallAverage()); 1073 1074 if(!isMoreInfoMode) { 1075 Out.pr("<BR>Recall increase on human-marked from "); 1076 Out.pr(annotDiff1.getRecallAverage() + " to "); 1077 Out.prln(annotDiff.getRecallAverage()); 1078 } 1079 Out.prln(" </Font></P>"); 1080 } 1081 else if (annotDiff1.getRecallAverage() > annotDiff.getRecallAverage()) { 1082 Out.prln("<P><Font color=red> "); 1083 Out.prln(annotDiff.getRecallAverage()); 1084 1085 if(!isMoreInfoMode) { 1086 Out.pr("<BR>Recall decrease on human-marked from "); 1087 Out.pr(annotDiff1.getRecallAverage() + " to "); 1088 Out.prln(annotDiff.getRecallAverage()); 1089 } 1090 Out.prln(" </Font></P>"); 1091 } 1092 else 1093 Out.prln("<P> " + annotDiff.getRecallAverage() + " </P>"); 1094 } else 1095 Out.prln("<P> " + annotDiff.getRecallAverage() + " </P>"); 1096 1097 1098 Out.prln("</TD>"); 1099 1100 //check the recall now 1101 if ( isVerboseMode ) { 1102 Out.prln("<TD>"); 1103 if (annotDiff.getRecallAverage() < threshold) { 1104 printAnnotations(annotDiff, markedDoc, cleanDoc); 1105 } 1106 else { 1107 Out.prln(" "); 1108 } 1109 Out.prln("</TD>"); 1110 } 1111 1112 Out.prln("</TR>"); 1113 1114 // show one more table line for processed document 1115 if(isMoreInfoMode && annotDiff1 != null 1116 && (annotDiff1.getPrecisionAverage() != annotDiff.getPrecisionAverage() 1117 || annotDiff1.getRecallAverage() != annotDiff.getRecallAverage()) 1118 ) { 1119 1120 Out.prln("<TR>"); 1121 Out.prln("<TD> " + annotType + "_old" + "</TD>"); 1122 1123 Out.prln("<TD>" + annotDiff1.getCorrectCount() + "</TD>"); 1124 Out.prln("<TD>" + annotDiff1.getPartiallyCorrectCount() + "</TD>"); 1125 Out.prln("<TD>" + annotDiff1.getMissingCount() + "</TD>"); 1126 Out.prln("<TD>" + annotDiff1.getSpuriousCount() + "</TD>"); 1127 1128 Out.prln("<TD>"); 1129 if (annotDiff1.getPrecisionAverage() < annotDiff.getPrecisionAverage()) 1130 1131 Out.prln("<P><Font color=blue> " + annotDiff1.getPrecisionAverage() 1132 + "</Font></P>"); 1133 else if (annotDiff1.getPrecisionAverage() > annotDiff.getPrecisionAverage()) 1134 Out.prln( 1135 "<P><Font color=red> " + annotDiff1.getPrecisionAverage() 1136 + " </Font></P>"); 1137 else 1138 Out.prln(annotDiff1.getPrecisionAverage()); 1139 1140 Out.prln("</TD>"); 1141 1142 Out.prln("<TD>"); 1143 if (annotDiff1.getRecallAverage() < annotDiff.getRecallAverage()) 1144 Out.prln("<P><Font color=blue> " + annotDiff1.getRecallAverage() 1145 + " </Font></P>"); 1146 else if (annotDiff1.getRecallAverage() > annotDiff.getRecallAverage()) 1147 Out.prln("<P><Font color=red> " + annotDiff1.getRecallAverage() 1148 + " </Font></P>"); 1149 else 1150 Out.prln(annotDiff1.getRecallAverage()); 1151 1152 Out.prln("</TD>"); 1153 1154 //check the recall now 1155 if ( isVerboseMode ) { 1156 // create error file and start writing 1157 1158 Out.prln("<TD>"); 1159 if (annotDiff.getRecallAverage() < threshold) { 1160 printAnnotations(annotDiff, markedDoc, cleanDoc); 1161 } 1162 else { 1163 Out.prln(" "); 1164 } 1165 Out.prln("</TD>"); 1166 } 1167 Out.prln("</TR>"); 1168 } // if(isMoreInfoMode && annotDiff1 != null) 1169 1170 if (isMoreInfoMode && errDir != null) 1171 storeAnnotations(annotType, annotDiff, markedDoc, cleanDoc, errFileWriter); 1172 }//for loop through annotation types 1173 Out.prln("</TABLE>"); 1174 1175 try { 1176 if(errFileWriter != null) 1177 errFileWriter.close(); 1178 } 1179 catch (Exception ex) { 1180 Out.prln("Exception on close of error file " + errFileWriter + ": " 1181 + ex.getMessage()); 1182 } 1183 }//evaluateAllThree 1184 1185 protected void evaluateTwoDocs(Document keyDoc, Document respDoc, 1186 File errDir) 1187 throws ResourceInstantiationException { 1188 1189 //first start the table and its header 1190 printTableHeader(); 1191 1192 // store annotation diff in .err file 1193 FileWriter errFileWriter = null; 1194 if (isMoreInfoMode && errDir != null) { 1195 StringBuffer docName = new StringBuffer(keyDoc.getName()); 1196 docName.replace( 1197 keyDoc.getName().lastIndexOf("."), 1198 docName.length(), 1199 ".err"); 1200 File errFile = new File(errDir, docName.toString()); 1201 try { 1202 errFileWriter = new FileWriter(errFile, false); 1203 } 1204 catch (Exception ex) { 1205 Out.prln("Exception when creating the error file " + errFile + ": " 1206 + ex.getMessage()); 1207 errFileWriter = null; 1208 } 1209 } 1210 1211 for (int jj= 0; jj< annotTypes.size(); jj++) { 1212 String annotType = (String) annotTypes.get(jj); 1213 1214 AnnotationDiff annotDiff=measureDocs(keyDoc, respDoc, annotType); 1215 //we don't have this annotation type in this document 1216 if (annotDiff == null) 1217 continue; 1218 1219 //increase the number of processed documents 1220 docNumber++; 1221 //add precison and recall to the sums 1222 updateStatistics(annotDiff, annotType); 1223 1224 Out.prln("<TR>"); 1225 Out.prln("<TD>" + annotType + "</TD>"); 1226 1227 if(isMoreInfoMode) { 1228 Out.prln("<TD>" + annotDiff.getCorrectCount() + "</TD>"); 1229 Out.prln("<TD>" + annotDiff.getPartiallyCorrectCount() + "</TD>"); 1230 Out.prln("<TD>" + annotDiff.getMissingCount() + "</TD>"); 1231 Out.prln("<TD>" + annotDiff.getSpuriousCount() + "</TD>"); 1232 } 1233 1234 Out.prln("<TD>" + annotDiff.getPrecisionAverage() + "</TD>"); 1235 Out.prln("<TD>" + annotDiff.getRecallAverage() + "</TD>"); 1236 //check the recall now 1237 if ( isVerboseMode ) { 1238 Out.prln("<TD>"); 1239 if (annotDiff.getRecallAverage() < threshold) { 1240 printAnnotations(annotDiff, keyDoc, respDoc); 1241 } 1242 else { 1243 Out.prln(" "); 1244 } 1245 Out.prln("</TD>"); 1246 } 1247 Out.prln("</TR>"); 1248 1249 if (isMoreInfoMode && errDir != null) 1250 storeAnnotations(annotType, annotDiff, keyDoc, respDoc, errFileWriter); 1251 }//for loop through annotation types 1252 Out.prln("</TABLE>"); 1253 1254 try { 1255 if(errFileWriter != null) 1256 errFileWriter.close(); 1257 } 1258 catch (Exception ex) { 1259 Out.prln("Exception on close of error file " + errFileWriter + ": " 1260 + ex.getMessage()); 1261 } 1262 }//evaluateTwoDocs 1263 1264 protected void printTableHeader() { 1265 Out.prln("<TABLE BORDER=1"); 1266 Out.pr("<TR> <TD><B>Annotation Type</B></TD> "); 1267 1268 if(isMoreInfoMode) 1269 Out.pr("<TD><B>Correct</B></TD> <TD><B>Partially Correct</B></TD> " 1270 + "<TD><B>Missing</B></TD> <TD><B>Spurious<B></TD>"); 1271 1272 Out.pr("<TD><B>Precision</B></TD> <TD><B>Recall</B></TD>"); 1273 1274 if (isVerboseMode) 1275 Out.pr("<TD><B>Annotations</B></TD>"); 1276 1277 Out.prln("</TR>"); 1278 } 1279 1280 protected void updateStatistics(AnnotationDiff annotDiff, String annotType){ 1281 precisionSum += annotDiff.getPrecisionAverage(); 1282 recallSum += annotDiff.getRecallAverage(); 1283 fMeasureSum += annotDiff.getFMeasureAverage(); 1284 Double oldPrecision = (Double) precisionByType.get(annotType); 1285 if (oldPrecision == null) 1286 precisionByType.put(annotType, 1287 new Double(annotDiff.getPrecisionAverage())); 1288 else 1289 precisionByType.put(annotType, 1290 new Double(oldPrecision.doubleValue() + 1291 annotDiff.getPrecisionAverage())); 1292 Integer precCount = (Integer) prCountByType.get(annotType); 1293 if (precCount == null) 1294 prCountByType.put(annotType, new Integer(1)); 1295 else 1296 prCountByType.put(annotType, new Integer(precCount.intValue() + 1)); 1297 1298 1299 Double oldFMeasure = (Double) fMeasureByType.get(annotType); 1300 if (oldFMeasure == null) 1301 fMeasureByType.put(annotType, 1302 new Double(annotDiff.getFMeasureAverage())); 1303 else 1304 fMeasureByType.put(annotType, 1305 new Double(oldFMeasure.doubleValue() + 1306 annotDiff.getFMeasureAverage())); 1307 Integer fCount = (Integer) fMeasureCountByType.get(annotType); 1308 if (fCount == null) 1309 fMeasureCountByType.put(annotType, new Integer(1)); 1310 else 1311 fMeasureCountByType.put(annotType, new Integer(fCount.intValue() + 1)); 1312 1313 Double oldRecall = (Double) recallByType.get(annotType); 1314 if (oldRecall == null) 1315 recallByType.put(annotType, 1316 new Double(annotDiff.getRecallAverage())); 1317 else 1318 recallByType.put(annotType, 1319 new Double(oldRecall.doubleValue() + 1320 annotDiff.getRecallAverage())); 1321 Integer recCount = (Integer) recCountByType.get(annotType); 1322 if (recCount == null) 1323 recCountByType.put(annotType, new Integer(1)); 1324 else 1325 recCountByType.put(annotType, new Integer(recCount.intValue() + 1)); 1326 1327 //Update the missing, spurious, correct, and partial counts 1328 Long oldMissingNo = (Long) missingByType.get(annotType); 1329 if (oldMissingNo == null) 1330 missingByType.put(annotType, new Long(annotDiff.getMissingCount())); 1331 else 1332 missingByType.put(annotType, 1333 new Long(oldMissingNo.longValue() + 1334 annotDiff.getMissingCount())); 1335 1336 Long oldCorrectNo = (Long) correctByType.get(annotType); 1337 if (oldCorrectNo == null) 1338 correctByType.put(annotType, new Long(annotDiff.getCorrectCount())); 1339 else 1340 correctByType.put(annotType, 1341 new Long(oldCorrectNo.longValue() + 1342 annotDiff.getCorrectCount())); 1343 1344 Long oldPartialNo = (Long) partialByType.get(annotType); 1345 if (oldPartialNo == null) 1346 partialByType.put(annotType, new Long(annotDiff.getPartiallyCorrectCount())); 1347 else 1348 partialByType.put(annotType, 1349 new Long(oldPartialNo.longValue() + 1350 annotDiff.getPartiallyCorrectCount())); 1351 1352 Long oldSpuriousNo = (Long) spurByType.get(annotType); 1353 if (oldSpuriousNo == null) 1354 spurByType.put(annotType, new Long(annotDiff.getSpuriousCount())); 1355 else 1356 spurByType.put(annotType, 1357 new Long(oldSpuriousNo.longValue() + 1358 annotDiff.getSpuriousCount())); 1359 } 1360 1361 /** 1362 * Update statistics for processed documents 1363 * The same procedure as updateStatistics with different hashTables 1364 */ 1365 protected void updateStatisticsProc(AnnotationDiff annotDiff, String annotType){ 1366 hasProcessed = true; 1367 proc_precisionSum += annotDiff.getPrecisionAverage(); 1368 proc_recallSum += annotDiff.getRecallAverage(); 1369 proc_fMeasureSum += annotDiff.getFMeasureAverage(); 1370 Double oldPrecision = (Double) proc_precisionByType.get(annotType); 1371 if (oldPrecision == null) 1372 proc_precisionByType.put(annotType, 1373 new Double(annotDiff.getPrecisionAverage())); 1374 else 1375 proc_precisionByType.put(annotType, 1376 new Double(oldPrecision.doubleValue() + 1377 annotDiff.getPrecisionAverage())); 1378 Integer precCount = (Integer) proc_prCountByType.get(annotType); 1379 if (precCount == null) 1380 proc_prCountByType.put(annotType, new Integer(1)); 1381 else 1382 proc_prCountByType.put(annotType, new Integer(precCount.intValue() + 1)); 1383 1384 1385 Double oldFMeasure = (Double) proc_fMeasureByType.get(annotType); 1386 if (oldFMeasure == null) 1387 proc_fMeasureByType.put(annotType, 1388 new Double(annotDiff.getFMeasureAverage())); 1389 else 1390 proc_fMeasureByType.put(annotType, 1391 new Double(oldFMeasure.doubleValue() + 1392 annotDiff.getFMeasureAverage())); 1393 Integer fCount = (Integer) proc_fMeasureCountByType.get(annotType); 1394 if (fCount == null) 1395 proc_fMeasureCountByType.put(annotType, new Integer(1)); 1396 else 1397 proc_fMeasureCountByType.put(annotType, new Integer(fCount.intValue() + 1)); 1398 1399 Double oldRecall = (Double) proc_recallByType.get(annotType); 1400 if (oldRecall == null) 1401 proc_recallByType.put(annotType, 1402 new Double(annotDiff.getRecallAverage())); 1403 else 1404 proc_recallByType.put(annotType, 1405 new Double(oldRecall.doubleValue() + 1406 annotDiff.getRecallAverage())); 1407 Integer recCount = (Integer) proc_recCountByType.get(annotType); 1408 if (recCount == null) 1409 proc_recCountByType.put(annotType, new Integer(1)); 1410 else 1411 proc_recCountByType.put(annotType, new Integer(recCount.intValue() + 1)); 1412 1413 //Update the missing, spurious, correct, and partial counts 1414 Long oldMissingNo = (Long) proc_missingByType.get(annotType); 1415 if (oldMissingNo == null) 1416 proc_missingByType.put(annotType, new Long(annotDiff.getMissingCount())); 1417 else 1418 proc_missingByType.put(annotType, 1419 new Long(oldMissingNo.longValue() + 1420 annotDiff.getMissingCount())); 1421 1422 Long oldCorrectNo = (Long) proc_correctByType.get(annotType); 1423 if (oldCorrectNo == null) 1424 proc_correctByType.put(annotType, new Long(annotDiff.getCorrectCount())); 1425 else 1426 proc_correctByType.put(annotType, 1427 new Long(oldCorrectNo.longValue() + 1428 annotDiff.getCorrectCount())); 1429 1430 Long oldPartialNo = (Long) proc_partialByType.get(annotType); 1431 if (oldPartialNo == null) 1432 proc_partialByType.put(annotType, new Long(annotDiff.getPartiallyCorrectCount())); 1433 else 1434 proc_partialByType.put(annotType, 1435 new Long(oldPartialNo.longValue() + 1436 annotDiff.getPartiallyCorrectCount())); 1437 1438 Long oldSpuriousNo = (Long) proc_spurByType.get(annotType); 1439 if (oldSpuriousNo == null) 1440 proc_spurByType.put(annotType, new Long(annotDiff.getSpuriousCount())); 1441 else 1442 proc_spurByType.put(annotType, 1443 new Long(oldSpuriousNo.longValue() + 1444 annotDiff.getSpuriousCount())); 1445 } 1446 1447 public void printStatistics() { 1448 1449 Out.prln("<H2> Statistics </H2>"); 1450 1451/* 1452 Out.prln("<H3> Precision </H3>"); 1453 if (precisionByType != null && !precisionByType.isEmpty()) { 1454 Iterator iter = precisionByType.keySet().iterator(); 1455 while (iter.hasNext()) { 1456 String annotType = (String) iter.next(); 1457 Out.prln(annotType + ": " 1458 + ((Double)precisionByType.get(annotType)).doubleValue() 1459 / 1460 ((Integer)prCountByType.get(annotType)).intValue() 1461 + "<P>"); 1462 }//while 1463 } 1464 Out.prln("Overall precision: " + getPrecisionAverage() + "<P>"); 1465 1466 Out.prln("<H3> Recall </H3>"); 1467 if (recallByType != null && !recallByType.isEmpty()) { 1468 Iterator iter = recallByType.keySet().iterator(); 1469 while (iter.hasNext()) { 1470 String annotType = (String) iter.next(); 1471 Out.prln(annotType + ": " 1472 + ((Double)recallByType.get(annotType)).doubleValue() 1473 / 1474 ((Integer)recCountByType.get(annotType)).intValue() 1475 + "<P>"); 1476 }//while 1477 } 1478 1479 Out.prln("Overall recall: " + getRecallAverage() 1480 + "<P>"); 1481*/ 1482 if (annotTypes == null) { 1483 Out.prln("No types given for evaluation, cannot obtain precision/recall"); 1484 return; 1485 } 1486 Out.prln("<table border=1>"); 1487 Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Correct</B></TD>" + 1488 "<TD><B>Partially Correct</B></TD> <TD><B>Missing</B></TD>" + 1489 "<TD><B>Spurious</B></TD> <TD><B>Precision</B></TD>" + 1490 "<TD><B>Recall</B></TD> <TD><B>F-Measure</B></TD> </TR>"); 1491 String annotType; 1492 for (int i = 0; i < annotTypes.size(); i++) { 1493 annotType = (String) annotTypes.get(i); 1494 printStatsForType(annotType); 1495 }//for 1496 Out.prln("</table>"); 1497 } // updateStatisticsProc 1498 1499 protected void printStatsForType(String annotType){ 1500 long correct = (correctByType.get(annotType) == null)? 0 : 1501 ((Long)correctByType.get(annotType)).longValue(); 1502 long partial = (partialByType.get(annotType) == null)? 0 : 1503 ((Long)partialByType.get(annotType)).longValue(); 1504 long spurious = (spurByType.get(annotType) == null)? 0 : 1505 ((Long)spurByType.get(annotType)).longValue(); 1506 long missing = (missingByType.get(annotType) == null)? 0: 1507 ((Long)missingByType.get(annotType)).longValue(); 1508 long actual = correct + partial + spurious; 1509 long possible = correct + partial + missing; 1510 //precision strict is correct/actual 1511 //precision is (correct + 0.5 * partially correct)/actual 1512 double precision = (correct + 0.5 * partial) / actual; 1513 //recall strict is correct/possible 1514 double recall = (correct + 0.5*partial)/possible; 1515 //F-measure = ( (beta*beta + 1)*P*R ) / ((beta*beta*P) + R) 1516 double fmeasure = 1517 ((beta*beta + 1)*precision*recall) 1518 / 1519 ((beta*beta*precision) + recall); 1520 1521 long proc_correct=0; 1522 long proc_partial=0; 1523 long proc_spurious=0; 1524 long proc_missing=0; 1525 long proc_actual=0; 1526 long proc_possible=0; 1527 double proc_precision=0; 1528 double proc_recall=0; 1529 double proc_fmeasure=0; 1530 1531 if(hasProcessed) { 1532 // calculate values for processed 1533 proc_correct = (proc_correctByType.get(annotType) == null)? 0 : 1534 ((Long)proc_correctByType.get(annotType)).longValue(); 1535 proc_partial = (proc_partialByType.get(annotType) == null)? 0 : 1536 ((Long)proc_partialByType.get(annotType)).longValue(); 1537 proc_spurious = (proc_spurByType.get(annotType) == null)? 0 : 1538 ((Long)proc_spurByType.get(annotType)).longValue(); 1539 proc_missing = (proc_missingByType.get(annotType) == null)? 0: 1540 ((Long)proc_missingByType.get(annotType)).longValue(); 1541 proc_actual = proc_correct + proc_partial + proc_spurious; 1542 proc_possible = proc_correct + proc_partial + proc_missing; 1543 //precision strict is correct/actual 1544 //precision is (correct + 0.5 * partially correct)/actual 1545 proc_precision = (proc_correct + 0.5*proc_partial)/proc_actual; 1546 //recall strict is correct/possible 1547 proc_recall = (proc_correct + 0.5*proc_partial)/proc_possible; 1548 //F-measure = ( (beta*beta + 1)*P*R ) / ((beta*beta*P) + R) 1549 proc_fmeasure = 1550 ((beta*beta + 1)*proc_precision*proc_recall) 1551 / 1552 ((beta*beta*proc_precision) + proc_recall); 1553 } 1554 1555 // output data 1556 Out.prln("<TR>"); 1557 if(hasProcessed) 1558 Out.prln("<TD>" + annotType+ "_new" + "</TD>"); 1559 else 1560 Out.prln("<TD>" + annotType + "</TD>"); 1561 1562 Out.prln("<TD>" + correct + "</TD>"); 1563 Out.prln("<TD>" + partial + "</TD>"); 1564 Out.prln("<TD>" + missing + "</TD>"); 1565 Out.prln("<TD>" + spurious + "</TD>"); 1566 1567 if(hasProcessed && (precision < proc_precision)) 1568 Out.prln("<TD><Font color=red>" + precision + "</TD>"); 1569 else if(hasProcessed && (precision > proc_precision)) 1570 Out.prln("<TD><Font color=blue>" + precision + "</TD>"); 1571 else 1572 Out.prln("<TD>" + precision + "</TD>"); 1573 if(hasProcessed && (recall < proc_recall)) 1574 Out.prln("<TD><Font color=red>" + recall + "</TD>"); 1575 else if(hasProcessed && (recall > proc_recall)) 1576 Out.prln("<TD><Font color=blue>" + recall + "</TD>"); 1577 else 1578 Out.prln("<TD>" + recall + "</TD>"); 1579 Out.prln("<TD>" + fmeasure + "</TD>"); 1580 Out.prln("</TR>"); 1581 1582 if(hasProcessed) { 1583 // output data 1584 Out.prln("<TR>"); 1585 Out.prln("<TD>" + annotType + "_old" + "</TD>"); 1586 1587 Out.prln("<TD>" + proc_correct + "</TD>"); 1588 Out.prln("<TD>" + proc_partial + "</TD>"); 1589 Out.prln("<TD>" + proc_missing + "</TD>"); 1590 Out.prln("<TD>" + proc_spurious + "</TD>"); 1591 1592 if(precision < proc_precision) 1593 Out.prln("<TD><Font color=red>" + proc_precision + "</TD>"); 1594 else if(precision > proc_precision) 1595 Out.prln("<TD><Font color=blue>" + proc_precision + "</TD>"); 1596 else 1597 Out.prln("<TD>" + proc_precision + "</TD>"); 1598 if(recall < proc_recall) 1599 Out.prln("<TD><Font color=red>" + proc_recall + "</TD>"); 1600 else if(recall > proc_recall) 1601 Out.prln("<TD><Font color=blue>" + proc_recall + "</TD>"); 1602 else 1603 Out.prln("<TD>" + proc_recall + "</TD>"); 1604 Out.prln("<TD>" + proc_fmeasure + "</TD>"); 1605 Out.prln("</TR>"); 1606 } 1607 }//printStatsForType 1608 1609 protected AnnotationDiff measureDocs( 1610 Document keyDoc, Document respDoc, String annotType) 1611 throws ResourceInstantiationException { 1612 1613 if (keyDoc == null || respDoc == null) 1614 return null; 1615 1616 if (annotSetName != null 1617 && keyDoc.getAnnotations(annotSetName).get(annotType) == null) 1618 return null; 1619 else if ((annotSetName == null || annotSetName.equals("")) 1620 && keyDoc.getAnnotations().get(annotType) == null) 1621 return null; 1622 1623 // create the annotation schema needed for AnnotationDiff 1624 AnnotationSchema annotationSchema = new AnnotationSchema(); 1625 1626 // set annotation type 1627 annotationSchema.setAnnotationName(annotType); 1628 // create an annotation diff 1629 AnnotationDiff annotDiff = new AnnotationDiff(); 1630 annotDiff.setTextMode(new Boolean(true)); 1631 annotDiff.setAnnotationSchema(annotationSchema); 1632 annotDiff.setKeyDocument(keyDoc); 1633 annotDiff.setResponseDocument(respDoc); 1634 annotDiff.setKeyAnnotationSetName(annotSetName); 1635 annotDiff.setResponseAnnotationSetName(outputSetName); 1636 // set feature names set for annotation diff 1637 annotDiff.setKeyFeatureNamesSet(diffFeaturesSet); 1638 annotDiff.init(); 1639 1640 return annotDiff; 1641 } // measureDocs 1642 1643 protected void storeAnnotations(String type, AnnotationDiff annotDiff, 1644 Document keyDoc, Document respDoc, FileWriter errFileWriter) { 1645 if(errFileWriter == null) return; // exit on "no file" 1646 1647 try { 1648 // extract and store annotations 1649 Comparator comp = new OffsetComparator(); 1650 TreeSet sortedSet = new TreeSet(comp); 1651 Set missingSet = 1652 annotDiff.getAnnotationsOfType(AnnotationDiff.MISSING_TYPE); 1653 sortedSet.clear(); 1654 sortedSet.addAll(missingSet); 1655 storeAnnotations(type+".miss", sortedSet, keyDoc, errFileWriter); 1656 Set spuriousSet = 1657 annotDiff.getAnnotationsOfType(AnnotationDiff.SPURIOUS_TYPE); 1658 sortedSet.clear(); 1659 sortedSet.addAll(spuriousSet); 1660 storeAnnotations(type+".spur", sortedSet, respDoc, errFileWriter); 1661 Set partialSet = 1662 annotDiff.getAnnotationsOfType(AnnotationDiff.PARTIALLY_CORRECT_TYPE); 1663 sortedSet.clear(); 1664 sortedSet.addAll(partialSet); 1665 storeAnnotations(type+".part", sortedSet, respDoc, errFileWriter); 1666 } catch (Exception ex) { 1667 Out.prln("Exception on close of error file "+errFileWriter+": " 1668 +ex.getMessage()); 1669 } 1670 }// storeAnnotations 1671 1672 protected void storeAnnotations(String type, Set set, Document doc, 1673 FileWriter file) throws IOException{ 1674 1675 if (set == null || set.isEmpty()) 1676 return; 1677 1678 Iterator iter = set.iterator(); 1679 Annotation ann; 1680 while (iter.hasNext()) { 1681 ann = (Annotation) iter.next(); 1682 file.write(type); 1683 file.write("."); 1684 file.write(doc.getContent().toString().substring( 1685 ann.getStartNode().getOffset().intValue(), 1686 ann.getEndNode().getOffset().intValue())); 1687 file.write("."); 1688 file.write(ann.getStartNode().getOffset().toString()); 1689 file.write("."); 1690 file.write(ann.getEndNode().getOffset().toString()); 1691 file.write("\n"); 1692 }//while 1693 }// storeAnnotations 1694 1695 protected void printAnnotations(AnnotationDiff annotDiff, 1696 Document keyDoc, Document respDoc) { 1697 Out.pr("MISSING ANNOTATIONS in the automatic texts: "); 1698 Set missingSet = 1699 annotDiff.getAnnotationsOfType(AnnotationDiff.MISSING_TYPE); 1700 printAnnotations(missingSet, keyDoc); 1701 Out.prln("<BR>"); 1702 1703 Out.pr("SPURIOUS ANNOTATIONS in the automatic texts: "); 1704 Set spuriousSet = 1705 annotDiff.getAnnotationsOfType(AnnotationDiff.SPURIOUS_TYPE); 1706 printAnnotations(spuriousSet, respDoc); 1707 Out.prln("</BR>"); 1708 1709 Out.pr("PARTIALLY CORRECT ANNOTATIONS in the automatic texts: "); 1710 Set partialSet = 1711 annotDiff.getAnnotationsOfType(AnnotationDiff.PARTIALLY_CORRECT_TYPE); 1712 printAnnotations(partialSet, respDoc); 1713 } 1714 1715 protected void printAnnotations(Set set, Document doc) { 1716 if (set == null || set.isEmpty()) 1717 return; 1718 1719 Iterator iter = set.iterator(); 1720 while (iter.hasNext()) { 1721 Annotation ann = (Annotation) iter.next(); 1722 Out.prln( 1723 "<B>" + 1724 doc.getContent().toString().substring( 1725 ann.getStartNode().getOffset().intValue(), 1726 ann.getEndNode().getOffset().intValue()) + 1727 "</B>: <I>[" + ann.getStartNode().getOffset() + 1728 "," + ann.getEndNode().getOffset() + "]</I>" 1729// + "; features" + ann.getFeatures() 1730 ); 1731 }//while 1732 }//printAnnotations 1733 1734 /** 1735 * The directory from which we should generate/evaluate the corpus 1736 */ 1737 private File startDir; 1738 private File currDir; 1739 private static List annotTypes; 1740 1741 private Controller application = null; 1742 private File applicationFile = null; 1743 1744 //collect the sum of all precisions and recalls of all docs 1745 //and the number of docs, so I can calculate the average for 1746 //the corpus at the end 1747 private double precisionSum = 0; 1748 private double recallSum = 0; 1749 private double fMeasureSum = 0; 1750 private HashMap precisionByType = new HashMap(); 1751 private HashMap prCountByType = new HashMap(); 1752 private HashMap recallByType = new HashMap(); 1753 private HashMap recCountByType = new HashMap(); 1754 private HashMap fMeasureByType = new HashMap(); 1755 private HashMap fMeasureCountByType = new HashMap(); 1756 1757 private HashMap missingByType = new HashMap(); 1758 private HashMap spurByType = new HashMap(); 1759 private HashMap correctByType = new HashMap(); 1760 private HashMap partialByType = new HashMap(); 1761 1762 // statistic for processed 1763 static boolean hasProcessed = false; 1764 private double proc_precisionSum = 0; 1765 private double proc_recallSum = 0; 1766 private double proc_fMeasureSum = 0; 1767 private HashMap proc_precisionByType = new HashMap(); 1768 private HashMap proc_prCountByType = new HashMap(); 1769 private HashMap proc_recallByType = new HashMap(); 1770 private HashMap proc_recCountByType = new HashMap(); 1771 private HashMap proc_fMeasureByType = new HashMap(); 1772 private HashMap proc_fMeasureCountByType = new HashMap(); 1773 1774 private HashMap proc_missingByType = new HashMap(); 1775 private HashMap proc_spurByType = new HashMap(); 1776 private HashMap proc_correctByType = new HashMap(); 1777 private HashMap proc_partialByType = new HashMap(); 1778 1779 double beta = 1; 1780 1781 private int docNumber = 0; 1782 1783 /** 1784 * If true, the corpus tool will generate the corpus, otherwise it'll 1785 * run in evaluate mode 1786 */ 1787 private boolean isGenerateMode = false; 1788 1789 /** 1790 * If true - show annotations for docs below threshold 1791 */ 1792 private boolean isVerboseMode = false; 1793 1794 /** 1795 * If true - show more info in document table 1796 */ 1797 private boolean isMoreInfoMode = false; 1798 1799 /** 1800 * The list of features used in the AnnotationDiff separated by comma 1801 * Example: "class;inst" 1802 */ 1803 private Set diffFeaturesSet; 1804 1805 /** 1806 * If true, the corpus tool will evaluate stored against the human-marked 1807 * documents 1808 */ 1809 private boolean isMarkedStored = false; 1810 private boolean isMarkedClean = false; 1811 //whether marked are in a DS, not xml 1812 private boolean isMarkedDS = false; 1813 1814 private String annotSetName = "Key"; 1815 private String outputSetName = null; 1816 1817 private double threshold = 0.5; 1818 private Properties configs = new Properties(); 1819 private static int corpusWordCount = 0; 1820 1821 private String documentEncoding = ""; 1822 1823 /** String to print when wrong command-line args */ 1824 private static String usage = 1825 "usage: CorpusBenchmarkTool [-generate|-marked_stored|-marked_clean] " 1826 +"[-verbose] [-moreinfo] directory-name application"; 1827 1828}
|
CorpusBenchmarkTool |
|