1
15
16 package gate.util;
17
18 import java.io.*;
19 import java.util.*;
20
21 import gate.*;
22 import gate.util.AnnotationDiffer;
23 import gate.creole.*;
24 import gate.persist.PersistenceException;
25 import gate.persist.SerialDataStore;
26
27 public class CorpusBenchmarkTool {
/** Sub-directory name that execute(File) treats as the human-annotated ("marked") documents directory. */
private static final String MARKED_DIR_NAME = "marked";
/** Sub-directory name that execute(File) treats as the clean (unprocessed) documents directory. */
private static final String CLEAN_DIR_NAME = "clean";
/** Sub-directory name that execute(File) skips entirely while scanning a directory. */
private static final String CVS_DIR_NAME = "Cvs";
/** Sub-directory name used for the datastore of processed documents. */
private static final String PROCESSED_DIR_NAME = "processed";
/** Sub-directory name used for the per-document error files written in more-info mode. */
private static final String ERROR_DIR_NAME = "err";

/** Debug flag; no use of it is visible in this part of the file. */
private static final boolean DEBUG = true;
35
36 public CorpusBenchmarkTool() {}
37
38 public void initPRs() {
39 try {
40 if (applicationFile == null)
41 Out.prln("Application not set!");
42 Out.prln("App file is: " + applicationFile.getAbsolutePath());
43 application = (Controller) gate.util.persistence.PersistenceManager
44 .loadObjectFromFile(applicationFile);
45 } catch (Exception ex) {
46 throw new GateRuntimeException("Corpus Benchmark Tool:"+ex.getMessage());
47 }
48 }
50 public void unloadPRs() {
51 if (isMarkedStored)
53 return;
54
55 }
56
57 public void execute() {
58
63 execute(startDir);
64
71 }
72
73 public void init() {
74 File propFile = new File("corpus_tool.properties");
76 Out.prln(propFile.getAbsolutePath());
77 if (propFile.exists()) {
78 try {
79 InputStream inputStream = new FileInputStream(propFile);
80 this.configs.load(inputStream);
81 String thresholdString = this.configs.getProperty("threshold");
82 if (thresholdString != null && !thresholdString.equals("")) {
83 this.threshold = (new Double(thresholdString)).doubleValue();
84 Out.prln("New threshold is: " + this.threshold + "<P>\n");
85 }
86 String setName = this.configs.getProperty("annotSetName");
87 if (setName != null && !setName.equals("")) {
88 Out.prln("Annotation set in marked docs is: " + setName + " <P>\n");
89 this.annotSetName = setName;
90 }
91 setName = this.configs.getProperty("outputSetName");
92 if (setName != null && !setName.equals("")) {
93 Out.prln("Annotation set in processed docs is: " + setName + " <P>\n");
94 this.outputSetName = setName;
95 }
96 String encodingString = this.configs.getProperty("encoding");
97 if (encodingString != null && !encodingString.equals("")) {
98 this.documentEncoding = encodingString;
99 Out.prln("New encoding is: " + this.documentEncoding + "<P>\n");
100 }
101 String types = this.configs.getProperty("annotTypes");
102 if (types != null && !types.equals("")) {
103 Out.prln("Using annotation types from the properties file. <P>\n");
104 StringTokenizer strTok = new StringTokenizer(types, ";");
105 annotTypes = new ArrayList();
106 while (strTok.hasMoreTokens())
107 annotTypes.add(strTok.nextToken());
108 } else {
109 annotTypes = new ArrayList();
110 annotTypes.add("Organization");
111 annotTypes.add("Person");
112 annotTypes.add("Date");
113 annotTypes.add("Location");
114 annotTypes.add("Address");
115 annotTypes.add("Money");
116 annotTypes.add("Percent");
117 annotTypes.add("GPE");
118 annotTypes.add("Facility");
119 }
120 String features = this.configs.getProperty("annotFeatures");
121 HashSet result = new HashSet();
122 if (features != null && !features.equals("")) {
123 Out.pr("Using annotation features from the properties file. \n");
124 java.util.StringTokenizer tok =
125 new java.util.StringTokenizer(features, ";");
126 String current;
127 while(tok.hasMoreTokens()) {
128 current = tok.nextToken();
129 result.add(current);
130 } }
132 diffFeaturesSet = result;
133 Out.prln("Features: "+diffFeaturesSet+" <P>\n");
134
135 } catch (IOException ex) {
136 this.configs = new Properties();
138 }
139 } else
140 this.configs = new Properties();
141
142
143 if (!this.isMarkedStored)
146 initPRs();
147
148 }
149
150 public void execute(File dir) {
151 if (dir == null)
152 return;
153 currDir = dir;
155
156 File processedDir = null;
157 File cleanDir = null;
158 File markedDir = null;
159 File errorDir = null;
160
161 ArrayList subDirs = new ArrayList();
162 File[] dirArray = currDir.listFiles();
163 if(dirArray == null) return;
164 for (int i = 0; i < dirArray.length; i++) {
165 if (dirArray[i].isFile() || dirArray[i].getName().equals(CVS_DIR_NAME))
166 continue;
167 if (dirArray[i].getName().equals(CLEAN_DIR_NAME))
168 cleanDir = dirArray[i];
169 else if (dirArray[i].getName().equals(MARKED_DIR_NAME))
170 markedDir = dirArray[i];
171 else if (dirArray[i].getName().equals(PROCESSED_DIR_NAME))
172 processedDir = dirArray[i];
173 else if (dirArray[i].getName().equals(ERROR_DIR_NAME))
174 errorDir = dirArray[i];
175 else
176 subDirs.add(dirArray[i]);
177 }
178
179 if(cleanDir == null) return;
180 Out.prln("Processing directory: " + currDir + "<P>");
181
182 if (this.isGenerateMode)
183 generateCorpus(cleanDir, processedDir);
184 else
185 evaluateCorpus(cleanDir, processedDir, markedDir, errorDir);
186
187 if (subDirs.isEmpty())
189 return;
190
191 for (int j = 0; j < subDirs.size(); j++)
193 execute((File) subDirs.get(j));
194
195 }
197
198 public static void main(String[] args) throws GateException {
199 Out.prln("<HTML>");
200 Out.prln("<HEAD>");
201 Out.prln("<TITLE> Corpus benchmark tool: ran with args ");
202 for(int argC=0; argC < args.length; ++argC)
203 Out.pr(args[argC]+" ");
204 Out.pr(" on " + new Date() + "</TITLE> </HEAD>");
205 Out.prln("<BODY>");
206 Out.prln("Please wait while GATE tools are initialised. <P>");
207 Gate.init();
209
210 CorpusBenchmarkTool corpusTool = new CorpusBenchmarkTool();
211
212 List inputFiles = null;
213 if(args.length < 1) throw new GateException(usage);
214 int i = 0;
215 while (i < args.length && args[i].startsWith("-")) {
216 if(args[i].equals("-generate")) {
217 Out.prln("Generating the corpus... <P>");
218 corpusTool.setGenerateMode(true);
219 } else if (args[i].equals("-marked_clean")) {
220 Out.prln("Evaluating current grammars against human-annotated...<P>");
221 corpusTool.setMarkedClean(true);
222 } else if (args[i].equals("-marked_stored")) {
223 Out.prln("Evaluating stored documents against human-annotated...<P>");
224 corpusTool.setMarkedStored(true);
225 } else if (args[i].equals("-marked_ds")) {
226 Out.prln("Looking for marked docs in a datastore...<P>");
227 corpusTool.setMarkedDS(true);
228 } else if (args[i].equals("-verbose")) {
229 Out.prln("Running in verbose mode. Will generate annotation " +
230 "information when precision/recall are lower than " +
231 corpusTool.getThreshold() +"<P>");
232 corpusTool.setVerboseMode(true);
233 } else if (args[i].equals("-moreinfo")) {
234 Out.prln("Show more details in document table...<P>");
235 corpusTool.setMoreInfo(true);
236 }
237 i++; }
240 String dirName = args[i];
241 File dir = new File(dirName);
242 if (!dir.isDirectory())
243 throw new GateException(usage);
244
245 i++;
247 String appName = args[i];
248 File appFile = new File(appName);
249 if (!appFile.isFile())
250 throw new GateException(usage);
251 else
252 corpusTool.setApplicationFile(appFile);
253
254 corpusTool.init();
255 corpusWordCount = 0;
256
257 Out.prln("Measuring annotaitions of types: " + CorpusBenchmarkTool.annotTypes + "<P>");
258
259 corpusTool.setStartDirectory(dir);
260 corpusTool.execute();
261 if (! corpusTool.getGenerateMode())
264 corpusTool.printStatistics();
265
266 Out.prln("<BR>Overall average precision: " + corpusTool.getPrecisionAverage());
267 Out.prln("<BR>Overall average recall: " + corpusTool.getRecallAverage());
268 if(corpusWordCount == 0)
269 Out.prln("<BR>No Token annotations to count words in the corpus.");
270 else
271 Out.prln("<BR>Overall word count: " + corpusWordCount);
272
273
274 if(hasProcessed) {
275 Out.prln("<P>Old Processed: ");
276 Out.prln("<BR>Overall average precision: "
277 + corpusTool.getPrecisionAverageProc());
278 Out.prln("<BR>Overall average recall: "
279 + corpusTool.getRecallAverageProc());
280 }
281 Out.prln("<BR>Finished! <P>");
282 Out.prln("</BODY>");
283 Out.prln("</HTML>");
284
285 System.exit(0);
286
287 }
289 public void setGenerateMode(boolean mode) {
290 isGenerateMode = mode;
291 }
293 public boolean getGenerateMode() {
294 return isGenerateMode;
295 }
297 public boolean getVerboseMode() {
298 return isVerboseMode;
299 }
301 public void setVerboseMode(boolean mode) {
302 isVerboseMode = mode;
303 }
305 public void setMoreInfo(boolean mode) {
306 isMoreInfoMode = mode;
307 }
309 public boolean getMoreInfo() {
310 return isMoreInfoMode;
311 }
313 public void setDiffFeaturesList(Set features) {
314 diffFeaturesSet = features;
315 }
317 public Set getDiffFeaturesList() {
318 return diffFeaturesSet;
319 }
321 public void setMarkedStored(boolean mode) {
322 isMarkedStored = mode;
323 }
325
326 public boolean getMarkedStored() {
327 return isMarkedStored;
328 }
330 public void setMarkedClean(boolean mode) {
331 isMarkedClean = mode;
332 }
334 public boolean getMarkedClean() {
335 return isMarkedClean;
336 }
338 public void setMarkedDS(boolean mode) {
339 isMarkedDS = mode;
340 }
342 public boolean getMarkedDS() {
343 return isMarkedDS;
344 }
346 public void setApplicationFile(File newAppFile) {
347 applicationFile = newAppFile;
348 }
349
350
360 public double getPrecisionAverage() {
361 return (double)precisionSum/docNumber;
362 }
363
364
374 public double getRecallAverage() {
375 return (double)recallSum/docNumber;
376 }
377
378
379 public double getPrecisionAverageProc() {
380 return (double)proc_precisionSum/docNumber;
381 }
382 public double getRecallAverageProc() {
383 return (double)proc_recallSum/docNumber;
384 }
385
386
387 public boolean isGenerateMode() {
388 return isGenerateMode == true;
389 }
391 public double getThreshold() {
392 return threshold;
393 }
394
395 public void setThreshold(double newValue) {
396 threshold = newValue;
397 }
398
399 public File getStartDirectory() {
400 return startDir;
401 }
403 public void setStartDirectory(File dir) {
404 startDir = dir;
405 }
407 protected void generateCorpus(File fileDir, File outputDir) {
408 if (fileDir == null)
410 return;
411 File outDir = outputDir;
413 if (outputDir == null) {
414 outDir = new File(currDir, PROCESSED_DIR_NAME);
415 } else {
416 if (!Files.rmdir(outDir))
418 Out.prln("cannot delete old output directory: " + outDir);
419 }
420 outDir.mkdir();
421
422 try {
424 SerialDataStore sds = new SerialDataStore(outDir.toURL().toString());
425 sds.create();
426 sds.open();
427
428 File[] files = fileDir.listFiles();
429 for (int i=0; i < files.length; i++) {
430 if (!files[i].isFile())
431 continue;
432 Out.prln("Processing and storing document: " + files[i].toURL() +"<P>");
434
435 FeatureMap params = Factory.newFeatureMap();
436 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, files[i].toURL());
437 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
438
439 FeatureMap features = Factory.newFeatureMap();
440
442 Document doc = (Document) Factory.createResource(
444 "gate.corpora.DocumentImpl", params, features
445 );
446
447 doc.setName(files[i].getName());
448 if (doc == null)
449 continue;
450 processDocument(doc);
451 LanguageResource lr = sds.adopt(doc, null);
452 sds.sync(lr);
453 Factory.deleteResource(doc);
454 Factory.deleteResource(lr);
455 } sds.close();
457 } catch (java.net.MalformedURLException ex) {
458 throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
459 } catch (PersistenceException ex1) {
460 throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
461 } catch (ResourceInstantiationException ex2) {
462 throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
463 } catch (gate.security.SecurityException ex3) {
464 throw new GateRuntimeException("CorpusBenchmark: " + ex3.getMessage());
465 }
466 }
468 protected void evaluateCorpus(File fileDir,
469 File processedDir, File markedDir,
470 File errorDir) {
471 if (fileDir == null || !fileDir.exists())
473 return;
474 if (processedDir == null || !processedDir.exists())
475 if (isMarkedStored) {
477 Out.prln("Cannot evaluate because no processed documents exist.");
478 return;
479 }
480 else
481 isMarkedClean = true;
482
483 File errDir = null;
485 if(isMoreInfoMode) {
486 errDir = errorDir;
487 if (errDir == null) {
488 errDir = new File(currDir, ERROR_DIR_NAME);
489 }
490 else {
491 if (!Files.rmdir(errDir))
493 Out.prln("cannot delete old error directory: " + errDir);
494 }
495 Out.prln("Create error directory: " + errDir + "<BR><BR>");
496 errDir.mkdir();
497 }
498
499 boolean processMarked = markedDir != null && markedDir.exists();
501 if (!processMarked && (isMarkedStored || isMarkedClean)) {
502 Out.prln("Cannot evaluate because no human-annotated documents exist.");
503 return;
504 }
505
506 if (isMarkedStored) {
507 evaluateMarkedStored(markedDir, processedDir, errDir);
508 return;
509 } else if (isMarkedClean) {
510 evaluateMarkedClean(markedDir, fileDir, errDir);
511 return;
512 }
513
514 Document persDoc = null;
515 Document cleanDoc = null;
516 Document markedDoc = null;
517
518 try {
520 DataStore sds = Factory.openDataStore
522 ("gate.persist.SerialDataStore",
523 processedDir.toURL().toExternalForm());
524
525 List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
526 for (int i=0; i < lrIDs.size(); i++) {
527 String docID = (String) lrIDs.get(i);
528
529 FeatureMap features = Factory.newFeatureMap();
531 features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
532 features.put(DataStore.LR_ID_FEATURE_NAME, docID);
533 FeatureMap hparams = Factory.newFeatureMap();
534
536 persDoc = (Document) Factory.createResource(
537 "gate.corpora.DocumentImpl",
538 features, hparams);
539
540
541 if(isMoreInfoMode) {
542 StringBuffer errName = new StringBuffer(persDoc.getName());
543 errName.replace(
544 persDoc.getName().lastIndexOf("."),
545 persDoc.getName().length(),
546 ".err");
547 Out.prln("<H2>" +
548 "<a href=\"err/" + errName.toString() + "\">"
549 + persDoc.getName() + "</a>" + "</H2>");
550 } else
551 Out.prln("<H2>" + persDoc.getName() + "</H2>");
552
553 File cleanDocFile = new File(fileDir, persDoc.getName());
554 if (! cleanDocFile.exists()) {
556 Out.prln("Warning: Cannot find original document " +
557 persDoc.getName() + " in " + fileDir);
558 } else {
559 FeatureMap params = Factory.newFeatureMap();
560 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocFile.toURL());
561 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
562
563 cleanDoc = (Document) Factory.createResource(
565 "gate.corpora.DocumentImpl", params, hparams);
566 cleanDoc.setName(persDoc.getName());
567 }
568
569 StringBuffer docName = new StringBuffer(persDoc.getName());
571 if (! isMarkedDS) {
572 docName.replace(
573 persDoc.getName().lastIndexOf("."),
574 docName.length(),
575 ".xml");
576 File markedDocFile = new File(markedDir, docName.toString());
577 if (! processMarked || ! markedDocFile.exists()) {
578 Out.prln("Warning: Cannot find human-annotated document " +
579 markedDocFile + " in " + markedDir);
580 } else {
581 FeatureMap params = Factory.newFeatureMap();
582 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL());
583 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
584
585 markedDoc = (Document) Factory.createResource(
587 "gate.corpora.DocumentImpl", params, hparams);
588 markedDoc.setName(persDoc.getName());
589 }
590 } else {
591 DataStore sds1 = Factory.openDataStore
594 ("gate.persist.SerialDataStore",
595 markedDir.toURL().toExternalForm());
596
597 List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
598 boolean found = false;
599 int k = 0;
600 while (k < lrIDs1.size() && !found) {
602 String docID1 = (String) lrIDs1.get(k);
603
604 FeatureMap features1 = Factory.newFeatureMap();
606 features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
607 features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
608 Document tempDoc = (Document) Factory.createResource(
609 "gate.corpora.DocumentImpl",
610 features1, hparams);
611 if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")).
613 endsWith(persDoc.getName())) {
614 found = true;
615 markedDoc = tempDoc;
616 } else k++;
617 }
618 }
619
620 evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
621 if (persDoc != null)
622 Factory.deleteResource(persDoc);
623 if (cleanDoc != null)
624 Factory.deleteResource(cleanDoc);
625 if (markedDoc != null)
626 Factory.deleteResource(markedDoc);
627
628 } sds.close();
630 } catch (java.net.MalformedURLException ex) {
631 throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
632 } catch (PersistenceException ex1) {
633 throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
634 } catch (ResourceInstantiationException ex2) {
635 throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
636 }
637
638 }
640 protected void evaluateMarkedStored(File markedDir, File storedDir, File errDir) {
641 Document persDoc = null;
642 Document cleanDoc = null;
643 Document markedDoc = null;
644
645 try {
647 DataStore sds = Factory.openDataStore
649 ("gate.persist.SerialDataStore",
650 storedDir.toURL().toExternalForm());
651
652 List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
653 for (int i=0; i < lrIDs.size(); i++) {
654 String docID = (String) lrIDs.get(i);
655
656 FeatureMap features = Factory.newFeatureMap();
658 features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
659 features.put(DataStore.LR_ID_FEATURE_NAME, docID);
660
661 FeatureMap hparams = Factory.newFeatureMap();
662
664
665 persDoc = (Document) Factory.createResource(
666 "gate.corpora.DocumentImpl",
667 features, hparams);
668
669 if(isMoreInfoMode) {
670 StringBuffer errName = new StringBuffer(persDoc.getName());
671 errName.replace(
672 persDoc.getName().lastIndexOf("."),
673 persDoc.getName().length(),
674 ".err");
675 Out.prln("<H2>" +
676 "<a href=\"err/" + errName.toString() + "\">"
677 + persDoc.getName() + "</a>" + "</H2>");
678 } else
679 Out.prln("<H2>" + persDoc.getName() + "</H2>");
680
681 if (! this.isMarkedDS) { StringBuffer docName = new StringBuffer(persDoc.getName());
683 docName.replace(
684 persDoc.getName().lastIndexOf("."),
685 docName.length(),
686 ".xml");
687 File markedDocFile = new File(markedDir, docName.toString());
688 if (! markedDocFile.exists()) {
689 Out.prln("Warning: Cannot find human-annotated document " +
690 markedDocFile + " in " + markedDir);
691 } else {
692 FeatureMap params = Factory.newFeatureMap();
693 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL());
694 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
695
696 markedDoc = (Document) Factory.createResource(
698 "gate.corpora.DocumentImpl", params, hparams);
699 markedDoc.setName(persDoc.getName());
700 } } else {
702 try {
703 DataStore sds1 = Factory.openDataStore
706 ("gate.persist.SerialDataStore",
707 markedDir.toURL().toExternalForm());
708
709 List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
710 boolean found = false;
711 int k = 0;
712 while (k < lrIDs1.size() && !found) {
714 String docID1 = (String) lrIDs1.get(k);
715
716 FeatureMap features1 = Factory.newFeatureMap();
718 features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
719 features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
720 Document tempDoc = (Document) Factory.createResource(
721 "gate.corpora.DocumentImpl",
722 features1, hparams);
723 if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")).
725 endsWith(persDoc.getName())) {
726 found = true;
727 markedDoc = tempDoc;
728 } else k++;
729 }
730 } catch (java.net.MalformedURLException ex) {
731 Out.prln("Error finding marked directory " + markedDir.getAbsolutePath());
732 } catch (gate.persist.PersistenceException ex1) {
733 Out.prln("Error opening marked as a datastore (-marked_ds specified)");
734 } catch (gate.creole.ResourceInstantiationException ex2) {
735 Out.prln("Error opening marked as a datastore (-marked_ds specified)");
736 }
737 }
738
739 evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
740 if (persDoc != null)
741 Factory.deleteResource(persDoc);
742 if (markedDoc != null)
743 Factory.deleteResource(markedDoc);
744
745 } sds.close();
747
748 } catch (java.net.MalformedURLException ex) {
749 throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
750 } catch (PersistenceException ex1) {
751 throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
752 } catch (ResourceInstantiationException ex2) {
753 throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
754 }
755
756 }
758
759 protected void evaluateMarkedClean(File markedDir, File cleanDir, File errDir) {
760 Document persDoc = null;
761 Document cleanDoc = null;
762 Document markedDoc = null;
763
764 File[] cleanDocs = cleanDir.listFiles();
765 for (int i = 0; i< cleanDocs.length; i++) {
766 if (!cleanDocs[i].isFile())
767 continue;
768
769 FeatureMap params = Factory.newFeatureMap();
771 try {
772 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocs[i].toURL());
773 } catch (java.net.MalformedURLException ex) {
774 Out.prln("Cannot create document from file: " +
775 cleanDocs[i].getAbsolutePath());
776 continue;
777 }
778 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
779
780 FeatureMap hparams = Factory.newFeatureMap();
781
783 try {
785 cleanDoc = (Document) Factory.createResource(
786 "gate.corpora.DocumentImpl", params, hparams, cleanDocs[i].getName());
787 } catch (gate.creole.ResourceInstantiationException ex) {
788 Out.prln("Cannot create document from file: " +
789 cleanDocs[i].getAbsolutePath());
790 continue;
791 }
792
793 if(isMoreInfoMode) {
794 StringBuffer errName = new StringBuffer(cleanDocs[i].getName());
795 errName.replace(
796 cleanDocs[i].getName().lastIndexOf("."),
797 cleanDocs[i].getName().length(),
798 ".err");
799 Out.prln("<H2>" +
800 "<a href=\"err/" + errName.toString() + "\">"
801 + cleanDocs[i].getName() + "</a>" + "</H2>");
802 } else
803 Out.prln("<H2>" + cleanDocs[i].getName() + "</H2>");
804
805 if (! isMarkedDS) {
807 StringBuffer docName = new StringBuffer(cleanDoc.getName());
808 docName.replace(
809 cleanDoc.getName().lastIndexOf("."),
810 docName.length(),
811 ".xml");
812 File markedDocFile = new File(markedDir, docName.toString());
813 if (! markedDocFile.exists()) {
814 Out.prln("Warning: Cannot find human-annotated document " +
815 markedDocFile + " in " + markedDir);
816 continue;
817 } else {
818 params = Factory.newFeatureMap();
819 try {
820 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL());
821 } catch (java.net.MalformedURLException ex) {
822 Out.prln("Cannot create document from file: " +
823 markedDocFile.getAbsolutePath());
824 continue;
825 }
826 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
827
828 try {
830 markedDoc = (Document) Factory.createResource(
831 "gate.corpora.DocumentImpl", params,
832 hparams, cleanDoc.getName());
833 } catch (gate.creole.ResourceInstantiationException ex) {
834 Out.prln("Cannot create document from file: " +
835 markedDocFile.getAbsolutePath());
836 continue;
837 }
838
839 } } else {
841 try {
842 DataStore sds1 = Factory.openDataStore
845 ("gate.persist.SerialDataStore",
846 markedDir.toURL().toExternalForm());
847
848 List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
849 boolean found = false;
850 int k = 0;
851 while (k < lrIDs1.size() && !found) {
853 String docID1 = (String) lrIDs1.get(k);
854
855 FeatureMap features1 = Factory.newFeatureMap();
857 features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
858 features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
859 Document tempDoc = (Document) Factory.createResource(
860 "gate.corpora.DocumentImpl",
861 features1, hparams);
862 if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")).
864 endsWith(cleanDoc.getName())) {
865 found = true;
866 markedDoc = tempDoc;
867 } else k++;
868 }
869 } catch (java.net.MalformedURLException ex) {
870 Out.prln("Error finding marked directory " + markedDir.getAbsolutePath());
871 } catch (gate.persist.PersistenceException ex1) {
872 Out.prln("Error opening marked as a datastore (-marked_ds specified)");
873 } catch (gate.creole.ResourceInstantiationException ex2) {
874 Out.prln("Error opening marked as a datastore (-marked_ds specified)");
875 }
876 }
878 try {
879 evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
880 } catch (gate.creole.ResourceInstantiationException ex) {
881 ex.printStackTrace();
882 Out.prln("Evaluate failed on document: " + cleanDoc.getName());
883 }
884 if (persDoc != null)
885 Factory.deleteResource(persDoc);
886 if (cleanDoc != null)
887 Factory.deleteResource(cleanDoc);
888 if (markedDoc != null)
889 Factory.deleteResource(markedDoc);
890
891 }
893
894 }
896 protected void processDocument(Document doc) {
897 try {
898 if (application instanceof CorpusController) {
899 Corpus tempCorpus = Factory.newCorpus("temp");
900 tempCorpus.add(doc);
901 ((CorpusController)application).setCorpus(tempCorpus);
902 application.execute();
903 Factory.deleteResource(tempCorpus);
904 tempCorpus = null;
905 } else {
906 Iterator iter = application.getPRs().iterator();
907 while (iter.hasNext())
908 ((ProcessingResource) iter.next()).setParameterValue("document", doc);
909 application.execute();
910 }
911 } catch (ResourceInstantiationException ex) {
912 throw new RuntimeException("Error executing application: "
913 + ex.getMessage());
914 } catch (ExecutionException ex) {
915 throw new RuntimeException("Error executing application: "
916 + ex.getMessage());
917 }
918 }
919
920 protected void evaluateDocuments(Document persDoc,
921 Document cleanDoc, Document markedDoc,
922 File errDir)
923 throws ResourceInstantiationException {
924 if (cleanDoc == null && markedDoc == null)
925 return;
926
927 if (annotTypes == null || annotTypes.isEmpty())
929 return;
930
931 if (cleanDoc != null && !isMarkedStored) {
932
933 processDocument(cleanDoc);
934
935
936 int wordCount = countWords(cleanDoc);
937 if(wordCount == 0)
938 Out.prln("<BR>No Token annotations to count words in the document.");
939 else
940 Out.prln("<BR>Word count: " + wordCount);
941 corpusWordCount += wordCount;
942
943 if(!isMarkedClean)
944 evaluateAllThree(persDoc, cleanDoc, markedDoc, errDir);
945 else
946 evaluateTwoDocs(markedDoc, cleanDoc, errDir);
947
948 } else
949 evaluateTwoDocs(markedDoc, persDoc, errDir);
950
951 }
952
953
956 protected int countWords(Document annotDoc) {
957 int count = 0;
958
959 if (annotDoc == null) return 0;
960 AnnotationSet tokens = annotDoc.getAnnotations(outputSetName).get("Token");
962 if (tokens == null) return 0;
963
964 Iterator it = tokens.iterator();
965 Annotation currAnnotation;
966 while (it.hasNext()) {
967 currAnnotation = (Annotation) it.next();
968 Object feature = currAnnotation.getFeatures().get("kind");
969 if(feature != null && "word".equalsIgnoreCase((String)feature)) ++count;
970 }
972 return count;
973 }
974
975 protected void evaluateAllThree(Document persDoc,
976 Document cleanDoc, Document markedDoc,
977 File errDir)
978 throws ResourceInstantiationException {
979 printTableHeader();
981
982 Writer errWriter = null;
984 if (isMoreInfoMode && errDir != null) {
985 StringBuffer docName = new StringBuffer(cleanDoc.getName());
986 docName.replace(
987 cleanDoc.getName().lastIndexOf("."),
988 docName.length(),
989 ".err");
990 File errFile = new File(errDir, docName.toString());
991 String encoding = ((gate.corpora.DocumentImpl)cleanDoc).getEncoding();
992 try {
993 errWriter = new FileWriter(errFile, false);
994
1002 }
1003 catch (Exception ex) {
1004 Out.prln("Exception when creating the error file " + errFile + ": "
1005 + ex.getMessage());
1006 errWriter = null;
1007 }
1008 }
1009
1010 for (int jj= 0; jj< annotTypes.size(); jj++) {
1011 String annotType = (String) annotTypes.get(jj);
1012
1013 AnnotationDiffer annotDiffer = measureDocs(markedDoc, cleanDoc, annotType);
1014 if (annotDiffer == null)
1016 continue;
1017
1018 docNumber++;
1020 updateStatistics(annotDiffer, annotType);
1022
1023 AnnotationDiffer annotDiffer1 =
1024 measureDocs(markedDoc, persDoc, annotType);
1025
1026 Out.prln("<TR>");
1027
1028 if(isMoreInfoMode && annotDiffer1 != null
1029 && (annotDiffer1.getPrecisionAverage() != annotDiffer.getPrecisionAverage()
1030 || annotDiffer1.getRecallAverage() != annotDiffer.getRecallAverage())
1031 )
1032 Out.prln("<TD> " + annotType + "_new"+ "</TD>");
1033 else
1034 Out.prln("<TD> " + annotType + "</TD>");
1035
1036 if (isMoreInfoMode) {
1037 if(annotDiffer1 != null) updateStatisticsProc(annotDiffer1, annotType);
1038
1039 Out.prln("<TD>" + annotDiffer.getCorrectMatches() + "</TD>");
1040 Out.prln("<TD>" + annotDiffer.getPartiallyCorrectMatches() + "</TD>");
1041 Out.prln("<TD>" + annotDiffer.getMissing() + "</TD>");
1042 Out.prln("<TD>" + annotDiffer.getSpurious() + "</TD>");
1043 }
1044
1045 Out.prln("<TD>");
1046
1047 if (annotDiffer1 != null) {
1049
1050 if (annotDiffer1.getPrecisionAverage()
1051 < annotDiffer.getPrecisionAverage()) {
1052 Out.prln("<P><Font color=blue> ");
1053 Out.prln(annotDiffer.getPrecisionAverage());
1054
1055 if(!isMoreInfoMode) {
1056 Out.pr("<BR>Precision increase on human-marked from ");
1057 Out.pr(annotDiffer1.getPrecisionAverage() + " to ");
1058 Out.prln(annotDiffer.getPrecisionAverage());
1059 }
1060 Out.prln(" </Font></P>");
1061 }
1062 else if (annotDiffer1.getPrecisionAverage()
1063 > annotDiffer.getPrecisionAverage()) {
1064 Out.prln("<P><Font color=red> ");
1065 Out.prln(annotDiffer.getPrecisionAverage());
1066
1067 if(!isMoreInfoMode) {
1068 Out.pr("<BR>Precision decrease on human-marked from ");
1069 Out.pr(annotDiffer1.getPrecisionAverage() + " to ");
1070 Out.prln(annotDiffer.getPrecisionAverage());
1071 }
1072 Out.prln(" </Font></P>");
1073 }
1074 else
1075 Out.prln("<P> " + (double) annotDiffer.getPrecisionAverage() + " </P>");
1076 }
1077 else
1078 Out.prln("<P> " + annotDiffer.getPrecisionAverage() + " </P>");
1079
1080 Out.prln("</TD>");
1081
1082 Out.prln("<TD>");
1083
1084 if (annotDiffer1 != null) {
1086
1087 if (annotDiffer1.getRecallAverage() < annotDiffer.getRecallAverage()) {
1088 Out.prln("<P><Font color=blue> ");
1089 Out.prln(annotDiffer.getRecallAverage());
1090
1091 if(!isMoreInfoMode) {
1092 Out.pr("<BR>Recall increase on human-marked from ");
1093 Out.pr(annotDiffer1.getRecallAverage() + " to ");
1094 Out.prln(annotDiffer.getRecallAverage());
1095 }
1096 Out.prln(" </Font></P>");
1097 }
1098 else if (annotDiffer1.getRecallAverage() > annotDiffer.getRecallAverage()) {
1099 Out.prln("<P><Font color=red> ");
1100 Out.prln(annotDiffer.getRecallAverage());
1101
1102 if(!isMoreInfoMode) {
1103 Out.pr("<BR>Recall decrease on human-marked from ");
1104 Out.pr(annotDiffer1.getRecallAverage() + " to ");
1105 Out.prln(annotDiffer.getRecallAverage());
1106 }
1107 Out.prln(" </Font></P>");
1108 }
1109 else
1110 Out.prln("<P> " + annotDiffer.getRecallAverage() + " </P>");
1111 } else
1112 Out.prln("<P> " + annotDiffer.getRecallAverage() + " </P>");
1113
1114
1115 Out.prln("</TD>");
1116
1117 if ( isVerboseMode ) {
1119 Out.prln("<TD>");
1120 if (annotDiffer.getRecallAverage() < threshold) {
1121 printAnnotations(annotDiffer, markedDoc, cleanDoc);
1122 }
1123 else {
1124 Out.prln(" ");
1125 }
1126 Out.prln("</TD>");
1127 }
1128
1129 Out.prln("</TR>");
1130
1131 if(isMoreInfoMode && annotDiffer1 != null
1133 && (annotDiffer1.getPrecisionAverage() != annotDiffer.getPrecisionAverage()
1134 || annotDiffer1.getRecallAverage() != annotDiffer.getRecallAverage())
1135 ) {
1136
1137 Out.prln("<TR>");
1138 Out.prln("<TD> " + annotType + "_old" + "</TD>");
1139
1140 Out.prln("<TD>" + annotDiffer1.getCorrectMatches() + "</TD>");
1141 Out.prln("<TD>" + annotDiffer1.getPartiallyCorrectMatches() + "</TD>");
1142 Out.prln("<TD>" + annotDiffer1.getMissing() + "</TD>");
1143 Out.prln("<TD>" + annotDiffer1.getSpurious() + "</TD>");
1144
1145 Out.prln("<TD>");
1146 if (annotDiffer1.getPrecisionAverage() < annotDiffer.getPrecisionAverage())
1147
1148 Out.prln("<P><Font color=blue> " + annotDiffer1.getPrecisionAverage()
1149 + "</Font></P>");
1150 else if (annotDiffer1.getPrecisionAverage() > annotDiffer.getPrecisionAverage())
1151 Out.prln(
1152 "<P><Font color=red> " + annotDiffer1.getPrecisionAverage()
1153 + " </Font></P>");
1154 else
1155 Out.prln(annotDiffer1.getPrecisionAverage());
1156
1157 Out.prln("</TD>");
1158
1159 Out.prln("<TD>");
1160 if (annotDiffer1.getRecallAverage() < annotDiffer.getRecallAverage())
1161 Out.prln("<P><Font color=blue> " + annotDiffer1.getRecallAverage()
1162 + " </Font></P>");
1163 else if (annotDiffer1.getRecallAverage() > annotDiffer.getRecallAverage())
1164 Out.prln("<P><Font color=red> " + annotDiffer1.getRecallAverage()
1165 + " </Font></P>");
1166 else
1167 Out.prln(annotDiffer1.getRecallAverage());
1168
1169 Out.prln("</TD>");
1170
1171 if ( isVerboseMode ) {
1173
1175 Out.prln("<TD>");
1176 if (annotDiffer.getRecallAverage() < threshold) {
1177 printAnnotations(annotDiffer, markedDoc, cleanDoc);
1178 }
1179 else {
1180 Out.prln(" ");
1181 }
1182 Out.prln("</TD>");
1183 }
1184 Out.prln("</TR>");
1185 }
1187 if (isMoreInfoMode && errDir != null)
1188 storeAnnotations(annotType, annotDiffer, markedDoc, cleanDoc, errWriter);
1189 } Out.prln("</TABLE>");
1191
1192 try {
1193 if(errWriter != null)
1194 errWriter.close();
1195 }
1196 catch (Exception ex) {
1197 Out.prln("Exception on close of error file " + errWriter + ": "
1198 + ex.getMessage());
1199 }
1200 }
1202 protected void evaluateTwoDocs(Document keyDoc, Document respDoc,
1203 File errDir)
1204 throws ResourceInstantiationException {
1205
1206 printTableHeader();
1208
1209 Writer errWriter = null;
1211 if (isMoreInfoMode && errDir != null) {
1212 StringBuffer docName = new StringBuffer(keyDoc.getName());
1213 docName.replace(
1214 keyDoc.getName().lastIndexOf("."),
1215 docName.length(),
1216 ".err");
1217 File errFile = new File(errDir, docName.toString());
1218 String encoding = ((gate.corpora.DocumentImpl)keyDoc).getEncoding();
1219 try {
1220 errWriter = new FileWriter(errFile, false);
1221
1229 }
1230 catch (Exception ex) {
1231 Out.prln("Exception when creating the error file " + errFile + ": "
1232 + ex.getMessage());
1233 errWriter = null;
1234 }
1235 }
1236
1237 for (int jj= 0; jj< annotTypes.size(); jj++) {
1238 String annotType = (String) annotTypes.get(jj);
1239
1240 AnnotationDiffer annotDiff = measureDocs(keyDoc, respDoc, annotType);
1241 if (annotDiff == null)
1243 continue;
1244
1245 docNumber++;
1247 updateStatistics(annotDiff, annotType);
1249
1250 Out.prln("<TR>");
1251 Out.prln("<TD>" + annotType + "</TD>");
1252
1253 if(isMoreInfoMode) {
1254 Out.prln("<TD>" + annotDiff.getCorrectMatches() + "</TD>");
1255 Out.prln("<TD>" + annotDiff.getPartiallyCorrectMatches() + "</TD>");
1256 Out.prln("<TD>" + annotDiff.getMissing() + "</TD>");
1257 Out.prln("<TD>" + annotDiff.getSpurious() + "</TD>");
1258 }
1259
1260 Out.prln("<TD>" + annotDiff.getPrecisionAverage() + "</TD>");
1261 Out.prln("<TD>" + annotDiff.getRecallAverage() + "</TD>");
1262 if ( isVerboseMode ) {
1264 Out.prln("<TD>");
1265 if (annotDiff.getRecallAverage() < threshold) {
1266 printAnnotations(annotDiff, keyDoc, respDoc);
1267 }
1268 else {
1269 Out.prln(" ");
1270 }
1271 Out.prln("</TD>");
1272 }
1273 Out.prln("</TR>");
1274
1275 if (isMoreInfoMode && errDir != null)
1276 storeAnnotations(annotType, annotDiff, keyDoc, respDoc, errWriter);
1277 } Out.prln("</TABLE>");
1279
1280 try {
1281 if(errWriter != null)
1282 errWriter.close();
1283 }
1284 catch (Exception ex) {
1285 Out.prln("Exception on close of error file " + errWriter + ": "
1286 + ex.getMessage());
1287 }
1288 }
1290 protected void printTableHeader() {
1291 Out.prln("<TABLE BORDER=1");
1292 Out.pr("<TR> <TD><B>Annotation Type</B></TD> ");
1293
1294 if(isMoreInfoMode)
1295 Out.pr("<TD><B>Correct</B></TD> <TD><B>Partially Correct</B></TD> "
1296 + "<TD><B>Missing</B></TD> <TD><B>Spurious<B></TD>");
1297
1298 Out.pr("<TD><B>Precision</B></TD> <TD><B>Recall</B></TD>");
1299
1300 if (isVerboseMode)
1301 Out.pr("<TD><B>Annotations</B></TD>");
1302
1303 Out.prln("</TR>");
1304 }
1305
1306 protected void updateStatistics(AnnotationDiffer annotDiffer, String annotType){
1307 double precisionAverage = ((double)((double)annotDiffer.getPrecisionLenient() + annotDiffer.getPrecisionStrict()) /
1308(double)(2.0));
1309 if(precisionAverage != Double.NaN)
1310 precisionSum += precisionAverage;
1311
1312 double recallAverage = ((double)(annotDiffer.getRecallLenient() + annotDiffer.getRecallStrict()) / (double) (2.0));
1313 if(recallAverage != Double.NaN)
1314 recallSum += recallAverage;
1315
1316 double fMeasureAverage = ((double) (annotDiffer.getFMeasureLenient(1.0) + annotDiffer.getFMeasureStrict(1.0)) /
1317(double) (2.0));
1318 if(fMeasureAverage != Double.NaN)
1319 fMeasureSum += fMeasureAverage;
1320
1321 Double oldPrecision = (Double) precisionByType.get(annotType);
1322 if (oldPrecision == null)
1323 precisionByType.put(annotType, new Double(precisionAverage));
1324 else
1325 precisionByType.put(annotType, new Double(oldPrecision.doubleValue() + precisionAverage));
1326
1327 Integer precCount = (Integer) prCountByType.get(annotType);
1328 if (precCount == null)
1329 prCountByType.put(annotType, new Integer(1));
1330 else
1331 prCountByType.put(annotType, new Integer(precCount.intValue() + 1));
1332
1333
1334 Double oldFMeasure = (Double) fMeasureByType.get(annotType);
1335 if (oldFMeasure == null)
1336 fMeasureByType.put(annotType, new Double(fMeasureAverage));
1337 else
1338 fMeasureByType.put(annotType, new Double(oldFMeasure.doubleValue() + fMeasureAverage));
1339
1340 Integer fCount = (Integer) fMeasureCountByType.get(annotType);
1341 if (fCount == null)
1342 fMeasureCountByType.put(annotType, new Integer(1));
1343 else
1344 fMeasureCountByType.put(annotType, new Integer(fCount.intValue() + 1));
1345
1346 Double oldRecall = (Double) recallByType.get(annotType);
1347 if (oldRecall == null)
1348 recallByType.put(annotType, new Double(recallAverage));
1349 else
1350 recallByType.put(annotType, new Double(oldRecall.doubleValue() + recallAverage));
1351
1352 Integer recCount = (Integer) recCountByType.get(annotType);
1353 if (recCount == null)
1354 recCountByType.put(annotType, new Integer(1));
1355 else
1356 recCountByType.put(annotType, new Integer(recCount.intValue() + 1));
1357
1358 Long oldMissingNo = (Long) missingByType.get(annotType);
1360 if (oldMissingNo == null)
1361 missingByType.put(annotType, new Long(annotDiffer.getMissing()));
1362 else
1363 missingByType.put(annotType, new Long(oldMissingNo.longValue() + annotDiffer.getMissing()));
1364
1365 Long oldCorrectNo = (Long) correctByType.get(annotType);
1366 if (oldCorrectNo == null)
1367 correctByType.put(annotType, new Long(annotDiffer.getCorrectMatches()));
1368 else
1369 correctByType.put(annotType, new Long(oldCorrectNo.longValue() + annotDiffer.getCorrectMatches()));
1370
1371 Long oldPartialNo = (Long) partialByType.get(annotType);
1372 if (oldPartialNo == null)
1373 partialByType.put(annotType, new Long(annotDiffer.getPartiallyCorrectMatches()));
1374 else
1375 partialByType.put(annotType, new Long(oldPartialNo.longValue() + annotDiffer.getPartiallyCorrectMatches()));
1376
1377 Long oldSpuriousNo = (Long) spurByType.get(annotType);
1378 if (oldSpuriousNo == null)
1379 spurByType.put(annotType, new Long(annotDiffer.getSpurious()));
1380 else
1381 spurByType.put(annotType, new Long(oldSpuriousNo.longValue() + annotDiffer.getSpurious()));
1382 }
1383
1384
1388 protected void updateStatisticsProc(AnnotationDiffer annotDiffer, String annotType){
1389 hasProcessed = true;
1390 double precisionAverage = ((double)(annotDiffer.getPrecisionLenient() + annotDiffer.getPrecisionStrict()) /
1391(double)(2.0));
1392 if(precisionAverage != Double.NaN)
1393 proc_precisionSum += precisionAverage;
1394
1395 double recallAverage = ((double)(annotDiffer.getRecallLenient() + annotDiffer.getRecallStrict()) / (double) (2.0));
1396 if(recallAverage != Double.NaN)
1397 proc_recallSum += recallAverage;
1398
1399 double fMeasureAverage = ((double) (annotDiffer.getFMeasureLenient(1.0) + annotDiffer.getFMeasureStrict(1.0)) /
1400(double) (2.0));
1401 if(fMeasureAverage != Double.NaN)
1402 proc_fMeasureSum += fMeasureAverage;
1403
1404 Double oldPrecision = (Double) proc_precisionByType.get(annotType);
1405 if (oldPrecision == null)
1406 proc_precisionByType.put(annotType, new Double(precisionAverage));
1407 else
1408 proc_precisionByType.put(annotType,
1409 new Double(oldPrecision.doubleValue() +
1410 precisionAverage));
1411 Integer precCount = (Integer) proc_prCountByType.get(annotType);
1412 if (precCount == null)
1413 proc_prCountByType.put(annotType, new Integer(1));
1414 else
1415 proc_prCountByType.put(annotType, new Integer(precCount.intValue() + 1));
1416
1417
1418 Double oldFMeasure = (Double) proc_fMeasureByType.get(annotType);
1419 if (oldFMeasure == null)
1420 proc_fMeasureByType.put(annotType,
1421 new Double(fMeasureAverage));
1422 else
1423 proc_fMeasureByType.put(annotType,
1424 new Double(oldFMeasure.doubleValue() +
1425 fMeasureAverage));
1426 Integer fCount = (Integer) proc_fMeasureCountByType.get(annotType);
1427 if (fCount == null)
1428 proc_fMeasureCountByType.put(annotType, new Integer(1));
1429 else
1430 proc_fMeasureCountByType.put(annotType, new Integer(fCount.intValue() + 1));
1431
1432 Double oldRecall = (Double) proc_recallByType.get(annotType);
1433 if (oldRecall == null)
1434 proc_recallByType.put(annotType,
1435 new Double(recallAverage));
1436 else
1437 proc_recallByType.put(annotType,
1438 new Double(oldRecall.doubleValue() +
1439 recallAverage));
1440 Integer recCount = (Integer) proc_recCountByType.get(annotType);
1441 if (recCount == null)
1442 proc_recCountByType.put(annotType, new Integer(1));
1443 else
1444 proc_recCountByType.put(annotType, new Integer(recCount.intValue() + 1));
1445
1446 Long oldMissingNo = (Long) proc_missingByType.get(annotType);
1448 if (oldMissingNo == null)
1449 proc_missingByType.put(annotType, new Long(annotDiffer.getMissing()));
1450 else
1451 proc_missingByType.put(annotType,
1452 new Long(oldMissingNo.longValue() +
1453 annotDiffer.getMissing()));
1454
1455 Long oldCorrectNo = (Long) proc_correctByType.get(annotType);
1456 if (oldCorrectNo == null)
1457 proc_correctByType.put(annotType, new Long(annotDiffer.getCorrectMatches()));
1458 else
1459 proc_correctByType.put(annotType,
1460 new Long(oldCorrectNo.longValue() +
1461 annotDiffer.getCorrectMatches()));
1462
1463 Long oldPartialNo = (Long) proc_partialByType.get(annotType);
1464 if (oldPartialNo == null)
1465 proc_partialByType.put(annotType, new Long(annotDiffer.getPartiallyCorrectMatches()));
1466 else
1467 proc_partialByType.put(annotType,
1468 new Long(oldPartialNo.longValue() +
1469 annotDiffer.getPartiallyCorrectMatches()));
1470
1471 Long oldSpuriousNo = (Long) proc_spurByType.get(annotType);
1472 if (oldSpuriousNo == null)
1473 proc_spurByType.put(annotType, new Long(annotDiffer.getSpurious()));
1474 else
1475 proc_spurByType.put(annotType,
1476 new Long(oldSpuriousNo.longValue() +
1477 annotDiffer.getSpurious()));
1478 }
1479
1480 public void printStatistics() {
1481
1482 Out.prln("<H2> Statistics </H2>");
1483
1484
1515 if (annotTypes == null) {
1516 Out.prln("No types given for evaluation, cannot obtain precision/recall");
1517 return;
1518 }
1519 Out.prln("<table border=1>");
1520 Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Correct</B></TD>" +
1521 "<TD><B>Partially Correct</B></TD> <TD><B>Missing</B></TD>" +
1522 "<TD><B>Spurious</B></TD> <TD><B>Precision</B></TD>" +
1523 "<TD><B>Recall</B></TD> <TD><B>F-Measure</B></TD> </TR>");
1524 String annotType;
1525 for (int i = 0; i < annotTypes.size(); i++) {
1526 annotType = (String) annotTypes.get(i);
1527 printStatsForType(annotType);
1528 } Out.prln("</table>");
1530 }
1532 protected void printStatsForType(String annotType){
1533 long correct = (correctByType.get(annotType) == null)? 0 :
1534 ((Long)correctByType.get(annotType)).longValue();
1535 long partial = (partialByType.get(annotType) == null)? 0 :
1536 ((Long)partialByType.get(annotType)).longValue();
1537 long spurious = (spurByType.get(annotType) == null)? 0 :
1538 ((Long)spurByType.get(annotType)).longValue();
1539 long missing = (missingByType.get(annotType) == null)? 0:
1540 ((Long)missingByType.get(annotType)).longValue();
1541 long actual = correct + partial + spurious;
1542 long possible = correct + partial + missing;
1543
1546 double precision = (correct + 0.5 * partial) / actual;
1547 double recall = (correct + 0.5*partial)/possible;
1549 double fmeasure =
1551 ((beta*beta + 1)*precision*recall)
1552 /
1553 ((beta*beta*precision) + recall);
1554
1555
1556 long proc_correct=0;
1557 long proc_partial=0;
1558 long proc_spurious=0;
1559 long proc_missing=0;
1560 long proc_actual=0;
1561 long proc_possible=0;
1562 double proc_precision=0;
1563 double proc_recall=0;
1564 double proc_fmeasure=0;
1565
1566 if(hasProcessed) {
1567 proc_correct = (proc_correctByType.get(annotType) == null)? 0 :
1569 ((Long)proc_correctByType.get(annotType)).longValue();
1570 proc_partial = (proc_partialByType.get(annotType) == null)? 0 :
1571 ((Long)proc_partialByType.get(annotType)).longValue();
1572 proc_spurious = (proc_spurByType.get(annotType) == null)? 0 :
1573 ((Long)proc_spurByType.get(annotType)).longValue();
1574 proc_missing = (proc_missingByType.get(annotType) == null)? 0:
1575 ((Long)proc_missingByType.get(annotType)).longValue();
1576 proc_actual = proc_correct + proc_partial + proc_spurious;
1577 proc_possible = proc_correct + proc_partial + proc_missing;
1578 proc_precision = (proc_correct + 0.5*proc_partial)/proc_actual;
1581 proc_recall = (proc_correct + 0.5*proc_partial)/proc_possible;
1583 proc_fmeasure =
1585 ((beta*beta + 1)*proc_precision*proc_recall)
1586 /
1587 ((beta*beta*proc_precision) + proc_recall);
1588
1589 }
1590
1591 Out.prln("<TR>");
1593 if(hasProcessed)
1594 Out.prln("<TD>" + annotType+ "_new" + "</TD>");
1595 else
1596 Out.prln("<TD>" + annotType + "</TD>");
1597
1598 Out.prln("<TD>" + correct + "</TD>");
1599 Out.prln("<TD>" + partial + "</TD>");
1600 Out.prln("<TD>" + missing + "</TD>");
1601 Out.prln("<TD>" + spurious + "</TD>");
1602
1603 String strPrec = (isMoreInfoMode)?
1604 avgPrint(precision, 4)
1605 :Double.toString(precision);
1606 String strRec = (isMoreInfoMode)?
1607 avgPrint(recall, 4)
1608 :Double.toString(recall);
1609 String strFmes = (isMoreInfoMode)?
1610 avgPrint(fmeasure, 4)
1611 :Double.toString(fmeasure);
1612
1613 if(hasProcessed && (precision < proc_precision))
1614 Out.prln("<TD><Font color=red>" + strPrec + "</TD>");
1615 else if(hasProcessed && (precision > proc_precision))
1616 Out.prln("<TD><Font color=blue>" + strPrec + "</TD>");
1617 else
1618 Out.prln("<TD>" + strPrec + "</TD>");
1619 if(hasProcessed && (recall < proc_recall))
1620 Out.prln("<TD><Font color=red>" + strRec + "</TD>");
1621 else if(hasProcessed && (recall > proc_recall))
1622 Out.prln("<TD><Font color=blue>" + strRec + "</TD>");
1623 else
1624 Out.prln("<TD>" + strRec + "</TD>");
1625 Out.prln("<TD>" + strFmes + "</TD>");
1626 Out.prln("</TR>");
1627
1628 if(hasProcessed) {
1629 Out.prln("<TR>");
1631 Out.prln("<TD>" + annotType + "_old" + "</TD>");
1632
1633 Out.prln("<TD>" + proc_correct + "</TD>");
1634 Out.prln("<TD>" + proc_partial + "</TD>");
1635 Out.prln("<TD>" + proc_missing + "</TD>");
1636 Out.prln("<TD>" + proc_spurious + "</TD>");
1637
1638 String strProcPrec = (isMoreInfoMode)?
1639 avgPrint(proc_precision, 4)
1640 :Double.toString(proc_precision);
1641 String strProcRec = (isMoreInfoMode)?
1642 avgPrint(proc_recall, 4)
1643 :Double.toString(proc_recall);
1644 String strProcFmes = (isMoreInfoMode)?
1645 avgPrint(proc_fmeasure, 4)
1646 :Double.toString(proc_fmeasure);
1647
1648 if(precision < proc_precision)
1649 Out.prln("<TD><Font color=red>" + strProcPrec + "</TD>");
1650 else if(precision > proc_precision)
1651 Out.prln("<TD><Font color=blue>" + strProcPrec + "</TD>");
1652 else
1653 Out.prln("<TD>" + strProcPrec + "</TD>");
1654 if(recall < proc_recall)
1655 Out.prln("<TD><Font color=red>" + strProcRec + "</TD>");
1656 else if(recall > proc_recall)
1657 Out.prln("<TD><Font color=blue>" + strProcRec + "</TD>");
1658 else
1659 Out.prln("<TD>" + strProcRec + "</TD>");
1660 Out.prln("<TD>" + strProcFmes + "</TD>");
1661 Out.prln("</TR>");
1662 }
1663 }
1665 protected String avgPrint(double value, int count) {
1667 double newvalue;
1668 double power = Math.pow(10, count);
1669 newvalue = Math.round( value * power )/ power;
1670 return Double.toString(newvalue);
1671 }
1672
1673
1674 private double precisionSumCalc = 0;
1675 private double recallSumCalc = 0;
1676 private double fMeasureSumCalc = 0;
1677
1678 public double getPrecisionAverageCalc() {
1679 return precisionSumCalc;
1680 }
1681
1682 public double getRecallAverageCalc() {
1683 return recallSumCalc;
1684 }
1685
1686 public double getFmeasureAverageCalc() {
1687 return fMeasureSumCalc;
1688 }
1689
1690 protected void calculateAvgTotal() {
1691 long correct, partial, spurious, missing;
1692 long correctSum, partialSum, spuriousSum, missingSum;
1693
1694 if (annotTypes == null) {
1695 return;
1696 }
1697 correctSum = partialSum = spuriousSum = missingSum = 0;
1698
1699 String annotType;
1700 for (int i = 0; i < annotTypes.size(); i++) {
1701 annotType = (String) annotTypes.get(i);
1702 correct = (correctByType.get(annotType) == null)? 0 :
1703 ((Long)correctByType.get(annotType)).longValue();
1704 partial = (partialByType.get(annotType) == null)? 0 :
1705 ((Long)partialByType.get(annotType)).longValue();
1706 spurious = (spurByType.get(annotType) == null)? 0 :
1707 ((Long)spurByType.get(annotType)).longValue();
1708 missing = (missingByType.get(annotType) == null)? 0:
1709 ((Long)missingByType.get(annotType)).longValue();
1710 correctSum += correct;
1711 partialSum += partial;
1712 spuriousSum += spurious;
1713 missingSum += missing;
1714 }
1716 long actual = correctSum + partialSum + spuriousSum;
1717 long possible = correctSum + partialSum + missingSum;
1718
1719 if(actual == 0) {
1720 precisionSumCalc = 0;
1721 }
1722 else {
1723 precisionSumCalc = (correctSum + 0.5 * partialSum) / actual;
1724 }
1725
1726 if(possible == 0) {
1727 recallSumCalc = 0;
1728 }
1729 else {
1730 recallSumCalc = (correctSum + 0.5 * partialSum) / actual;
1731 }
1732
1733 if(precisionSumCalc == 0 && recallSumCalc == 0) {
1734 fMeasureSumCalc = 0;
1735 }
1736 else {
1737 fMeasureSumCalc =
1738 ((beta*beta + 1)*precisionSumCalc*recallSumCalc)
1739 /
1740 ((beta*beta*precisionSumCalc) + recallSumCalc);
1741
1742 }
1743 }
1745 protected AnnotationDiffer measureDocs(
1746 Document keyDoc, Document respDoc, String annotType)
1747 throws ResourceInstantiationException {
1748
1749 if (keyDoc == null || respDoc == null)
1750 return null;
1751
1752 if (annotSetName != null
1753 && keyDoc.getAnnotations(annotSetName).get(annotType) == null)
1754 return null;
1755 else if ((annotSetName == null || annotSetName.equals(""))
1756 && keyDoc.getAnnotations().get(annotType) == null)
1757 return null;
1758
1759 AnnotationDiffer annotDiffer = new AnnotationDiffer();
1761 annotDiffer.setSignificantFeaturesSet(diffFeaturesSet);
1763 AnnotationSet keys, responses;
1765 if(annotSetName == null || annotSetName.equals("")) {
1766 keys = keyDoc.getAnnotations().get(annotType);
1767 responses = respDoc.getAnnotations().get(annotType);
1768 } else {
1769 keys = keyDoc.getAnnotations(annotSetName).get(annotType);
1770 responses = respDoc.getAnnotations(outputSetName).get(annotType);
1771 }
1772
1773 List pairings = annotDiffer.calculateDiff(keys,responses);
1775 return annotDiffer;
1776 }
1778 protected void storeAnnotations(String type, AnnotationDiffer annotDiffer,
1779 Document keyDoc, Document respDoc, Writer errFileWriter) {
1780 if(errFileWriter == null) return;
1782 try {
1783 Comparator comp = new OffsetComparator();
1785 TreeSet sortedSet = new TreeSet(comp);
1786 Set missingSet =
1787 annotDiffer.getAnnotationsOfType(AnnotationDiffer.MISSING_TYPE);
1788 sortedSet.clear();
1789 sortedSet.addAll(missingSet);
1790 storeAnnotations(type+".miss", sortedSet, keyDoc, errFileWriter);
1791 Set spuriousSet =
1792 annotDiffer.getAnnotationsOfType(AnnotationDiffer.SPURIOUS_TYPE);
1793 sortedSet.clear();
1794 sortedSet.addAll(spuriousSet);
1795 storeAnnotations(type+".spur", sortedSet, respDoc, errFileWriter);
1796 Set partialSet =
1797 annotDiffer.getAnnotationsOfType(AnnotationDiffer.PARTIALLY_CORRECT_TYPE);
1798 sortedSet.clear();
1799 sortedSet.addAll(partialSet);
1800 storeAnnotations(type+".part", sortedSet, respDoc, errFileWriter);
1801 } catch (Exception ex) {
1802 Out.prln("Exception on close of error file "+errFileWriter+": "
1803 +ex.getMessage());
1804 }
1805 }
1807 protected void storeAnnotations(String type, Set set, Document doc,
1808 Writer file) throws IOException{
1809
1810 if (set == null || set.isEmpty())
1811 return;
1812
1813 Iterator iter = set.iterator();
1814 Annotation ann;
1815 while (iter.hasNext()) {
1816 ann = (Annotation) iter.next();
1817 file.write(type);
1818 file.write(".");
1819 file.write(doc.getContent().toString().substring(
1820 ann.getStartNode().getOffset().intValue(),
1821 ann.getEndNode().getOffset().intValue()));
1822 file.write(".");
1823 file.write(ann.getStartNode().getOffset().toString());
1824 file.write(".");
1825 file.write(ann.getEndNode().getOffset().toString());
1826 file.write("\n");
1827 } }
1830 protected void printAnnotations(AnnotationDiffer annotDiff,
1831 Document keyDoc, Document respDoc) {
1832 Out.pr("MISSING ANNOTATIONS in the automatic texts: ");
1833 Set missingSet =
1834 annotDiff.getAnnotationsOfType(AnnotationDiffer.MISSING_TYPE);
1835 printAnnotations(missingSet, keyDoc);
1836 Out.prln("<BR>");
1837
1838 Out.pr("SPURIOUS ANNOTATIONS in the automatic texts: ");
1839 Set spuriousSet =
1840 annotDiff.getAnnotationsOfType(AnnotationDiffer.SPURIOUS_TYPE);
1841 printAnnotations(spuriousSet, respDoc);
1842 Out.prln("</BR>");
1843
1844 Out.pr("PARTIALLY CORRECT ANNOTATIONS in the automatic texts: ");
1845 Set partialSet =
1846 annotDiff.getAnnotationsOfType(AnnotationDiffer.PARTIALLY_CORRECT_TYPE);
1847 printAnnotations(partialSet, respDoc);
1848 }
1849
1850 protected void printAnnotations(Set set, Document doc) {
1851 if (set == null || set.isEmpty())
1852 return;
1853
1854 Iterator iter = set.iterator();
1855 while (iter.hasNext()) {
1856 Annotation ann = (Annotation) iter.next();
1857 Out.prln(
1858 "<B>" +
1859 doc.getContent().toString().substring(
1860 ann.getStartNode().getOffset().intValue(),
1861 ann.getEndNode().getOffset().intValue()) +
1862 "</B>: <I>[" + ann.getStartNode().getOffset() +
1863 "," + ann.getEndNode().getOffset() + "]</I>"
1864 );
1866 } }
1869
1872 private File startDir;
1873 private File currDir;
1874 private static List annotTypes;
1875
1876 private Controller application = null;
1877 private File applicationFile = null;
1878
1879 private double precisionSum = 0.0;
1883 private double recallSum = 0.0;
1884 private double fMeasureSum = 0.0;
1885 private HashMap precisionByType = new HashMap();
1886 private HashMap prCountByType = new HashMap();
1887 private HashMap recallByType = new HashMap();
1888 private HashMap recCountByType = new HashMap();
1889 private HashMap fMeasureByType = new HashMap();
1890 private HashMap fMeasureCountByType = new HashMap();
1891
1892 private HashMap missingByType = new HashMap();
1893 private HashMap spurByType = new HashMap();
1894 private HashMap correctByType = new HashMap();
1895 private HashMap partialByType = new HashMap();
1896
1897 static boolean hasProcessed = false;
1899 private double proc_precisionSum = 0;
1900 private double proc_recallSum = 0;
1901 private double proc_fMeasureSum = 0;
1902 private HashMap proc_precisionByType = new HashMap();
1903 private HashMap proc_prCountByType = new HashMap();
1904 private HashMap proc_recallByType = new HashMap();
1905 private HashMap proc_recCountByType = new HashMap();
1906 private HashMap proc_fMeasureByType = new HashMap();
1907 private HashMap proc_fMeasureCountByType = new HashMap();
1908
1909 private HashMap proc_missingByType = new HashMap();
1910 private HashMap proc_spurByType = new HashMap();
1911 private HashMap proc_correctByType = new HashMap();
1912 private HashMap proc_partialByType = new HashMap();
1913
1914 double beta = 1;
1915
1916 private int docNumber = 0;
1917
1918
1922 private boolean isGenerateMode = false;
1923
1924
1927 private boolean isVerboseMode = false;
1928
1929
1932 private boolean isMoreInfoMode = false;
1933
1934
1938 private Set diffFeaturesSet;
1939
1940
1944 private boolean isMarkedStored = false;
1945 private boolean isMarkedClean = false;
1946 private boolean isMarkedDS = false;
1948
1949 private String annotSetName = "Key";
1950 private String outputSetName = null;
1951
1952 private double threshold = 0.5;
1953 private Properties configs = new Properties();
1954 private static int corpusWordCount = 0;
1955
1956 private String documentEncoding = "";
1957
1958
1959 private static String usage =
1960 "usage: CorpusBenchmarkTool [-generate|-marked_stored|-marked_clean] "
1961 +"[-verbose] [-moreinfo] directory-name application";
1962
1963}
1964