1
15
16 package gate.util;
17
18 import java.io.*;
19 import java.util.*;
20
21 import gate.*;
22 import gate.util.AnnotationDiffer;
23 import gate.creole.*;
24 import gate.persist.PersistenceException;
25 import gate.persist.SerialDataStore;
26
27 public class CorpusBenchmarkTool {
28 private static final String MARKED_DIR_NAME = "marked";
29 private static final String CLEAN_DIR_NAME = "clean";
30 private static final String CVS_DIR_NAME = "Cvs";
31 private static final String PROCESSED_DIR_NAME = "processed";
32 private static final String ERROR_DIR_NAME = "err";
33
34 private static final boolean DEBUG = true;
35
36 public CorpusBenchmarkTool() {}
37
38 public void initPRs() {
39 try {
40 if (applicationFile == null)
41 Out.prln("Application not set!");
42 Out.prln("App file is: " + applicationFile.getAbsolutePath());
43 application = (Controller) gate.util.persistence.PersistenceManager
44 .loadObjectFromFile(applicationFile);
45 } catch (Exception ex) {
46 throw new GateRuntimeException("Corpus Benchmark Tool:"+ex.getMessage());
47 }
48 }
50 public void unloadPRs() {
51 if (isMarkedStored)
53 return;
54
55 }
56
57 public void execute() {
58
63 execute(startDir);
64 System.out.println("Done execute");
65
72 System.out.println("Done execute");
73 }
74
/**
 * Reads the optional {@code corpus_tool.properties} file from the working
 * directory and applies the settings found in it: {@code threshold},
 * {@code annotSetName}, {@code outputSetName}, {@code encoding},
 * {@code annotTypes} and {@code annotFeatures} (the last two are
 * semicolon-separated lists). When {@code annotTypes} is absent a built-in
 * list of types is used. If the file does not exist or cannot be read, the
 * configuration is reset to an empty {@link Properties}.
 *
 * <p>Finally, unless the tool runs in marked-stored mode, the application is
 * loaded via {@link #initPRs()}.
 *
 * @throws NumberFormatException if the {@code threshold} property is not a
 *         valid floating point number
 */
public void init() {
  File propFile = new File("corpus_tool.properties");
  Out.prln(propFile.getAbsolutePath());

  if (propFile.exists()) {
    try {
      // Close the stream as soon as the properties are loaded; it used to
      // be left open for the lifetime of the process.
      InputStream inputStream = new FileInputStream(propFile);
      try {
        this.configs.load(inputStream);
      } finally {
        inputStream.close();
      }

      String thresholdString = this.configs.getProperty("threshold");
      if (thresholdString != null && !thresholdString.equals("")) {
        this.threshold = Double.parseDouble(thresholdString);
        Out.prln("New threshold is: " + this.threshold + "<P>\n");
      }

      String setName = this.configs.getProperty("annotSetName");
      if (setName != null && !setName.equals("")) {
        Out.prln("Annotation set in marked docs is: " + setName + " <P>\n");
        this.annotSetName = setName;
      }

      setName = this.configs.getProperty("outputSetName");
      if (setName != null && !setName.equals("")) {
        Out.prln("Annotation set in processed docs is: " + setName + " <P>\n");
        this.outputSetName = setName;
      }

      String encodingString = this.configs.getProperty("encoding");
      if (encodingString != null && !encodingString.equals("")) {
        this.documentEncoding = encodingString;
        Out.prln("New encoding is: " + this.documentEncoding + "<P>\n");
      }

      String types = this.configs.getProperty("annotTypes");
      annotTypes = new ArrayList();
      if (types != null && !types.equals("")) {
        Out.prln("Using annotation types from the properties file. <P>\n");
        StringTokenizer strTok = new StringTokenizer(types, ";");
        while (strTok.hasMoreTokens()) {
          annotTypes.add(strTok.nextToken());
        }
      } else {
        // Built-in annotation types used when the file names none.
        String[] defaultTypes = {
          "Organization", "Person", "Date", "Location", "Address",
          "Money", "Percent", "GPE", "Facility"
        };
        for (int t = 0; t < defaultTypes.length; t++) {
          annotTypes.add(defaultTypes[t]);
        }
      }

      String features = this.configs.getProperty("annotFeatures");
      HashSet result = new HashSet();
      if (features != null && !features.equals("")) {
        Out.pr("Using annotation features from the properties file. \n");
        StringTokenizer tok = new StringTokenizer(features, ";");
        while (tok.hasMoreTokens()) {
          result.add(tok.nextToken());
        }
      }
      diffFeaturesSet = result;
      Out.prln("Features: " + diffFeaturesSet + " <P>\n");

    } catch (IOException ex) {
      // Unreadable properties file: fall back to an empty configuration.
      this.configs = new Properties();
    }
  } else {
    this.configs = new Properties();
  }

  // The application is only needed when documents have to be processed.
  if (!this.isMarkedStored) {
    initPRs();
  }
}
151
152 public void execute(File dir) {
153 if (dir == null)
154 return;
155 currDir = dir;
157
158 File processedDir = null;
159 File cleanDir = null;
160 File markedDir = null;
161 File errorDir = null;
162
163 ArrayList subDirs = new ArrayList();
164 File[] dirArray = currDir.listFiles();
165 if(dirArray == null) return;
166 for (int i = 0; i < dirArray.length; i++) {
167 if (dirArray[i].isFile() || dirArray[i].getName().equals(CVS_DIR_NAME))
168 continue;
169 if (dirArray[i].getName().equals(CLEAN_DIR_NAME))
170 cleanDir = dirArray[i];
171 else if (dirArray[i].getName().equals(MARKED_DIR_NAME))
172 markedDir = dirArray[i];
173 else if (dirArray[i].getName().equals(PROCESSED_DIR_NAME))
174 processedDir = dirArray[i];
175 else if (dirArray[i].getName().equals(ERROR_DIR_NAME))
176 errorDir = dirArray[i];
177 else
178 subDirs.add(dirArray[i]);
179 }
180
181 if(cleanDir == null) return;
182 Out.prln("Processing directory: " + currDir + "<P>");
183
184 if (this.isGenerateMode)
185 generateCorpus(cleanDir, processedDir);
186 else
187 evaluateCorpus(cleanDir, processedDir, markedDir, errorDir);
188
189 if (subDirs.isEmpty())
191 return;
192
193 for (int j = 0; j < subDirs.size(); j++)
195 execute((File) subDirs.get(j));
196
197 }
199
200 public static void main(String[] args) throws GateException {
201 Out.prln("<HTML>");
202 Out.prln("<HEAD>");
203 Out.prln("<TITLE> Corpus benchmark tool: ran with args ");
204 for(int argC=0; argC < args.length; ++argC)
205 Out.pr(args[argC]+" ");
206 Out.pr(" on " + new Date() + "</TITLE> </HEAD>");
207 Out.prln("<BODY>");
208 Out.prln("Please wait while GATE tools are initialised. <P>");
209 Gate.init();
211
212 CorpusBenchmarkTool corpusTool = new CorpusBenchmarkTool();
213
214 List inputFiles = null;
215 if(args.length < 1) throw new GateException(usage);
216 int i = 0;
217 while (i < args.length && args[i].startsWith("-")) {
218 if(args[i].equals("-generate")) {
219 Out.prln("Generating the corpus... <P>");
220 corpusTool.setGenerateMode(true);
221 } else if (args[i].equals("-marked_clean")) {
222 Out.prln("Evaluating current grammars against human-annotated...<P>");
223 corpusTool.setMarkedClean(true);
224 } else if (args[i].equals("-marked_stored")) {
225 Out.prln("Evaluating stored documents against human-annotated...<P>");
226 corpusTool.setMarkedStored(true);
227 } else if (args[i].equals("-marked_ds")) {
228 Out.prln("Looking for marked docs in a datastore...<P>");
229 corpusTool.setMarkedDS(true);
230 } else if (args[i].equals("-verbose")) {
231 Out.prln("Running in verbose mode. Will generate annotation " +
232 "information when precision/recall are lower than " +
233 corpusTool.getThreshold() +"<P>");
234 corpusTool.setVerboseMode(true);
235 } else if (args[i].equals("-moreinfo")) {
236 Out.prln("Show more details in document table...<P>");
237 corpusTool.setMoreInfo(true);
238 }
239 i++; }
242 String dirName = args[i];
243 File dir = new File(dirName);
244 if (!dir.isDirectory())
245 throw new GateException(usage);
246
247 i++;
249 String appName = args[i];
250 File appFile = new File(appName);
251 if (!appFile.isFile())
252 throw new GateException(usage);
253 else
254 corpusTool.setApplicationFile(appFile);
255
256 corpusTool.init();
257 corpusWordCount = 0;
258
259 Out.prln("Measuring annotaitions of types: " + CorpusBenchmarkTool.annotTypes + "<P>");
260
261 corpusTool.setStartDirectory(dir);
262 corpusTool.execute();
263 System.out.println("Done Executing");
264 if (! corpusTool.getGenerateMode())
267 corpusTool.printStatistics();
268
269 Out.prln("<BR>Overall average precision: " + corpusTool.getPrecisionAverage());
270 Out.prln("<BR>Overall average recall: " + corpusTool.getRecallAverage());
271 if(corpusWordCount == 0)
272 Out.prln("<BR>No Token annotations to count words in the corpus.");
273 else
274 Out.prln("<BR>Overall word count: " + corpusWordCount);
275
276
277 if(hasProcessed) {
278 Out.prln("<P>Old Processed: ");
279 Out.prln("<BR>Overall average precision: "
280 + corpusTool.getPrecisionAverageProc());
281 Out.prln("<BR>Overall average recall: "
282 + corpusTool.getRecallAverageProc());
283 }
284 Out.prln("<BR>Finished! <P>");
285 Out.prln("</BODY>");
286 Out.prln("</HTML>");
287
288 System.exit(0);
289
290 }
292 public void setGenerateMode(boolean mode) {
293 isGenerateMode = mode;
294 }
296 public boolean getGenerateMode() {
297 return isGenerateMode;
298 }
300 public boolean getVerboseMode() {
301 return isVerboseMode;
302 }
304 public void setVerboseMode(boolean mode) {
305 isVerboseMode = mode;
306 }
308 public void setMoreInfo(boolean mode) {
309 isMoreInfoMode = mode;
310 }
312 public boolean getMoreInfo() {
313 return isMoreInfoMode;
314 }
316 public void setDiffFeaturesList(Set features) {
317 diffFeaturesSet = features;
318 }
320 public Set getDiffFeaturesList() {
321 return diffFeaturesSet;
322 }
324 public void setMarkedStored(boolean mode) {
325 isMarkedStored = mode;
326 }
328
329 public boolean getMarkedStored() {
330 return isMarkedStored;
331 }
333 public void setMarkedClean(boolean mode) {
334 isMarkedClean = mode;
335 }
337 public boolean getMarkedClean() {
338 return isMarkedClean;
339 }
341 public void setMarkedDS(boolean mode) {
342 isMarkedDS = mode;
343 }
345 public boolean getMarkedDS() {
346 return isMarkedDS;
347 }
349 public void setApplicationFile(File newAppFile) {
350 applicationFile = newAppFile;
351 }
352
353
363 public double getPrecisionAverage() {
364 return precisionSum/docNumber;
365 }
366
367
377 public double getRecallAverage() {
378 return recallSum/docNumber;
379 }
380
381
382 public double getPrecisionAverageProc() {
383 return proc_precisionSum/docNumber;
384 }
385 public double getRecallAverageProc() {
386 return proc_recallSum/docNumber;
387 }
388
389
390 public boolean isGenerateMode() {
391 return isGenerateMode == true;
392 }
394 public double getThreshold() {
395 return threshold;
396 }
397
398 public void setThreshold(double newValue) {
399 threshold = newValue;
400 }
401
402 public File getStartDirectory() {
403 return startDir;
404 }
406 public void setStartDirectory(File dir) {
407 startDir = dir;
408 }
410 protected void generateCorpus(File fileDir, File outputDir) {
411 if (fileDir == null)
413 return;
414 File outDir = outputDir;
416 if (outputDir == null) {
417 outDir = new File(currDir, PROCESSED_DIR_NAME);
418 } else {
419 if (!Files.rmdir(outDir))
421 Out.prln("cannot delete old output directory: " + outDir);
422 }
423 outDir.mkdir();
424
425 try {
427 SerialDataStore sds = new SerialDataStore(outDir.toURL().toString());
428 sds.create();
429 sds.open();
430
431 File[] files = fileDir.listFiles();
432 for (int i=0; i < files.length; i++) {
433 if (!files[i].isFile())
434 continue;
435 Out.prln("Processing and storing document: " + files[i].toURL() +"<P>");
437
438 FeatureMap params = Factory.newFeatureMap();
439 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, files[i].toURL());
440 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
441
442 Document doc = (Document) Factory.createResource(
444 "gate.corpora.DocumentImpl", params
445 );
446
447 doc.setName(files[i].getName());
448 if (doc == null)
449 continue;
450 processDocument(doc);
451 LanguageResource lr = sds.adopt(doc, null);
452 sds.sync(lr);
453 Factory.deleteResource(doc);
454 Factory.deleteResource(lr);
455 } sds.close();
457 } catch (java.net.MalformedURLException ex) {
458 throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
459 } catch (PersistenceException ex1) {
460 throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
461 } catch (ResourceInstantiationException ex2) {
462 throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
463 } catch (gate.security.SecurityException ex3) {
464 throw new GateRuntimeException("CorpusBenchmark: " + ex3.getMessage());
465 }
466 System.out.println("Done");
467 }
469 protected void evaluateCorpus(File fileDir,
470 File processedDir, File markedDir,
471 File errorDir) {
472 if (fileDir == null || !fileDir.exists())
474 return;
475 if (processedDir == null || !processedDir.exists())
476 if (isMarkedStored) {
478 Out.prln("Cannot evaluate because no processed documents exist.");
479 return;
480 }
481 else
482 isMarkedClean = true;
483
484 File errDir = null;
486 if(isMoreInfoMode) {
487 errDir = errorDir;
488 if (errDir == null) {
489 errDir = new File(currDir, ERROR_DIR_NAME);
490 }
491 else {
492 if (!Files.rmdir(errDir))
494 Out.prln("cannot delete old error directory: " + errDir);
495 }
496 Out.prln("Create error directory: " + errDir + "<BR><BR>");
497 errDir.mkdir();
498 }
499
500 boolean processMarked = markedDir != null && markedDir.exists();
502 if (!processMarked && (isMarkedStored || isMarkedClean)) {
503 Out.prln("Cannot evaluate because no human-annotated documents exist.");
504 return;
505 }
506
507 if (isMarkedStored) {
508 evaluateMarkedStored(markedDir, processedDir, errDir);
509 return;
510 } else if (isMarkedClean) {
511 evaluateMarkedClean(markedDir, fileDir, errDir);
512 return;
513 }
514
515 Document persDoc = null;
516 Document cleanDoc = null;
517 Document markedDoc = null;
518
519 try {
521 DataStore sds = Factory.openDataStore
523 ("gate.persist.SerialDataStore",
524 processedDir.toURL().toExternalForm());
525
526 List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
527 for (int i=0; i < lrIDs.size(); i++) {
528 String docID = (String) lrIDs.get(i);
529
530 FeatureMap features = Factory.newFeatureMap();
532 features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
533 features.put(DataStore.LR_ID_FEATURE_NAME, docID);
534 persDoc = (Document) Factory.createResource(
535 "gate.corpora.DocumentImpl",
536 features);
537
538 if(isMoreInfoMode) {
539 StringBuffer errName = new StringBuffer(persDoc.getName());
540 errName.replace(
541 persDoc.getName().lastIndexOf("."),
542 persDoc.getName().length(),
543 ".err");
544 Out.prln("<H2>" +
545 "<a href=\"err/" + errName.toString() + "\">"
546 + persDoc.getName() + "</a>" + "</H2>");
547 } else
548 Out.prln("<H2>" + persDoc.getName() + "</H2>");
549
550 File cleanDocFile = new File(fileDir, persDoc.getName());
551 if (! cleanDocFile.exists()) {
553 Out.prln("Warning: Cannot find original document " +
554 persDoc.getName() + " in " + fileDir);
555 } else {
556 FeatureMap params = Factory.newFeatureMap();
557 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocFile.toURL());
558 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
559
560 cleanDoc = (Document) Factory.createResource(
562 "gate.corpora.DocumentImpl", params);
563 cleanDoc.setName(persDoc.getName());
564 }
565
566 StringBuffer docName = new StringBuffer(persDoc.getName());
568 if (! isMarkedDS) {
569 docName.replace(
570 persDoc.getName().lastIndexOf("."),
571 docName.length(),
572 ".xml");
573 File markedDocFile = new File(markedDir, docName.toString());
574 if (! processMarked || ! markedDocFile.exists()) {
575 Out.prln("Warning: Cannot find human-annotated document " +
576 markedDocFile + " in " + markedDir);
577 } else {
578 FeatureMap params = Factory.newFeatureMap();
579 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL());
580 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
581
582 markedDoc = (Document) Factory.createResource(
584 "gate.corpora.DocumentImpl", params);
585 markedDoc.setName(persDoc.getName());
586 }
587 } else {
588 DataStore sds1 = Factory.openDataStore
591 ("gate.persist.SerialDataStore",
592 markedDir.toURL().toExternalForm());
593
594 List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
595 boolean found = false;
596 int k = 0;
597 while (k < lrIDs1.size() && !found) {
599 String docID1 = (String) lrIDs1.get(k);
600
601 FeatureMap features1 = Factory.newFeatureMap();
603 features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
604 features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
605 Document tempDoc = (Document) Factory.createResource(
606 "gate.corpora.DocumentImpl",
607 features1);
608 if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")).
610 endsWith(persDoc.getName())) {
611 found = true;
612 markedDoc = tempDoc;
613 } else k++;
614 }
615 }
616
617 evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
618 if (persDoc != null)
619 Factory.deleteResource(persDoc);
620 if (cleanDoc != null)
621 Factory.deleteResource(cleanDoc);
622 if (markedDoc != null)
623 Factory.deleteResource(markedDoc);
624
625 } sds.close();
627 } catch (java.net.MalformedURLException ex) {
628 throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
629 } catch (PersistenceException ex1) {
630 throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
631 } catch (ResourceInstantiationException ex2) {
632 throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
633 }
634
635 }
637 protected void evaluateMarkedStored(File markedDir, File storedDir, File errDir) {
638 Document persDoc = null;
639 Document cleanDoc = null;
640 Document markedDoc = null;
641
642 try {
644 DataStore sds = Factory.openDataStore
646 ("gate.persist.SerialDataStore",
647 storedDir.toURL().toExternalForm());
648
649 List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
650 for (int i=0; i < lrIDs.size(); i++) {
651 String docID = (String) lrIDs.get(i);
652
653 FeatureMap features = Factory.newFeatureMap();
655 features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
656 features.put(DataStore.LR_ID_FEATURE_NAME, docID);
657 persDoc = (Document) Factory.createResource(
658 "gate.corpora.DocumentImpl",
659 features);
660
661 if(isMoreInfoMode) {
662 StringBuffer errName = new StringBuffer(persDoc.getName());
663 errName.replace(
664 persDoc.getName().lastIndexOf("."),
665 persDoc.getName().length(),
666 ".err");
667 Out.prln("<H2>" +
668 "<a href=\"err/" + errName.toString() + "\">"
669 + persDoc.getName() + "</a>" + "</H2>");
670 } else
671 Out.prln("<H2>" + persDoc.getName() + "</H2>");
672
673 if (! this.isMarkedDS) { StringBuffer docName = new StringBuffer(persDoc.getName());
675 docName.replace(
676 persDoc.getName().lastIndexOf("."),
677 docName.length(),
678 ".xml");
679 File markedDocFile = new File(markedDir, docName.toString());
680 if (! markedDocFile.exists()) {
681 Out.prln("Warning: Cannot find human-annotated document " +
682 markedDocFile + " in " + markedDir);
683 } else {
684 FeatureMap params = Factory.newFeatureMap();
685 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL());
686 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
687
688 markedDoc = (Document) Factory.createResource(
690 "gate.corpora.DocumentImpl", params);
691 markedDoc.setName(persDoc.getName());
692 } } else {
694 try {
695 DataStore sds1 = Factory.openDataStore
698 ("gate.persist.SerialDataStore",
699 markedDir.toURL().toExternalForm());
700
701 List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
702 boolean found = false;
703 int k = 0;
704 while (k < lrIDs1.size() && !found) {
706 String docID1 = (String) lrIDs1.get(k);
707
708 FeatureMap features1 = Factory.newFeatureMap();
710 features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
711 features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
712 Document tempDoc = (Document) Factory.createResource(
713 "gate.corpora.DocumentImpl",
714 features1);
715 if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")).
717 endsWith(persDoc.getName())) {
718 found = true;
719 markedDoc = tempDoc;
720 } else k++;
721 }
722 } catch (java.net.MalformedURLException ex) {
723 Out.prln("Error finding marked directory " + markedDir.getAbsolutePath());
724 } catch (gate.persist.PersistenceException ex1) {
725 Out.prln("Error opening marked as a datastore (-marked_ds specified)");
726 } catch (gate.creole.ResourceInstantiationException ex2) {
727 Out.prln("Error opening marked as a datastore (-marked_ds specified)");
728 }
729 }
730
731 evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
732 if (persDoc != null)
733 Factory.deleteResource(persDoc);
734 if (markedDoc != null)
735 Factory.deleteResource(markedDoc);
736
737 } sds.close();
739
740 } catch (java.net.MalformedURLException ex) {
741 throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
742 } catch (PersistenceException ex1) {
743 throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
744 } catch (ResourceInstantiationException ex2) {
745 throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
746 }
747
748 }
750
751 protected void evaluateMarkedClean(File markedDir, File cleanDir, File errDir) {
752 Document persDoc = null;
753 Document cleanDoc = null;
754 Document markedDoc = null;
755
756 File[] cleanDocs = cleanDir.listFiles();
757 for (int i = 0; i< cleanDocs.length; i++) {
758 if (!cleanDocs[i].isFile())
759 continue;
760
761 FeatureMap params = Factory.newFeatureMap();
763 try {
764 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocs[i].toURL());
765 } catch (java.net.MalformedURLException ex) {
766 Out.prln("Cannot create document from file: " +
767 cleanDocs[i].getAbsolutePath());
768 continue;
769 }
770 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
771
772 try {
774 cleanDoc = (Document) Factory.createResource(
775 "gate.corpora.DocumentImpl", params,
776 null, cleanDocs[i].getName());
777 } catch (gate.creole.ResourceInstantiationException ex) {
778 Out.prln("Cannot create document from file: " +
779 cleanDocs[i].getAbsolutePath());
780 continue;
781 }
782
783 if(isMoreInfoMode) {
784 StringBuffer errName = new StringBuffer(cleanDocs[i].getName());
785 errName.replace(
786 cleanDocs[i].getName().lastIndexOf("."),
787 cleanDocs[i].getName().length(),
788 ".err");
789 Out.prln("<H2>" +
790 "<a href=\"err/" + errName.toString() + "\">"
791 + cleanDocs[i].getName() + "</a>" + "</H2>");
792 } else
793 Out.prln("<H2>" + cleanDocs[i].getName() + "</H2>");
794
795 if (! isMarkedDS) {
797 StringBuffer docName = new StringBuffer(cleanDoc.getName());
798 docName.replace(
799 cleanDoc.getName().lastIndexOf("."),
800 docName.length(),
801 ".xml");
802 File markedDocFile = new File(markedDir, docName.toString());
803 if (! markedDocFile.exists()) {
804 Out.prln("Warning: Cannot find human-annotated document " +
805 markedDocFile + " in " + markedDir);
806 continue;
807 } else {
808 params = Factory.newFeatureMap();
809 try {
810 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL());
811 } catch (java.net.MalformedURLException ex) {
812 Out.prln("Cannot create document from file: " +
813 markedDocFile.getAbsolutePath());
814 continue;
815 }
816 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
817
818 try {
820 markedDoc = (Document) Factory.createResource(
821 "gate.corpora.DocumentImpl", params,
822 null, cleanDoc.getName());
823 } catch (gate.creole.ResourceInstantiationException ex) {
824 Out.prln("Cannot create document from file: " +
825 markedDocFile.getAbsolutePath());
826 continue;
827 }
828
829 } } else {
831 try {
832 DataStore sds1 = Factory.openDataStore
835 ("gate.persist.SerialDataStore",
836 markedDir.toURL().toExternalForm());
837
838 List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
839 boolean found = false;
840 int k = 0;
841 while (k < lrIDs1.size() && !found) {
843 String docID1 = (String) lrIDs1.get(k);
844
845 FeatureMap features1 = Factory.newFeatureMap();
847 features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
848 features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
849 Document tempDoc = (Document) Factory.createResource(
850 "gate.corpora.DocumentImpl",
851 features1);
852 if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")).
854 endsWith(cleanDoc.getName())) {
855 found = true;
856 markedDoc = tempDoc;
857 } else k++;
858 }
859 } catch (java.net.MalformedURLException ex) {
860 Out.prln("Error finding marked directory " + markedDir.getAbsolutePath());
861 } catch (gate.persist.PersistenceException ex1) {
862 Out.prln("Error opening marked as a datastore (-marked_ds specified)");
863 } catch (gate.creole.ResourceInstantiationException ex2) {
864 Out.prln("Error opening marked as a datastore (-marked_ds specified)");
865 }
866 }
868 try {
869 evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
870 } catch (gate.creole.ResourceInstantiationException ex) {
871 ex.printStackTrace();
872 Out.prln("Evaluate failed on document: " + cleanDoc.getName());
873 }
874 if (persDoc != null)
875 Factory.deleteResource(persDoc);
876 if (cleanDoc != null)
877 Factory.deleteResource(cleanDoc);
878 if (markedDoc != null)
879 Factory.deleteResource(markedDoc);
880
881 }
883
884 }
886 protected void processDocument(Document doc) {
887 try {
888 if (application instanceof CorpusController) {
889 Corpus tempCorpus = Factory.newCorpus("temp");
890 tempCorpus.add(doc);
891 ((CorpusController)application).setCorpus(tempCorpus);
892 application.execute();
893 Factory.deleteResource(tempCorpus);
894 tempCorpus = null;
895 } else {
896 Iterator iter = application.getPRs().iterator();
897 while (iter.hasNext())
898 ((ProcessingResource) iter.next()).setParameterValue("document", doc);
899 application.execute();
900 }
901 } catch (ResourceInstantiationException ex) {
902 throw new RuntimeException("Error executing application: "
903 + ex.getMessage());
904 } catch (ExecutionException ex) {
905 throw new RuntimeException("Error executing application: "
906 + ex.getMessage());
907 }
908 }
909
910 protected void evaluateDocuments(Document persDoc,
911 Document cleanDoc, Document markedDoc,
912 File errDir)
913 throws ResourceInstantiationException {
914 if (cleanDoc == null && markedDoc == null)
915 return;
916
917 if (annotTypes == null || annotTypes.isEmpty())
919 return;
920
921 if (cleanDoc != null && !isMarkedStored) {
922
923 processDocument(cleanDoc);
924
925
926 int wordCount = countWords(cleanDoc);
927 if(wordCount == 0)
928 Out.prln("<BR>No Token annotations to count words in the document.");
929 else
930 Out.prln("<BR>Word count: " + wordCount);
931 corpusWordCount += wordCount;
932
933 if(!isMarkedClean)
934 evaluateAllThree(persDoc, cleanDoc, markedDoc, errDir);
935 else
936 evaluateTwoDocs(markedDoc, cleanDoc, errDir);
937
938 } else
939 evaluateTwoDocs(markedDoc, persDoc, errDir);
940
941 }
942
943
946 protected int countWords(Document annotDoc) {
947 int count = 0;
948
949 if (annotDoc == null) return 0;
950 AnnotationSet tokens = annotDoc.getAnnotations(outputSetName).get("Token");
952 if (tokens == null) return 0;
953
954 Iterator it = tokens.iterator();
955 Annotation currAnnotation;
956 while (it.hasNext()) {
957 currAnnotation = (Annotation) it.next();
958 Object feature = currAnnotation.getFeatures().get("kind");
959 if(feature != null && "word".equalsIgnoreCase((String)feature)) ++count;
960 }
962 return count;
963 }
964
965 protected void evaluateAllThree(Document persDoc,
966 Document cleanDoc, Document markedDoc,
967 File errDir)
968 throws ResourceInstantiationException {
969 printTableHeader();
971
972 Writer errWriter = null;
974 if (isMoreInfoMode && errDir != null) {
975 StringBuffer docName = new StringBuffer(cleanDoc.getName());
976 docName.replace(
977 cleanDoc.getName().lastIndexOf("."),
978 docName.length(),
979 ".err");
980 File errFile = new File(errDir, docName.toString());
981 String encoding = ((gate.corpora.DocumentImpl)cleanDoc).getEncoding();
982 try {
983 errWriter = new FileWriter(errFile, false);
984
992 }
993 catch (Exception ex) {
994 Out.prln("Exception when creating the error file " + errFile + ": "
995 + ex.getMessage());
996 errWriter = null;
997 }
998 }
999
1000 for (int jj= 0; jj< annotTypes.size(); jj++) {
1001 String annotType = (String) annotTypes.get(jj);
1002
1003 AnnotationDiffer annotDiffer = measureDocs(markedDoc, cleanDoc, annotType);
1004 if (annotDiffer == null)
1006 continue;
1007
1008 docNumber++;
1010 updateStatistics(annotDiffer, annotType);
1012
1013 AnnotationDiffer annotDiffer1 =
1014 measureDocs(markedDoc, persDoc, annotType);
1015
1016 Out.prln("<TR>");
1017
1018 if(isMoreInfoMode && annotDiffer1 != null
1019 && (annotDiffer1.getPrecisionAverage() != annotDiffer.getPrecisionAverage()
1020 || annotDiffer1.getRecallAverage() != annotDiffer.getRecallAverage())
1021 )
1022 Out.prln("<TD> " + annotType + "_new"+ "</TD>");
1023 else
1024 Out.prln("<TD> " + annotType + "</TD>");
1025
1026 if (isMoreInfoMode) {
1027 if(annotDiffer1 != null) updateStatisticsProc(annotDiffer1, annotType);
1028
1029 Out.prln("<TD>" + annotDiffer.getCorrectMatches() + "</TD>");
1030 Out.prln("<TD>" + annotDiffer.getPartiallyCorrectMatches() + "</TD>");
1031 Out.prln("<TD>" + annotDiffer.getMissing() + "</TD>");
1032 Out.prln("<TD>" + annotDiffer.getSpurious() + "</TD>");
1033 }
1034
1035 Out.prln("<TD>");
1036
1037 if (annotDiffer1 != null) {
1039
1040 if (annotDiffer1.getPrecisionAverage()
1041 < annotDiffer.getPrecisionAverage()) {
1042 Out.prln("<P><Font color=blue> ");
1043 Out.prln(annotDiffer.getPrecisionAverage());
1044
1045 if(!isMoreInfoMode) {
1046 Out.pr("<BR>Precision increase on human-marked from ");
1047 Out.pr(annotDiffer1.getPrecisionAverage() + " to ");
1048 Out.prln(annotDiffer.getPrecisionAverage());
1049 }
1050 Out.prln(" </Font></P>");
1051 }
1052 else if (annotDiffer1.getPrecisionAverage()
1053 > annotDiffer.getPrecisionAverage()) {
1054 Out.prln("<P><Font color=red> ");
1055 Out.prln(annotDiffer.getPrecisionAverage());
1056
1057 if(!isMoreInfoMode) {
1058 Out.pr("<BR>Precision decrease on human-marked from ");
1059 Out.pr(annotDiffer1.getPrecisionAverage() + " to ");
1060 Out.prln(annotDiffer.getPrecisionAverage());
1061 }
1062 Out.prln(" </Font></P>");
1063 }
1064 else
1065 Out.prln("<P> " + (double) annotDiffer.getPrecisionAverage() + " </P>");
1066 }
1067 else
1068 Out.prln("<P> " + annotDiffer.getPrecisionAverage() + " </P>");
1069
1070 Out.prln("</TD>");
1071
1072 Out.prln("<TD>");
1073
1074 if (annotDiffer1 != null) {
1076
1077 if (annotDiffer1.getRecallAverage() < annotDiffer.getRecallAverage()) {
1078 Out.prln("<P><Font color=blue> ");
1079 Out.prln(annotDiffer.getRecallAverage());
1080
1081 if(!isMoreInfoMode) {
1082 Out.pr("<BR>Recall increase on human-marked from ");
1083 Out.pr(annotDiffer1.getRecallAverage() + " to ");
1084 Out.prln(annotDiffer.getRecallAverage());
1085 }
1086 Out.prln(" </Font></P>");
1087 }
1088 else if (annotDiffer1.getRecallAverage() > annotDiffer.getRecallAverage()) {
1089 Out.prln("<P><Font color=red> ");
1090 Out.prln(annotDiffer.getRecallAverage());
1091
1092 if(!isMoreInfoMode) {
1093 Out.pr("<BR>Recall decrease on human-marked from ");
1094 Out.pr(annotDiffer1.getRecallAverage() + " to ");
1095 Out.prln(annotDiffer.getRecallAverage());
1096 }
1097 Out.prln(" </Font></P>");
1098 }
1099 else
1100 Out.prln("<P> " + annotDiffer.getRecallAverage() + " </P>");
1101 } else
1102 Out.prln("<P> " + annotDiffer.getRecallAverage() + " </P>");
1103
1104
1105 Out.prln("</TD>");
1106
1107 if ( isVerboseMode ) {
1109 Out.prln("<TD>");
1110 if (annotDiffer.getRecallAverage() < threshold) {
1111 printAnnotations(annotDiffer, markedDoc, cleanDoc);
1112 }
1113 else {
1114 Out.prln(" ");
1115 }
1116 Out.prln("</TD>");
1117 }
1118
1119 Out.prln("</TR>");
1120
1121 if(isMoreInfoMode && annotDiffer1 != null
1123 && (annotDiffer1.getPrecisionAverage() != annotDiffer.getPrecisionAverage()
1124 || annotDiffer1.getRecallAverage() != annotDiffer.getRecallAverage())
1125 ) {
1126
1127 Out.prln("<TR>");
1128 Out.prln("<TD> " + annotType + "_old" + "</TD>");
1129
1130 Out.prln("<TD>" + annotDiffer1.getCorrectMatches() + "</TD>");
1131 Out.prln("<TD>" + annotDiffer1.getPartiallyCorrectMatches() + "</TD>");
1132 Out.prln("<TD>" + annotDiffer1.getMissing() + "</TD>");
1133 Out.prln("<TD>" + annotDiffer1.getSpurious() + "</TD>");
1134
1135 Out.prln("<TD>");
1136 if (annotDiffer1.getPrecisionAverage() < annotDiffer.getPrecisionAverage())
1137
1138 Out.prln("<P><Font color=blue> " + annotDiffer1.getPrecisionAverage()
1139 + "</Font></P>");
1140 else if (annotDiffer1.getPrecisionAverage() > annotDiffer.getPrecisionAverage())
1141 Out.prln(
1142 "<P><Font color=red> " + annotDiffer1.getPrecisionAverage()
1143 + " </Font></P>");
1144 else
1145 Out.prln(annotDiffer1.getPrecisionAverage());
1146
1147 Out.prln("</TD>");
1148
1149 Out.prln("<TD>");
1150 if (annotDiffer1.getRecallAverage() < annotDiffer.getRecallAverage())
1151 Out.prln("<P><Font color=blue> " + annotDiffer1.getRecallAverage()
1152 + " </Font></P>");
1153 else if (annotDiffer1.getRecallAverage() > annotDiffer.getRecallAverage())
1154 Out.prln("<P><Font color=red> " + annotDiffer1.getRecallAverage()
1155 + " </Font></P>");
1156 else
1157 Out.prln(annotDiffer1.getRecallAverage());
1158
1159 Out.prln("</TD>");
1160
1161 if ( isVerboseMode ) {
1163
1165 Out.prln("<TD>");
1166 if (annotDiffer.getRecallAverage() < threshold) {
1167 printAnnotations(annotDiffer, markedDoc, cleanDoc);
1168 }
1169 else {
1170 Out.prln(" ");
1171 }
1172 Out.prln("</TD>");
1173 }
1174 Out.prln("</TR>");
1175 }
1177 if (isMoreInfoMode && errDir != null)
1178 storeAnnotations(annotType, annotDiffer, markedDoc, cleanDoc, errWriter);
1179 } Out.prln("</TABLE>");
1181
1182 try {
1183 if(errWriter != null)
1184 errWriter.close();
1185 }
1186 catch (Exception ex) {
1187 Out.prln("Exception on close of error file " + errWriter + ": "
1188 + ex.getMessage());
1189 }
1190 }
1192 protected void evaluateTwoDocs(Document keyDoc, Document respDoc,
1193 File errDir)
1194 throws ResourceInstantiationException {
1195
1196 printTableHeader();
1198
1199 Writer errWriter = null;
1201 if (isMoreInfoMode && errDir != null) {
1202 StringBuffer docName = new StringBuffer(keyDoc.getName());
1203 docName.replace(
1204 keyDoc.getName().lastIndexOf("."),
1205 docName.length(),
1206 ".err");
1207 File errFile = new File(errDir, docName.toString());
1208 String encoding = ((gate.corpora.DocumentImpl)keyDoc).getEncoding();
1209 try {
1210 errWriter = new FileWriter(errFile, false);
1211
1219 }
1220 catch (Exception ex) {
1221 Out.prln("Exception when creating the error file " + errFile + ": "
1222 + ex.getMessage());
1223 errWriter = null;
1224 }
1225 }
1226
1227 for (int jj= 0; jj< annotTypes.size(); jj++) {
1228 String annotType = (String) annotTypes.get(jj);
1229
1230 AnnotationDiffer annotDiff = measureDocs(keyDoc, respDoc, annotType);
1231 if (annotDiff == null)
1233 continue;
1234
1235 docNumber++;
1237 updateStatistics(annotDiff, annotType);
1239
1240 Out.prln("<TR>");
1241 Out.prln("<TD>" + annotType + "</TD>");
1242
1243 if(isMoreInfoMode) {
1244 Out.prln("<TD>" + annotDiff.getCorrectMatches() + "</TD>");
1245 Out.prln("<TD>" + annotDiff.getPartiallyCorrectMatches() + "</TD>");
1246 Out.prln("<TD>" + annotDiff.getMissing() + "</TD>");
1247 Out.prln("<TD>" + annotDiff.getSpurious() + "</TD>");
1248 }
1249
1250 Out.prln("<TD>" + annotDiff.getPrecisionAverage() + "</TD>");
1251 Out.prln("<TD>" + annotDiff.getRecallAverage() + "</TD>");
1252 if ( isVerboseMode ) {
1254 Out.prln("<TD>");
1255 if (annotDiff.getRecallAverage() < threshold) {
1256 printAnnotations(annotDiff, keyDoc, respDoc);
1257 }
1258 else {
1259 Out.prln(" ");
1260 }
1261 Out.prln("</TD>");
1262 }
1263 Out.prln("</TR>");
1264
1265 if (isMoreInfoMode && errDir != null)
1266 storeAnnotations(annotType, annotDiff, keyDoc, respDoc, errWriter);
1267 } Out.prln("</TABLE>");
1269
1270 try {
1271 if(errWriter != null)
1272 errWriter.close();
1273 }
1274 catch (Exception ex) {
1275 Out.prln("Exception on close of error file " + errWriter + ": "
1276 + ex.getMessage());
1277 }
1278 }
1280 protected void printTableHeader() {
1281 Out.prln("<TABLE BORDER=1");
1282 Out.pr("<TR> <TD><B>Annotation Type</B></TD> ");
1283
1284 if(isMoreInfoMode)
1285 Out.pr("<TD><B>Correct</B></TD> <TD><B>Partially Correct</B></TD> "
1286 + "<TD><B>Missing</B></TD> <TD><B>Spurious<B></TD>");
1287
1288 Out.pr("<TD><B>Precision</B></TD> <TD><B>Recall</B></TD>");
1289
1290 if (isVerboseMode)
1291 Out.pr("<TD><B>Annotations</B></TD>");
1292
1293 Out.prln("</TR>");
1294 }
1295
1296 protected void updateStatistics(AnnotationDiffer annotDiffer, String annotType){
1297 double precisionAverage = ((double)(annotDiffer.getPrecisionLenient() + annotDiffer.getPrecisionStrict()) /
1298(double)(2.0));
1299 precisionSum += precisionAverage;
1300
1301 double recallAverage = ((double)(annotDiffer.getRecallLenient() + annotDiffer.getRecallStrict()) / (double) (2.0));
1302 recallSum += recallAverage;
1303
1304 double fMeasureAverage = ((double) (annotDiffer.getFMeasureLenient(1) + annotDiffer.getFMeasureStrict(1)) /
1305(double) (2.0));
1306 fMeasureSum += fMeasureAverage;
1307
1308 Double oldPrecision = (Double) precisionByType.get(annotType);
1309 if (oldPrecision == null)
1310 precisionByType.put(annotType, new Double(precisionAverage));
1311 else
1312 precisionByType.put(annotType, new Double(oldPrecision.doubleValue() + precisionAverage));
1313
1314 Integer precCount = (Integer) prCountByType.get(annotType);
1315 if (precCount == null)
1316 prCountByType.put(annotType, new Integer(1));
1317 else
1318 prCountByType.put(annotType, new Integer(precCount.intValue() + 1));
1319
1320
1321 Double oldFMeasure = (Double) fMeasureByType.get(annotType);
1322 if (oldFMeasure == null)
1323 fMeasureByType.put(annotType, new Double(fMeasureAverage));
1324 else
1325 fMeasureByType.put(annotType, new Double(oldFMeasure.doubleValue() + fMeasureAverage));
1326
1327 Integer fCount = (Integer) fMeasureCountByType.get(annotType);
1328 if (fCount == null)
1329 fMeasureCountByType.put(annotType, new Integer(1));
1330 else
1331 fMeasureCountByType.put(annotType, new Integer(fCount.intValue() + 1));
1332
1333 Double oldRecall = (Double) recallByType.get(annotType);
1334 if (oldRecall == null)
1335 recallByType.put(annotType, new Double(recallAverage));
1336 else
1337 recallByType.put(annotType, new Double(oldRecall.doubleValue() + recallAverage));
1338
1339 Integer recCount = (Integer) recCountByType.get(annotType);
1340 if (recCount == null)
1341 recCountByType.put(annotType, new Integer(1));
1342 else
1343 recCountByType.put(annotType, new Integer(recCount.intValue() + 1));
1344
1345 Long oldMissingNo = (Long) missingByType.get(annotType);
1347 if (oldMissingNo == null)
1348 missingByType.put(annotType, new Long(annotDiffer.getMissing()));
1349 else
1350 missingByType.put(annotType, new Long(oldMissingNo.longValue() + annotDiffer.getMissing()));
1351
1352 Long oldCorrectNo = (Long) correctByType.get(annotType);
1353 if (oldCorrectNo == null)
1354 correctByType.put(annotType, new Long(annotDiffer.getCorrectMatches()));
1355 else
1356 correctByType.put(annotType, new Long(oldCorrectNo.longValue() + annotDiffer.getCorrectMatches()));
1357
1358 Long oldPartialNo = (Long) partialByType.get(annotType);
1359 if (oldPartialNo == null)
1360 partialByType.put(annotType, new Long(annotDiffer.getPartiallyCorrectMatches()));
1361 else
1362 partialByType.put(annotType, new Long(oldPartialNo.longValue() + annotDiffer.getPartiallyCorrectMatches()));
1363
1364 Long oldSpuriousNo = (Long) spurByType.get(annotType);
1365 if (oldSpuriousNo == null)
1366 spurByType.put(annotType, new Long(annotDiffer.getSpurious()));
1367 else
1368 spurByType.put(annotType, new Long(oldSpuriousNo.longValue() + annotDiffer.getSpurious()));
1369 }
1370
1371
1375 protected void updateStatisticsProc(AnnotationDiffer annotDiffer, String annotType){
1376 hasProcessed = true;
1377 double precisionAverage = ((double)(annotDiffer.getPrecisionLenient() + annotDiffer.getPrecisionStrict()) /
1378(double)(2.0));
1379 proc_precisionSum += precisionAverage;
1380
1381 double recallAverage = ((double)(annotDiffer.getRecallLenient() + annotDiffer.getRecallStrict()) / (double) (2.0));
1382 proc_recallSum += recallAverage;
1383
1384 double fMeasureAverage = ((double) (annotDiffer.getFMeasureLenient(1) + annotDiffer.getFMeasureStrict(1)) /
1385(double) (2.0));
1386 proc_fMeasureSum += fMeasureAverage;
1387
1388 Double oldPrecision = (Double) proc_precisionByType.get(annotType);
1389 if (oldPrecision == null)
1390 proc_precisionByType.put(annotType, new Double(precisionAverage));
1391 else
1392 proc_precisionByType.put(annotType,
1393 new Double(oldPrecision.doubleValue() +
1394 precisionAverage));
1395 Integer precCount = (Integer) proc_prCountByType.get(annotType);
1396 if (precCount == null)
1397 proc_prCountByType.put(annotType, new Integer(1));
1398 else
1399 proc_prCountByType.put(annotType, new Integer(precCount.intValue() + 1));
1400
1401
1402 Double oldFMeasure = (Double) proc_fMeasureByType.get(annotType);
1403 if (oldFMeasure == null)
1404 proc_fMeasureByType.put(annotType,
1405 new Double(fMeasureAverage));
1406 else
1407 proc_fMeasureByType.put(annotType,
1408 new Double(oldFMeasure.doubleValue() +
1409 fMeasureAverage));
1410 Integer fCount = (Integer) proc_fMeasureCountByType.get(annotType);
1411 if (fCount == null)
1412 proc_fMeasureCountByType.put(annotType, new Integer(1));
1413 else
1414 proc_fMeasureCountByType.put(annotType, new Integer(fCount.intValue() + 1));
1415
1416 Double oldRecall = (Double) proc_recallByType.get(annotType);
1417 if (oldRecall == null)
1418 proc_recallByType.put(annotType,
1419 new Double(recallAverage));
1420 else
1421 proc_recallByType.put(annotType,
1422 new Double(oldRecall.doubleValue() +
1423 recallAverage));
1424 Integer recCount = (Integer) proc_recCountByType.get(annotType);
1425 if (recCount == null)
1426 proc_recCountByType.put(annotType, new Integer(1));
1427 else
1428 proc_recCountByType.put(annotType, new Integer(recCount.intValue() + 1));
1429
1430 Long oldMissingNo = (Long) proc_missingByType.get(annotType);
1432 if (oldMissingNo == null)
1433 proc_missingByType.put(annotType, new Long(annotDiffer.getMissing()));
1434 else
1435 proc_missingByType.put(annotType,
1436 new Long(oldMissingNo.longValue() +
1437 annotDiffer.getMissing()));
1438
1439 Long oldCorrectNo = (Long) proc_correctByType.get(annotType);
1440 if (oldCorrectNo == null)
1441 proc_correctByType.put(annotType, new Long(annotDiffer.getCorrectMatches()));
1442 else
1443 proc_correctByType.put(annotType,
1444 new Long(oldCorrectNo.longValue() +
1445 annotDiffer.getCorrectMatches()));
1446
1447 Long oldPartialNo = (Long) proc_partialByType.get(annotType);
1448 if (oldPartialNo == null)
1449 proc_partialByType.put(annotType, new Long(annotDiffer.getPartiallyCorrectMatches()));
1450 else
1451 proc_partialByType.put(annotType,
1452 new Long(oldPartialNo.longValue() +
1453 annotDiffer.getPartiallyCorrectMatches()));
1454
1455 Long oldSpuriousNo = (Long) proc_spurByType.get(annotType);
1456 if (oldSpuriousNo == null)
1457 proc_spurByType.put(annotType, new Long(annotDiffer.getSpurious()));
1458 else
1459 proc_spurByType.put(annotType,
1460 new Long(oldSpuriousNo.longValue() +
1461 annotDiffer.getSpurious()));
1462 }
1463
1464 public void printStatistics() {
1465
1466 Out.prln("<H2> Statistics </H2>");
1467
1468
1499 if (annotTypes == null) {
1500 Out.prln("No types given for evaluation, cannot obtain precision/recall");
1501 return;
1502 }
1503 Out.prln("<table border=1>");
1504 Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Correct</B></TD>" +
1505 "<TD><B>Partially Correct</B></TD> <TD><B>Missing</B></TD>" +
1506 "<TD><B>Spurious</B></TD> <TD><B>Precision</B></TD>" +
1507 "<TD><B>Recall</B></TD> <TD><B>F-Measure</B></TD> </TR>");
1508 String annotType;
1509 for (int i = 0; i < annotTypes.size(); i++) {
1510 annotType = (String) annotTypes.get(i);
1511 printStatsForType(annotType);
1512 } Out.prln("</table>");
1514 }
1516 protected void printStatsForType(String annotType){
1517 long correct = (correctByType.get(annotType) == null)? 0 :
1518 ((Long)correctByType.get(annotType)).longValue();
1519 long partial = (partialByType.get(annotType) == null)? 0 :
1520 ((Long)partialByType.get(annotType)).longValue();
1521 long spurious = (spurByType.get(annotType) == null)? 0 :
1522 ((Long)spurByType.get(annotType)).longValue();
1523 long missing = (missingByType.get(annotType) == null)? 0:
1524 ((Long)missingByType.get(annotType)).longValue();
1525 long actual = correct + partial + spurious;
1526 long possible = correct + partial + missing;
1527 double precision = (correct + 0.5 * partial) / actual;
1530 double recall = (correct + 0.5*partial)/possible;
1532 double fmeasure =
1534 ((beta*beta + 1)*precision*recall)
1535 /
1536 ((beta*beta*precision) + recall);
1537
1538 long proc_correct=0;
1539 long proc_partial=0;
1540 long proc_spurious=0;
1541 long proc_missing=0;
1542 long proc_actual=0;
1543 long proc_possible=0;
1544 double proc_precision=0;
1545 double proc_recall=0;
1546 double proc_fmeasure=0;
1547
1548 if(hasProcessed) {
1549 proc_correct = (proc_correctByType.get(annotType) == null)? 0 :
1551 ((Long)proc_correctByType.get(annotType)).longValue();
1552 proc_partial = (proc_partialByType.get(annotType) == null)? 0 :
1553 ((Long)proc_partialByType.get(annotType)).longValue();
1554 proc_spurious = (proc_spurByType.get(annotType) == null)? 0 :
1555 ((Long)proc_spurByType.get(annotType)).longValue();
1556 proc_missing = (proc_missingByType.get(annotType) == null)? 0:
1557 ((Long)proc_missingByType.get(annotType)).longValue();
1558 proc_actual = proc_correct + proc_partial + proc_spurious;
1559 proc_possible = proc_correct + proc_partial + proc_missing;
1560 proc_precision = (proc_correct + 0.5*proc_partial)/proc_actual;
1563 proc_recall = (proc_correct + 0.5*proc_partial)/proc_possible;
1565 proc_fmeasure =
1567 ((beta*beta + 1)*proc_precision*proc_recall)
1568 /
1569 ((beta*beta*proc_precision) + proc_recall);
1570 }
1571
1572 Out.prln("<TR>");
1574 if(hasProcessed)
1575 Out.prln("<TD>" + annotType+ "_new" + "</TD>");
1576 else
1577 Out.prln("<TD>" + annotType + "</TD>");
1578
1579 Out.prln("<TD>" + correct + "</TD>");
1580 Out.prln("<TD>" + partial + "</TD>");
1581 Out.prln("<TD>" + missing + "</TD>");
1582 Out.prln("<TD>" + spurious + "</TD>");
1583
1584 String strPrec = (isMoreInfoMode)?
1585 avgPrint(precision, 4)
1586 :Double.toString(precision);
1587 String strRec = (isMoreInfoMode)?
1588 avgPrint(recall, 4)
1589 :Double.toString(recall);
1590 String strFmes = (isMoreInfoMode)?
1591 avgPrint(fmeasure, 4)
1592 :Double.toString(fmeasure);
1593
1594 if(hasProcessed && (precision < proc_precision))
1595 Out.prln("<TD><Font color=red>" + strPrec + "</TD>");
1596 else if(hasProcessed && (precision > proc_precision))
1597 Out.prln("<TD><Font color=blue>" + strPrec + "</TD>");
1598 else
1599 Out.prln("<TD>" + strPrec + "</TD>");
1600 if(hasProcessed && (recall < proc_recall))
1601 Out.prln("<TD><Font color=red>" + strRec + "</TD>");
1602 else if(hasProcessed && (recall > proc_recall))
1603 Out.prln("<TD><Font color=blue>" + strRec + "</TD>");
1604 else
1605 Out.prln("<TD>" + strRec + "</TD>");
1606 Out.prln("<TD>" + strFmes + "</TD>");
1607 Out.prln("</TR>");
1608
1609 if(hasProcessed) {
1610 Out.prln("<TR>");
1612 Out.prln("<TD>" + annotType + "_old" + "</TD>");
1613
1614 Out.prln("<TD>" + proc_correct + "</TD>");
1615 Out.prln("<TD>" + proc_partial + "</TD>");
1616 Out.prln("<TD>" + proc_missing + "</TD>");
1617 Out.prln("<TD>" + proc_spurious + "</TD>");
1618
1619 String strProcPrec = (isMoreInfoMode)?
1620 avgPrint(proc_precision, 4)
1621 :Double.toString(proc_precision);
1622 String strProcRec = (isMoreInfoMode)?
1623 avgPrint(proc_recall, 4)
1624 :Double.toString(proc_recall);
1625 String strProcFmes = (isMoreInfoMode)?
1626 avgPrint(proc_fmeasure, 4)
1627 :Double.toString(proc_fmeasure);
1628
1629 if(precision < proc_precision)
1630 Out.prln("<TD><Font color=red>" + strProcPrec + "</TD>");
1631 else if(precision > proc_precision)
1632 Out.prln("<TD><Font color=blue>" + strProcPrec + "</TD>");
1633 else
1634 Out.prln("<TD>" + strProcPrec + "</TD>");
1635 if(recall < proc_recall)
1636 Out.prln("<TD><Font color=red>" + strProcRec + "</TD>");
1637 else if(recall > proc_recall)
1638 Out.prln("<TD><Font color=blue>" + strProcRec + "</TD>");
1639 else
1640 Out.prln("<TD>" + strProcRec + "</TD>");
1641 Out.prln("<TD>" + strProcFmes + "</TD>");
1642 Out.prln("</TR>");
1643 }
1644 }
1646 protected String avgPrint(double value, int count) {
1648 double newvalue;
1649 double power = Math.pow(10, count);
1650 newvalue = Math.round( value * power )/ power;
1651 return Double.toString(newvalue);
1652 }
1653
1654
// Corpus-wide precision, recall and F-measure as assigned by
// calculateAvgTotal() from the counts summed over all annotation types.
// Despite the "Sum" in the names each field holds a single ratio, not a
// running sum; all three start at 0.
1655 private double precisionSumCalc = 0;
1656 private double recallSumCalc = 0;
1657 private double fMeasureSumCalc = 0;
1658
1659 public double getPrecisionAverageCalc() {
1660 return precisionSumCalc;
1661 }
1662
1663 public double getRecallAverageCalc() {
1664 return recallSumCalc;
1665 }
1666
1667 public double getFmeasureAverageCalc() {
1668 return fMeasureSumCalc;
1669 }
1670
1671 protected void calculateAvgTotal() {
1672 long correct, partial, spurious, missing;
1673 long correctSum, partialSum, spuriousSum, missingSum;
1674
1675 if (annotTypes == null) {
1676 return;
1677 }
1678 correctSum = partialSum = spuriousSum = missingSum = 0;
1679
1680 String annotType;
1681 for (int i = 0; i < annotTypes.size(); i++) {
1682 annotType = (String) annotTypes.get(i);
1683 correct = (correctByType.get(annotType) == null)? 0 :
1684 ((Long)correctByType.get(annotType)).longValue();
1685 partial = (partialByType.get(annotType) == null)? 0 :
1686 ((Long)partialByType.get(annotType)).longValue();
1687 spurious = (spurByType.get(annotType) == null)? 0 :
1688 ((Long)spurByType.get(annotType)).longValue();
1689 missing = (missingByType.get(annotType) == null)? 0:
1690 ((Long)missingByType.get(annotType)).longValue();
1691 correctSum += correct;
1692 partialSum += partial;
1693 spuriousSum += spurious;
1694 missingSum += missing;
1695 }
1697 long actual = correctSum + partialSum + spuriousSum;
1698 long possible = correctSum + partialSum + missingSum;
1699
1700 if(actual == 0) {
1701 precisionSumCalc = 0;
1702 }
1703 else {
1704 precisionSumCalc = (correctSum + 0.5 * partialSum) / actual;
1705 }
1706
1707 if(possible == 0) {
1708 recallSumCalc = 0;
1709 }
1710 else {
1711 recallSumCalc = (correctSum + 0.5 * partialSum) / actual;
1712 }
1713
1714 if(precisionSumCalc == 0 && recallSumCalc == 0) {
1715 fMeasureSumCalc = 0;
1716 }
1717 else {
1718 fMeasureSumCalc =
1719 ((beta*beta + 1)*precisionSumCalc*recallSumCalc)
1720 /
1721 ((beta*beta*precisionSumCalc) + recallSumCalc);
1722
1723 }
1724 }
1726 protected AnnotationDiffer measureDocs(
1727 Document keyDoc, Document respDoc, String annotType)
1728 throws ResourceInstantiationException {
1729
1730 if (keyDoc == null || respDoc == null)
1731 return null;
1732
1733 if (annotSetName != null
1734 && keyDoc.getAnnotations(annotSetName).get(annotType) == null)
1735 return null;
1736 else if ((annotSetName == null || annotSetName.equals(""))
1737 && keyDoc.getAnnotations().get(annotType) == null)
1738 return null;
1739
1740 AnnotationDiffer annotDiffer = new AnnotationDiffer();
1742 annotDiffer.setSignificantFeaturesSet(diffFeaturesSet);
1744 AnnotationSet keys, responses;
1746 if(annotSetName == null || annotSetName.equals("")) {
1747 keys = keyDoc.getAnnotations();
1748 responses = respDoc.getAnnotations();
1749 } else {
1750 keys = keyDoc.getAnnotations(annotSetName);
1751 System.out.println("Keys : "+keys.size());
1752 responses = respDoc.getAnnotations(outputSetName);
1753 System.out.println("Resp : "+responses.size());
1754 }
1755
1756 List pairings = annotDiffer.calculateDiff(keys,responses);
1758 return annotDiffer;
1759 }
1761 protected void storeAnnotations(String type, AnnotationDiffer annotDiffer,
1762 Document keyDoc, Document respDoc, Writer errFileWriter) {
1763 if(errFileWriter == null) return;
1765 try {
1766 Comparator comp = new OffsetComparator();
1768 TreeSet sortedSet = new TreeSet(comp);
1769 Set missingSet =
1770 annotDiffer.getAnnotationsOfType(AnnotationDiffer.MISSING_TYPE);
1771 sortedSet.clear();
1772 sortedSet.addAll(missingSet);
1773 storeAnnotations(type+".miss", sortedSet, keyDoc, errFileWriter);
1774 Set spuriousSet =
1775 annotDiffer.getAnnotationsOfType(AnnotationDiffer.SPURIOUS_TYPE);
1776 sortedSet.clear();
1777 sortedSet.addAll(spuriousSet);
1778 storeAnnotations(type+".spur", sortedSet, respDoc, errFileWriter);
1779 Set partialSet =
1780 annotDiffer.getAnnotationsOfType(AnnotationDiffer.PARTIALLY_CORRECT_TYPE);
1781 sortedSet.clear();
1782 sortedSet.addAll(partialSet);
1783 storeAnnotations(type+".part", sortedSet, respDoc, errFileWriter);
1784 } catch (Exception ex) {
1785 Out.prln("Exception on close of error file "+errFileWriter+": "
1786 +ex.getMessage());
1787 }
1788 }
1790 protected void storeAnnotations(String type, Set set, Document doc,
1791 Writer file) throws IOException{
1792
1793 if (set == null || set.isEmpty())
1794 return;
1795
1796 Iterator iter = set.iterator();
1797 Annotation ann;
1798 while (iter.hasNext()) {
1799 ann = (Annotation) iter.next();
1800 file.write(type);
1801 file.write(".");
1802 file.write(doc.getContent().toString().substring(
1803 ann.getStartNode().getOffset().intValue(),
1804 ann.getEndNode().getOffset().intValue()));
1805 file.write(".");
1806 file.write(ann.getStartNode().getOffset().toString());
1807 file.write(".");
1808 file.write(ann.getEndNode().getOffset().toString());
1809 file.write("\n");
1810 } }
1813 protected void printAnnotations(AnnotationDiffer annotDiff,
1814 Document keyDoc, Document respDoc) {
1815 Out.pr("MISSING ANNOTATIONS in the automatic texts: ");
1816 Set missingSet =
1817 annotDiff.getAnnotationsOfType(AnnotationDiffer.MISSING_TYPE);
1818 printAnnotations(missingSet, keyDoc);
1819 Out.prln("<BR>");
1820
1821 Out.pr("SPURIOUS ANNOTATIONS in the automatic texts: ");
1822 Set spuriousSet =
1823 annotDiff.getAnnotationsOfType(AnnotationDiffer.SPURIOUS_TYPE);
1824 printAnnotations(spuriousSet, respDoc);
1825 Out.prln("</BR>");
1826
1827 Out.pr("PARTIALLY CORRECT ANNOTATIONS in the automatic texts: ");
1828 Set partialSet =
1829 annotDiff.getAnnotationsOfType(AnnotationDiffer.PARTIALLY_CORRECT_TYPE);
1830 printAnnotations(partialSet, respDoc);
1831 }
1832
1833 protected void printAnnotations(Set set, Document doc) {
1834 if (set == null || set.isEmpty())
1835 return;
1836
1837 Iterator iter = set.iterator();
1838 while (iter.hasNext()) {
1839 Annotation ann = (Annotation) iter.next();
1840 Out.prln(
1841 "<B>" +
1842 doc.getContent().toString().substring(
1843 ann.getStartNode().getOffset().intValue(),
1844 ann.getEndNode().getOffset().intValue()) +
1845 "</B>: <I>[" + ann.getStartNode().getOffset() +
1846 "," + ann.getEndNode().getOffset() + "]</I>"
1847 );
1849 } }
1852
// Directory handed to execute(File) by the no-argument execute().
1855 private File startDir;
// NOTE(review): not referenced in the visible part of the file - confirm use.
1856 private File currDir;
// Names of the annotation types to evaluate (elements are cast to String);
// null means no types were given.
1857 private static List annotTypes;
1858
// The controller loaded by initPRs() from applicationFile.
1859 private Controller application = null;
1860 private File applicationFile = null;
1861
// Running sums of the per-comparison scores, updated by updateStatistics().
1862 private double precisionSum = 0;
1866 private double recallSum = 0;
1867 private double fMeasureSum = 0;
// Per-annotation-type score totals (Double values) and the number of
// comparisons that contributed to them (Integer values), keyed by type name;
// updated by updateStatistics().
1868 private HashMap precisionByType = new HashMap();
1869 private HashMap prCountByType = new HashMap();
1870 private HashMap recallByType = new HashMap();
1871 private HashMap recCountByType = new HashMap();
1872 private HashMap fMeasureByType = new HashMap();
1873 private HashMap fMeasureCountByType = new HashMap();
1874
// Per-annotation-type annotation counts (Long values), keyed by type name;
// updated by updateStatistics().
1875 private HashMap missingByType = new HashMap();
1876 private HashMap spurByType = new HashMap();
1877 private HashMap correctByType = new HashMap();
1878 private HashMap partialByType = new HashMap();
1879
// Set to true by updateStatisticsProc(); makes printStatsForType() print a
// "_new" and an "_old" row per type.
1880 static boolean hasProcessed = false;
// Counterparts of the accumulators above, updated by updateStatisticsProc().
1882 private double proc_precisionSum = 0;
1883 private double proc_recallSum = 0;
1884 private double proc_fMeasureSum = 0;
1885 private HashMap proc_precisionByType = new HashMap();
1886 private HashMap proc_prCountByType = new HashMap();
1887 private HashMap proc_recallByType = new HashMap();
1888 private HashMap proc_recCountByType = new HashMap();
1889 private HashMap proc_fMeasureByType = new HashMap();
1890 private HashMap proc_fMeasureCountByType = new HashMap();
1891
1892 private HashMap proc_missingByType = new HashMap();
1893 private HashMap proc_spurByType = new HashMap();
1894 private HashMap proc_correctByType = new HashMap();
1895 private HashMap proc_partialByType = new HashMap();
1896
// Weight used in the F-measure formula of printStatsForType() and
// calculateAvgTotal().
1897 double beta = 1;
1898
// Incremented by evaluateTwoDocs() once per annotation type it evaluates.
1899 private int docNumber = 0;
1900
1901
// NOTE(review): presumably set by the -generate option (see usage) - confirm.
1905 private boolean isGenerateMode = false;
1906
1907
// When true the result tables get an extra "Annotations" column that lists
// the annotations whenever recall is below threshold.
1910 private boolean isVerboseMode = false;
1911
1912
// When true the result tables include the correct / partially correct /
// missing / spurious counts, scores are rounded with avgPrint(), and error
// files are written when an error directory is given.
1915 private boolean isMoreInfoMode = false;
1916
1917
// Passed to AnnotationDiffer.setSignificantFeaturesSet() in measureDocs().
1921 private Set diffFeaturesSet;
1922
1923
// isMarkedStored is checked by unloadPRs().
// NOTE(review): the three flags presumably mirror the -marked_stored and
// -marked_clean options (see usage) - confirm, in particular isMarkedDS.
1927 private boolean isMarkedStored = false;
1928 private boolean isMarkedClean = false;
1929 private boolean isMarkedDS = false;
1931
// Name of the annotation set holding the keys; can be overridden through the
// "annotSetName" property read in init().
1932 private String annotSetName = "Key";
// Name of the annotation set the responses are read from in measureDocs()
// when annotSetName is not empty.
1933 private String outputSetName = null;
1934
// Recall value below which verbose mode lists the annotations; can be
// overridden through the "threshold" property read in init().
1935 private double threshold = 0.5;
// Settings loaded by init() from "corpus_tool.properties" when it exists.
1936 private Properties configs = new Properties();
// NOTE(review): not referenced in the visible part of the file - confirm use.
1937 private static int corpusWordCount = 0;
1938
// NOTE(review): not referenced in the visible part of the file - confirm use.
1939 private String documentEncoding = "";
1940
1941
// Command-line usage message.
1942 private static String usage =
1943 "usage: CorpusBenchmarkTool [-generate|-marked_stored|-marked_clean] "
1944 +"[-verbose] [-moreinfo] directory-name application";
1945
1946}
1947