1   /*
2    *  CorpusBenchmarkTool.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 24/Oct/2001
12   *
13   *  $Id: CorpusBenchmarkTool.java,v 1.41 2003/06/24 22:03:04 kalina Exp $
14   */
15  
16  package gate.util;
17  
18  import java.util.*;
19  import java.io.*;
20  
21  import gate.*;
22  import gate.creole.*;
23  import gate.util.*;
24  import gate.persist.*;
25  import gate.creole.tokeniser.*;
26  import gate.creole.gazetteer.*;
27  import gate.creole.splitter.*;
28  import gate.creole.orthomatcher.*;
29  import gate.creole.annotransfer.*;
30  import gate.annotation.*;
31  
32  public class CorpusBenchmarkTool {
33    private static final String MARKED_DIR_NAME = "marked";
34    private static final String CLEAN_DIR_NAME = "clean";
35    private static final String CVS_DIR_NAME = "Cvs";
36    private static final String PROCESSED_DIR_NAME = "processed";
37    private static final String ERROR_DIR_NAME = "err";
38  
39    private static final boolean DEBUG = true;
40  
41    public CorpusBenchmarkTool() {}
42  
43    public void initPRs() {
44      try {
45        if (applicationFile == null)
46          Out.prln("Application not set!");
47        Out.prln("App file is: " + applicationFile.getAbsolutePath());
48        application = (Controller) gate.util.persistence.PersistenceManager
49                                     .loadObjectFromFile(applicationFile);
50      } catch (Exception ex) {
51        throw new GateRuntimeException("Corpus Benchmark Tool:"+ex.getMessage());
52      }
53    }//initPRs
54  
55    public void unloadPRs() {
56      //we have nothing to unload if no PRs are loaded
57      if (isMarkedStored)
58        return;
59  
60    }
61  
62    public void execute() {
63  /*
64      Out.prln("Flags Gen Cln Str Vrb Minf: "
65               + isGenerateMode +" "+ isMarkedClean +" "+ isMarkedStored
66               +" "+ isVerboseMode +" "+ isMoreInfoMode);
67  */
68      execute(startDir);
69      if (application != null) {
70        Iterator iter = new ArrayList(application.getPRs()).iterator();
71        while (iter.hasNext())
72          Factory.deleteResource((Resource) iter.next());
73        Factory.deleteResource(application);
74      }
75    }
76  
77    public void init() {
78      //first read the corpus_tool.properties file
79      File propFile = new File("corpus_tool.properties");
80      Out.prln(propFile.getAbsolutePath());
81      if (propFile.exists()) {
82        try {
83          InputStream inputStream = new FileInputStream(propFile);
84          this.configs.load(inputStream);
85          String thresholdString = this.configs.getProperty("threshold");
86          if (thresholdString != null && !thresholdString.equals("")) {
87            this.threshold = (new Double(thresholdString)).doubleValue();
88            Out.prln("New threshold is: " + this.threshold + "<P>\n");
89          }
90          String setName = this.configs.getProperty("annotSetName");
91          if (setName != null && !setName.equals("")) {
92            Out.prln("Annotation set in marked docs is: " + setName + " <P>\n");
93            this.annotSetName = setName;
94          }
95          setName = this.configs.getProperty("outputSetName");
96          if (setName != null && !setName.equals("")) {
97            Out.prln("Annotation set in processed docs is: " + setName + " <P>\n");
98            this.outputSetName = setName;
99          }
100         String encodingString = this.configs.getProperty("encoding");
101         if (encodingString != null && !encodingString.equals("")) {
102           this.documentEncoding = encodingString;
103           Out.prln("New encoding is: " + this.documentEncoding + "<P>\n");
104         }
105         String types = this.configs.getProperty("annotTypes");
106         if (types != null && !types.equals("")) {
107           Out.prln("Using annotation types from the properties file. <P>\n");
108           StringTokenizer strTok = new StringTokenizer(types, ";");
109           annotTypes = new ArrayList();
110           while (strTok.hasMoreTokens())
111             annotTypes.add(strTok.nextToken());
112         } else {
113           annotTypes = new ArrayList();
114           annotTypes.add("Organization");
115           annotTypes.add("Person");
116           annotTypes.add("Date");
117           annotTypes.add("Location");
118           annotTypes.add("Address");
119           annotTypes.add("Money");
120           annotTypes.add("Percent");
121           annotTypes.add("GPE");
122           annotTypes.add("Facility");
123         }
124         String features = this.configs.getProperty("annotFeatures");
125         HashSet result = new HashSet();
126         if (features != null && !features.equals("")) {
127           Out.pr("Using annotation features from the properties file. \n");
128           java.util.StringTokenizer tok =
129               new java.util.StringTokenizer(features, ";");
130           String current;
131           while(tok.hasMoreTokens()) {
132             current = tok.nextToken();
133             result.add(current);
134           } // while
135         }
136         diffFeaturesSet = result;
137         Out.prln("Features: "+diffFeaturesSet+" <P>\n");
138 
139       } catch (IOException ex) {
140         //just ignore the file and go on with the defaults
141         this.configs = new Properties();
142       }
143     } else
144       this.configs = new Properties();
145 
146 
147     //we only initialise the PRs if they are going to be used
148     //for processing unprocessed documents
149     if (!this.isMarkedStored)
150       initPRs();
151 
152   }
153 
154   public void execute(File dir) {
155     if (dir == null)
156       return;
157     //first set the current directory to be the given one
158     currDir = dir;
159 
160     File processedDir = null;
161     File cleanDir = null;
162     File markedDir = null;
163     File errorDir = null;
164 
165     ArrayList subDirs = new ArrayList();
166     File[] dirArray = currDir.listFiles();
167     if(dirArray == null) return;
168     for (int i = 0; i < dirArray.length; i++) {
169       if (dirArray[i].isFile() || dirArray[i].getName().equals(CVS_DIR_NAME))
170         continue;
171       if (dirArray[i].getName().equals(CLEAN_DIR_NAME))
172         cleanDir = dirArray[i];
173       else if (dirArray[i].getName().equals(MARKED_DIR_NAME))
174         markedDir = dirArray[i];
175       else if (dirArray[i].getName().equals(PROCESSED_DIR_NAME))
176         processedDir = dirArray[i];
177       else if (dirArray[i].getName().equals(ERROR_DIR_NAME))
178         errorDir = dirArray[i];
179       else
180         subDirs.add(dirArray[i]);
181     }
182 
183     if(cleanDir == null) return;
184     Out.prln("Processing directory: " + currDir + "<P>");
185 
186     if (this.isGenerateMode)
187       generateCorpus(cleanDir, processedDir);
188     else
189       evaluateCorpus(cleanDir, processedDir, markedDir, errorDir);
190 
191     //if no more subdirs left, return
192     if (subDirs.isEmpty())
193       return;
194 
195     //there are more subdirectories to traverse, so iterate through
196     for (int j = 0; j < subDirs.size(); j++)
197       execute((File) subDirs.get(j));
198 
199   }//execute(dir)
200 
201 
202   public static void main(String[] args) throws GateException {
203     Out.prln("<HTML>");
204     Out.prln("<HEAD>");
205     Out.prln("<TITLE> Corpus benchmark tool: ran with args ");
206     for(int argC=0; argC < args.length; ++argC)
207       Out.pr(args[argC]+" ");
208     Out.pr(" on " + new Date() + "</TITLE> </HEAD>");
209     Out.prln("<BODY>");
210     Out.prln("Please wait while GATE tools are initialised. <P>");
211     // initialise GATE
212     Gate.init();
213 
214     CorpusBenchmarkTool corpusTool = new CorpusBenchmarkTool();
215 
216     List inputFiles = null;
217     if(args.length < 1) throw new GateException(usage);
218     int i = 0;
219     while (i < args.length && args[i].startsWith("-")) {
220       if(args[i].equals("-generate")) {
221         Out.prln("Generating the corpus... <P>");
222         corpusTool.setGenerateMode(true);
223       } else if (args[i].equals("-marked_clean")) {
224         Out.prln("Evaluating current grammars against human-annotated...<P>");
225         corpusTool.setMarkedClean(true);
226       } else if (args[i].equals("-marked_stored")) {
227         Out.prln("Evaluating stored documents against human-annotated...<P>");
228         corpusTool.setMarkedStored(true);
229       } else if (args[i].equals("-marked_ds")) {
230         Out.prln("Looking for marked docs in a datastore...<P>");
231         corpusTool.setMarkedDS(true);
232       } else if (args[i].equals("-verbose")) {
233         Out.prln("Running in verbose mode. Will generate annotation " +
234           "information when precision/recall are lower than " +
235           corpusTool.getThreshold() +"<P>");
236         corpusTool.setVerboseMode(true);
237       } else if (args[i].equals("-moreinfo")) {
238         Out.prln("Show more details in document table...<P>");
239         corpusTool.setMoreInfo(true);
240       }
241       i++; //just ignore the option, which we do not recognise
242     }//while
243 
244     String dirName = args[i];
245     File dir = new File(dirName);
246     if (!dir.isDirectory())
247       throw new GateException(usage);
248 
249     //get the last argument which is the application
250     i++;
251     String appName = args[i];
252     File appFile = new File(appName);
253     if (!appFile.isFile())
254       throw new GateException(usage);
255     else
256       corpusTool.setApplicationFile(appFile);
257 
258     corpusTool.init();
259     corpusWordCount = 0;
260 
261     Out.prln("Measuring annotaitions of types: " + corpusTool.annotTypes + "<P>");
262 
263     corpusTool.setStartDirectory(dir);
264     corpusTool.execute();
265 
266     //if we're not generating the corpus, then print the precision and recall
267     //statistics for the processed corpus
268     if (! corpusTool.getGenerateMode())
269       corpusTool.printStatistics();
270 
271     Out.prln("<BR>Overall average precision: " + corpusTool.getPrecisionAverage());
272     Out.prln("<BR>Overall average recall: " + corpusTool.getRecallAverage());
273     if(corpusWordCount == 0)
274       Out.prln("<BR>No Token annotations to count words in the corpus.");
275     else
276       Out.prln("<BR>Overall word count: " + corpusWordCount);
277 
278 
279     if(hasProcessed) {
280       Out.prln("<P>Old Processed: ");
281       Out.prln("<BR>Overall average precision: "
282                + corpusTool.getPrecisionAverageProc());
283       Out.prln("<BR>Overall average recall: "
284                + corpusTool.getRecallAverageProc());
285     }
286     Out.prln("<BR>Finished! <P>");
287     Out.prln("</BODY>");
288     Out.prln("</HTML>");
289 
290     System.exit(0);
291 
292   }//main
293 
294   public void setGenerateMode(boolean mode) {
295     isGenerateMode = mode;
296   }//setGenerateMode
297 
298   public boolean getGenerateMode() {
299     return isGenerateMode;
300   }//getGenerateMode
301 
302   public boolean getVerboseMode() {
303     return isVerboseMode;
304   }//getVerboseMode
305 
306   public void setVerboseMode(boolean mode) {
307     isVerboseMode = mode;
308   }//setVerboseMode
309 
310   public void setMoreInfo(boolean mode) {
311     isMoreInfoMode = mode;
312   } // setMoreInfo
313 
314   public boolean getMoreInfo() {
315     return isMoreInfoMode;
316   } // getMoreInfo
317 
318   public void setDiffFeaturesList(Set features) {
319     diffFeaturesSet = features;
320   } // setDiffFeaturesList
321 
322   public Set getDiffFeaturesList() {
323     return diffFeaturesSet;
324   } // getDiffFeaturesList
325 
326   public void setMarkedStored(boolean mode) {
327     isMarkedStored = mode;
328   }// setMarkedStored
329 
330 
331   public boolean getMarkedStored() {
332     return isMarkedStored;
333   }// getMarkedStored
334 
335   public void setMarkedClean(boolean mode) {
336     isMarkedClean = mode;
337   }//
338 
339   public boolean getMarkedClean() {
340     return isMarkedClean;
341   }//
342 
343   public void setMarkedDS(boolean mode) {
344     isMarkedDS = mode;
345   }//
346 
347   public boolean getMarkedDS() {
348     return isMarkedDS;
349   }//
350 
351   public void setApplicationFile(File newAppFile) {
352     applicationFile = newAppFile;
353   }
354 
355   /**
356    * Returns the average precision over the entire set of processed documents.
357    * <P>
358    * If the tool has been evaluating the original documents against the
359    * previously-stored automatically annotated ones, then the precision
360    * will be the average precision on those two sets. <P>
361    * If the tool was run in -marked mode, i.e., was evaluating the stored
362    * automatically processed ones against the human-annotated ones, then
363    * the precision will be the average precision on those two sets of documents.
364    */
365   public double getPrecisionAverage() {
366     return precisionSum/docNumber;
367   }
368 
369   /**
370    * Returns the average recall over the entire set of processed documents.
371    * <P>
372    * If the tool has been evaluating the original documents against the
373    * previously-stored automatically annotated ones, then the recall
374    * will be the average recall on those two sets. <P>
375    * If the tool was run in -marked mode, i.e., was evaluating the stored
376    * automatically processed ones against the human-annotated ones, then
377    * the recall will be the average recall on those two sets of documents.
378    */
379   public double getRecallAverage() {
380     return recallSum/docNumber;
381   }
382 
383   /** For processed documents */
384   public double getPrecisionAverageProc() {
385     return proc_precisionSum/docNumber;
386   }
387   public double getRecallAverageProc() {
388     return proc_recallSum/docNumber;
389   }
390 
391 
392   public boolean isGenerateMode() {
393     return isGenerateMode == true;
394   }//isGenerateMode
395 
396   public double getThreshold() {
397     return threshold;
398   }
399 
400   public void setThreshold(double newValue) {
401     threshold = newValue;
402   }
403 
404   public File getStartDirectory() {
405     return startDir;
406   }//getStartDirectory
407 
408   public void setStartDirectory(File dir) {
409     startDir = dir;
410   }//setStartDirectory
411 
412   protected void generateCorpus(File fileDir, File outputDir) {
413     //1. check if we have input files
414     if (fileDir == null)
415       return;
416     //2. create the output directory or clean it up if needed
417     File outDir = outputDir;
418     if (outputDir == null) {
419       outDir = new File(currDir, PROCESSED_DIR_NAME);
420     } else {
421       // get rid of the directory, coz datastore wants it clean
422       if (!Files.rmdir(outDir))
423         Out.prln("cannot delete old output directory: " + outDir);
424     }
425     outDir.mkdir();
426 
427     //create the datastore and process each document
428     try {
429       SerialDataStore sds = new SerialDataStore(outDir.toURL().toString());
430       sds.create();
431       sds.open();
432 
433       File[] files = fileDir.listFiles();
434       for (int i=0; i < files.length; i++) {
435         if (!files[i].isFile())
436           continue;
437         // create a document
438         Out.prln("Processing and storing document: " + files[i].toURL() +"<P>");
439 
440         FeatureMap params = Factory.newFeatureMap();
441         params.put(Document.DOCUMENT_URL_PARAMETER_NAME, files[i].toURL());
442         params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
443 
444         // create the document
445         Document doc = (Document) Factory.createResource(
446           "gate.corpora.DocumentImpl", params
447         );
448 
449         doc.setName(files[i].getName());
450         if (doc == null)
451           continue;
452         processDocument(doc);
453         LanguageResource lr = sds.adopt(doc, null);
454         sds.sync(lr);
455         Factory.deleteResource(doc);
456         Factory.deleteResource(lr);
457       }//for
458       sds.close();
459     } catch (java.net.MalformedURLException ex) {
460       throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
461     } catch (PersistenceException ex1) {
462       throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
463     } catch (ResourceInstantiationException ex2) {
464       throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
465     } catch (gate.security.SecurityException ex3) {
466       throw new GateRuntimeException("CorpusBenchmark: " + ex3.getMessage());
467     }
468 
469   }//generateCorpus
470 
471   protected void evaluateCorpus(File fileDir,
472                     File processedDir, File markedDir,
473                     File errorDir) {
474     //1. check if we have input files and the processed Dir
475     if (fileDir == null || !fileDir.exists())
476       return;
477     if (processedDir == null || !processedDir.exists())
478       //if the user wants evaluation of marked and stored that's not possible
479       if (isMarkedStored) {
480         Out.prln("Cannot evaluate because no processed documents exist.");
481         return;
482       }
483       else
484         isMarkedClean = true;
485 
486     // create the error directory or clean it up if needed
487     File errDir = null;
488     if(isMoreInfoMode) {
489       errDir = errorDir;
490       if (errDir == null) {
491         errDir = new File(currDir, ERROR_DIR_NAME);
492       }
493       else {
494         // get rid of the directory, coz we wants it clean
495         if (!Files.rmdir(errDir))
496           Out.prln("cannot delete old error directory: " + errDir);
497       }
498       Out.prln("Create error directory: " + errDir + "<BR><BR>");
499       errDir.mkdir();
500     }
501 
502     //looked for marked texts only if the directory exists
503     boolean processMarked = markedDir != null && markedDir.exists();
504     if (!processMarked && (isMarkedStored || isMarkedClean)) {
505         Out.prln("Cannot evaluate because no human-annotated documents exist.");
506         return;
507     }
508 
509     if (isMarkedStored) {
510       evaluateMarkedStored(markedDir, processedDir, errDir);
511       return;
512     } else if (isMarkedClean) {
513       evaluateMarkedClean(markedDir, fileDir, errDir);
514       return;
515     }
516 
517     Document persDoc = null;
518     Document cleanDoc = null;
519     Document markedDoc = null;
520 
521     //open the datastore and process each document
522     try {
523       //open the data store
524       DataStore sds = Factory.openDataStore
525                       ("gate.persist.SerialDataStore",
526                        processedDir.toURL().toExternalForm());
527 
528       List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
529       for (int i=0; i < lrIDs.size(); i++) {
530         String docID = (String) lrIDs.get(i);
531 
532         //read the stored document
533         FeatureMap features = Factory.newFeatureMap();
534         features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
535         features.put(DataStore.LR_ID_FEATURE_NAME, docID);
536         persDoc = (Document) Factory.createResource(
537                                     "gate.corpora.DocumentImpl",
538                                     features);
539 
540         if(isMoreInfoMode) {
541           StringBuffer errName = new StringBuffer(persDoc.getName());
542           errName.replace(
543             persDoc.getName().lastIndexOf("."),
544             persDoc.getName().length(),
545             ".err");
546           Out.prln("<H2>" +
547                    "<a href=err/" + errName.toString() + ">"
548                    + persDoc.getName() + "</a>" + "</H2>");
549         } else
550           Out.prln("<H2>" + persDoc.getName() + "</H2>");
551 
552         File cleanDocFile = new File(fileDir, persDoc.getName());
553         //try reading the original document from clean
554         if (! cleanDocFile.exists()) {
555           Out.prln("Warning: Cannot find original document " +
556                    persDoc.getName() + " in " + fileDir);
557         } else {
558           FeatureMap params = Factory.newFeatureMap();
559           params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocFile.toURL());
560           params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
561 
562           // create the document
563           cleanDoc = (Document) Factory.createResource(
564                                   "gate.corpora.DocumentImpl", params);
565           cleanDoc.setName(persDoc.getName());
566         }
567 
568         //try finding the marked document
569         StringBuffer docName = new StringBuffer(persDoc.getName());
570         if (! isMarkedDS) {
571           docName.replace(
572             persDoc.getName().lastIndexOf("."),
573             docName.length(),
574             ".xml");
575           File markedDocFile = new File(markedDir, docName.toString());
576           if (! processMarked || ! markedDocFile.exists()) {
577             Out.prln("Warning: Cannot find human-annotated document " +
578                      markedDocFile + " in " + markedDir);
579           } else {
580             FeatureMap params = Factory.newFeatureMap();
581             params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL());
582             params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
583 
584             // create the document
585             markedDoc = (Document) Factory.createResource(
586                                      "gate.corpora.DocumentImpl", params);
587             markedDoc.setName(persDoc.getName());
588           }
589         } else {
590           //open marked from a DS
591           //open the data store
592           DataStore sds1 = Factory.openDataStore
593                           ("gate.persist.SerialDataStore",
594                            markedDir.toURL().toExternalForm());
595 
596           List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
597           boolean found = false;
598           int k = 0;
599           //search for the marked doc with the same name
600           while (k < lrIDs1.size() && !found) {
601             String docID1 = (String) lrIDs1.get(k);
602 
603             //read the stored document
604             FeatureMap features1 = Factory.newFeatureMap();
605             features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
606             features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
607             Document tempDoc = (Document) Factory.createResource(
608                                         "gate.corpora.DocumentImpl",
609                                         features1);
610             //check whether this is our doc
611             if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")).
612                  endsWith(persDoc.getName())) {
613               found = true;
614               markedDoc = tempDoc;
615             } else k++;
616           }
617         }
618 
619         evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
620         if (persDoc != null)
621           Factory.deleteResource(persDoc);
622         if (cleanDoc != null)
623           Factory.deleteResource(cleanDoc);
624         if (markedDoc != null)
625           Factory.deleteResource(markedDoc);
626 
627       }//for loop through saved docs
628       sds.close();
629     } catch (java.net.MalformedURLException ex) {
630       throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
631     } catch (PersistenceException ex1) {
632       throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
633     } catch (ResourceInstantiationException ex2) {
634       throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
635     }
636 
637   }//evaluateCorpus
638 
639   protected void evaluateMarkedStored(File markedDir, File storedDir, File errDir) {
640     Document persDoc = null;
641     Document cleanDoc = null;
642     Document markedDoc = null;
643 
644     //open the datastore and process each document
645     try {
646       //open the data store
647       DataStore sds = Factory.openDataStore
648                       ("gate.persist.SerialDataStore",
649                        storedDir.toURL().toExternalForm());
650 
651       List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
652       for (int i=0; i < lrIDs.size(); i++) {
653         String docID = (String) lrIDs.get(i);
654 
655         //read the stored document
656         FeatureMap features = Factory.newFeatureMap();
657         features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
658         features.put(DataStore.LR_ID_FEATURE_NAME, docID);
659         persDoc = (Document) Factory.createResource(
660                                     "gate.corpora.DocumentImpl",
661                                     features);
662 
663         if(isMoreInfoMode) {
664           StringBuffer errName = new StringBuffer(persDoc.getName());
665           errName.replace(
666             persDoc.getName().lastIndexOf("."),
667             persDoc.getName().length(),
668             ".err");
669           Out.prln("<H2>" +
670                    "<a href=err/" + errName.toString() + ">"
671                    + persDoc.getName() + "</a>" + "</H2>");
672         } else
673           Out.prln("<H2>" + persDoc.getName() + "</H2>");
674 
675         if (! this.isMarkedDS) { //try finding the marked document as file
676           StringBuffer docName = new StringBuffer(persDoc.getName());
677           docName.replace(
678             persDoc.getName().lastIndexOf("."),
679             docName.length(),
680             ".xml");
681           File markedDocFile = new File(markedDir, docName.toString());
682           if (! markedDocFile.exists()) {
683             Out.prln("Warning: Cannot find human-annotated document " +
684                      markedDocFile + " in " + markedDir);
685           } else {
686             FeatureMap params = Factory.newFeatureMap();
687             params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL());
688             params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
689 
690             // create the document
691             markedDoc = (Document) Factory.createResource(
692                                      "gate.corpora.DocumentImpl", params);
693             markedDoc.setName(persDoc.getName());
694           }//find marked as file
695         } else {
696           try {
697             //open marked from a DS
698             //open the data store
699             DataStore sds1 = Factory.openDataStore
700                             ("gate.persist.SerialDataStore",
701                              markedDir.toURL().toExternalForm());
702 
703             List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
704             boolean found = false;
705             int k = 0;
706             //search for the marked doc with the same name
707             while (k < lrIDs1.size() && !found) {
708               String docID1 = (String) lrIDs1.get(k);
709 
710               //read the stored document
711               FeatureMap features1 = Factory.newFeatureMap();
712               features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
713               features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
714               Document tempDoc = (Document) Factory.createResource(
715                                           "gate.corpora.DocumentImpl",
716                                           features1);
717               //check whether this is our doc
718               if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")).
719                    endsWith(persDoc.getName())) {
720                 found = true;
721                 markedDoc = tempDoc;
722               } else k++;
723             }
724           } catch (java.net.MalformedURLException ex) {
725             Out.prln("Error finding marked directory " + markedDir.getAbsolutePath());
726           } catch (gate.persist.PersistenceException ex1) {
727             Out.prln("Error opening marked as a datastore (-marked_ds specified)");
728           } catch (gate.creole.ResourceInstantiationException ex2) {
729             Out.prln("Error opening marked as a datastore (-marked_ds specified)");
730           }
731         }
732 
733         evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
734         if (persDoc != null)
735           Factory.deleteResource(persDoc);
736         if (markedDoc != null)
737           Factory.deleteResource(markedDoc);
738 
739       }//for loop through saved docs
740       sds.close();
741 
742     } catch (java.net.MalformedURLException ex) {
743       throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
744     } catch (PersistenceException ex1) {
745       throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
746     } catch (ResourceInstantiationException ex2) {
747       throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
748     }
749 
750   }//evaluateMarkedStored
751 
752 
753   protected void evaluateMarkedClean(File markedDir, File cleanDir, File errDir) {
754     Document persDoc = null;
755     Document cleanDoc = null;
756     Document markedDoc = null;
757 
758     File[] cleanDocs = cleanDir.listFiles();
759     for (int i = 0; i< cleanDocs.length; i++) {
760       if (!cleanDocs[i].isFile())
761         continue;
762 
763       //try reading the original document from clean
764       FeatureMap params = Factory.newFeatureMap();
765       try {
766         params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocs[i].toURL());
767       } catch (java.net.MalformedURLException ex) {
768         Out.prln("Cannot create document from file: " +
769           cleanDocs[i].getAbsolutePath());
770         continue;
771       }
772       params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
773 
774       // create the document
775       try {
776         cleanDoc = (Document) Factory.createResource(
777                               "gate.corpora.DocumentImpl", params,
778                               null, cleanDocs[i].getName());
779       } catch (gate.creole.ResourceInstantiationException ex) {
780         Out.prln("Cannot create document from file: " +
781           cleanDocs[i].getAbsolutePath());
782         continue;
783       }
784 
785       if(isMoreInfoMode) {
786         StringBuffer errName = new StringBuffer(cleanDocs[i].getName());
787         errName.replace(
788           cleanDocs[i].getName().lastIndexOf("."),
789           cleanDocs[i].getName().length(),
790           ".err");
791         Out.prln("<H2>" +
792                  "<a href=err/" + errName.toString() + ">"
793                  + cleanDocs[i].getName() + "</a>" + "</H2>");
794       } else
795         Out.prln("<H2>" + cleanDocs[i].getName() + "</H2>");
796 
797       //try finding the marked document
798       if (! isMarkedDS) {
799         StringBuffer docName = new StringBuffer(cleanDoc.getName());
800         docName.replace(
801           cleanDoc.getName().lastIndexOf("."),
802           docName.length(),
803           ".xml");
804         File markedDocFile = new File(markedDir, docName.toString());
805         if (! markedDocFile.exists()) {
806           Out.prln("Warning: Cannot find human-annotated document " +
807                    markedDocFile + " in " + markedDir);
808           continue;
809         } else {
810           params = Factory.newFeatureMap();
811           try {
812             params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL());
813           } catch (java.net.MalformedURLException ex) {
814             Out.prln("Cannot create document from file: " +
815               markedDocFile.getAbsolutePath());
816             continue;
817           }
818           params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
819 
820           // create the document
821           try {
822             markedDoc = (Document) Factory.createResource(
823                                    "gate.corpora.DocumentImpl", params,
824                                    null, cleanDoc.getName());
825           } catch (gate.creole.ResourceInstantiationException ex) {
826             Out.prln("Cannot create document from file: " +
827               markedDocFile.getAbsolutePath());
828             continue;
829           }
830 
831         }//if markedDoc exists
832       } else {
833         try {
834           //open marked from a DS
835           //open the data store
836           DataStore sds1 = Factory.openDataStore
837                           ("gate.persist.SerialDataStore",
838                            markedDir.toURL().toExternalForm());
839 
840           List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
841           boolean found = false;
842           int k = 0;
843           //search for the marked doc with the same name
844           while (k < lrIDs1.size() && !found) {
845             String docID1 = (String) lrIDs1.get(k);
846 
847             //read the stored document
848             FeatureMap features1 = Factory.newFeatureMap();
849             features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
850             features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
851             Document tempDoc = (Document) Factory.createResource(
852                                         "gate.corpora.DocumentImpl",
853                                         features1);
854             //check whether this is our doc
855             if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")).
856                  endsWith(cleanDoc.getName())) {
857               found = true;
858               markedDoc = tempDoc;
859             } else k++;
860           }
861         } catch (java.net.MalformedURLException ex) {
862           Out.prln("Error finding marked directory " + markedDir.getAbsolutePath());
863         } catch (gate.persist.PersistenceException ex1) {
864           Out.prln("Error opening marked as a datastore (-marked_ds specified)");
865         } catch (gate.creole.ResourceInstantiationException ex2) {
866           Out.prln("Error opening marked as a datastore (-marked_ds specified)");
867         }
868       } //if using a DS for marked
869 
870       try {
871         evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
872       } catch (gate.creole.ResourceInstantiationException ex) {
873 ex.printStackTrace();
874         Out.prln("Evaluate failed on document: " + cleanDoc.getName());
875       }
876       if (persDoc != null)
877         Factory.deleteResource(persDoc);
878       if (cleanDoc != null)
879         Factory.deleteResource(cleanDoc);
880       if (markedDoc != null)
881         Factory.deleteResource(markedDoc);
882 
883     }//for loop through clean docs
884 
885 
886   }//evaluateMarkedClean
887 
888   protected void processDocument(Document doc) {
889     try {
890       if (application instanceof CorpusController) {
891         Corpus tempCorpus = Factory.newCorpus("temp");
892         tempCorpus.add(doc);
893         ((CorpusController)application).setCorpus(tempCorpus);
894         application.execute();
895         Factory.deleteResource(tempCorpus);
896         tempCorpus = null;
897       } else {
898         Iterator iter = application.getPRs().iterator();
899         while (iter.hasNext())
900           ((ProcessingResource) iter.next()).setParameterValue("document", doc);
901         application.execute();
902       }
903     } catch (ResourceInstantiationException ex) {
904       throw new RuntimeException("Error executing application: "
905                                     + ex.getMessage());
906     } catch (ExecutionException ex) {
907       throw new RuntimeException("Error executing application: "
908                                     + ex.getMessage());
909     }
910   }
911 
912   protected void evaluateDocuments(Document persDoc,
913                     Document cleanDoc, Document markedDoc,
914                     File errDir)
915                         throws ResourceInstantiationException {
916     if (cleanDoc == null && markedDoc == null)
917       return;
918 
919     //we've got no types to compare
920     if (annotTypes == null || annotTypes.isEmpty())
921       return;
922 
923     if (cleanDoc != null && !isMarkedStored) {
924 
925       processDocument(cleanDoc);
926 
927 
928       int wordCount = countWords(cleanDoc);
929       if(wordCount == 0)
930         Out.prln("<BR>No Token annotations to count words in the document.");
931       else
932         Out.prln("<BR>Word count: " + wordCount);
933       corpusWordCount += wordCount;
934 
935       if(!isMarkedClean)
936         evaluateAllThree(persDoc, cleanDoc, markedDoc, errDir);
937       else
938         evaluateTwoDocs(markedDoc, cleanDoc, errDir);
939 
940     } else
941       evaluateTwoDocs(markedDoc, persDoc, errDir);
942 
943   }
944 
945   /**
946    * Count all Token.kind=word annotations in the document
947    */
948   protected int countWords(Document annotDoc) {
949     int count = 0;
950 
951     if (annotDoc == null) return 0;
952     // check for Token in outputSetName
953     AnnotationSet tokens = annotDoc.getAnnotations(outputSetName).get("Token");
954     if (tokens == null) return 0;
955 
956     Iterator it = tokens.iterator();
957     Annotation currAnnotation;
958     while (it.hasNext()) {
959       currAnnotation = (Annotation) it.next();
960       Object feature = currAnnotation.getFeatures().get("kind");
961       if(feature != null && "word".equalsIgnoreCase((String)feature)) ++count;
962     } // while
963 
964     return count;
965   }
966 
967   protected void evaluateAllThree(Document persDoc,
968                                   Document cleanDoc, Document markedDoc,
969                                   File errDir)
970                                   throws ResourceInstantiationException {
971     //first start the table and its header
972     printTableHeader();
973 
974     // store annotation diff in .err file
975     FileWriter errFileWriter = null;
976     if (isMoreInfoMode && errDir != null) {
977       StringBuffer docName = new StringBuffer(cleanDoc.getName());
978       docName.replace(
979           cleanDoc.getName().lastIndexOf("."),
980           docName.length(),
981           ".err");
982       File errFile = new File(errDir, docName.toString());
983       try {
984         errFileWriter = new FileWriter(errFile, false);
985       }
986       catch (Exception ex) {
987         Out.prln("Exception when creating the error file " + errFile + ": "
988                  + ex.getMessage());
989         errFileWriter = null;
990       }
991     }
992 
993     for (int jj= 0; jj< annotTypes.size(); jj++) {
994       String annotType = (String) annotTypes.get(jj);
995 
996       AnnotationDiff annotDiff=measureDocs(markedDoc, cleanDoc, annotType);
997       //we don't have this annotation type in this document
998       if (annotDiff == null)
999         continue;
1000
1001      //increase the number of processed documents
1002      docNumber++;
1003      //add precison and recall to the sums
1004      updateStatistics(annotDiff, annotType);
1005
1006      AnnotationDiff annotDiff1 =
1007        measureDocs(markedDoc, persDoc, annotType);
1008
1009      Out.prln("<TR>");
1010
1011      if(isMoreInfoMode && annotDiff1 != null
1012         && (annotDiff1.getPrecisionAverage() != annotDiff.getPrecisionAverage()
1013         || annotDiff1.getRecallAverage() != annotDiff.getRecallAverage())
1014         )
1015        Out.prln("<TD> " + annotType + "_new"+ "</TD>");
1016      else
1017        Out.prln("<TD> " + annotType + "</TD>");
1018
1019      if (isMoreInfoMode) {
1020        if(annotDiff1 != null) updateStatisticsProc(annotDiff1, annotType);
1021
1022        Out.prln("<TD>" + annotDiff.getCorrectCount() + "</TD>");
1023        Out.prln("<TD>" + annotDiff.getPartiallyCorrectCount() + "</TD>");
1024        Out.prln("<TD>" + annotDiff.getMissingCount() + "</TD>");
1025        Out.prln("<TD>" + annotDiff.getSpuriousCount() + "</TD>");
1026      }
1027
1028      Out.prln("<TD>");
1029
1030      //check the precision first
1031      if (annotDiff1 != null) {
1032
1033        if (annotDiff1.getPrecisionAverage()
1034              < annotDiff.getPrecisionAverage()) {
1035            Out.prln("<P><Font color=blue> ");
1036            Out.prln(annotDiff.getPrecisionAverage());
1037
1038            if(!isMoreInfoMode) {
1039              Out.pr("<BR>Precision increase on human-marked from ");
1040              Out.pr(annotDiff1.getPrecisionAverage() + " to ");
1041              Out.prln(annotDiff.getPrecisionAverage());
1042            }
1043            Out.prln(" </Font></P>");
1044          }
1045        else if (annotDiff1.getPrecisionAverage()
1046               > annotDiff.getPrecisionAverage()) {
1047          Out.prln("<P><Font color=red> ");
1048          Out.prln(annotDiff.getPrecisionAverage());
1049
1050          if(!isMoreInfoMode) {
1051            Out.pr("<BR>Precision decrease on human-marked from ");
1052            Out.pr(annotDiff1.getPrecisionAverage() + " to ");
1053            Out.prln(annotDiff.getPrecisionAverage());
1054          }
1055          Out.prln(" </Font></P>");
1056        }
1057        else
1058          Out.prln("<P> " + annotDiff.getPrecisionAverage() + " </P>");
1059      }
1060      else
1061        Out.prln("<P> " + annotDiff.getPrecisionAverage() + " </P>");
1062
1063      Out.prln("</TD>");
1064
1065      Out.prln("<TD>");
1066
1067      //check the recall now
1068      if (annotDiff1 != null) {
1069
1070        if (annotDiff1.getRecallAverage() < annotDiff.getRecallAverage()) {
1071          Out.prln("<P><Font color=blue> ");
1072          Out.prln(annotDiff.getRecallAverage());
1073
1074          if(!isMoreInfoMode) {
1075            Out.pr("<BR>Recall increase on human-marked from ");
1076            Out.pr(annotDiff1.getRecallAverage() + " to ");
1077            Out.prln(annotDiff.getRecallAverage());
1078          }
1079          Out.prln(" </Font></P>");
1080        }
1081        else if (annotDiff1.getRecallAverage() > annotDiff.getRecallAverage()) {
1082          Out.prln("<P><Font color=red> ");
1083          Out.prln(annotDiff.getRecallAverage());
1084
1085          if(!isMoreInfoMode) {
1086            Out.pr("<BR>Recall decrease on human-marked from ");
1087            Out.pr(annotDiff1.getRecallAverage() + " to ");
1088            Out.prln(annotDiff.getRecallAverage());
1089          }
1090          Out.prln(" </Font></P>");
1091        }
1092        else
1093          Out.prln("<P> " + annotDiff.getRecallAverage() + " </P>");
1094      } else
1095        Out.prln("<P> " + annotDiff.getRecallAverage() + " </P>");
1096
1097
1098      Out.prln("</TD>");
1099
1100      //check the recall now
1101      if ( isVerboseMode ) {
1102        Out.prln("<TD>");
1103        if (annotDiff.getRecallAverage() < threshold) {
1104          printAnnotations(annotDiff, markedDoc, cleanDoc);
1105        }
1106        else {
1107          Out.prln("&nbsp;");
1108        }
1109        Out.prln("</TD>");
1110      }
1111
1112      Out.prln("</TR>");
1113
1114      // show one more table line for processed document
1115      if(isMoreInfoMode && annotDiff1 != null
1116         && (annotDiff1.getPrecisionAverage() != annotDiff.getPrecisionAverage()
1117         || annotDiff1.getRecallAverage() != annotDiff.getRecallAverage())
1118         ) {
1119
1120        Out.prln("<TR>");
1121        Out.prln("<TD> " + annotType + "_old" + "</TD>");
1122
1123        Out.prln("<TD>" + annotDiff1.getCorrectCount() + "</TD>");
1124        Out.prln("<TD>" + annotDiff1.getPartiallyCorrectCount() + "</TD>");
1125        Out.prln("<TD>" + annotDiff1.getMissingCount() + "</TD>");
1126        Out.prln("<TD>" + annotDiff1.getSpuriousCount() + "</TD>");
1127
1128        Out.prln("<TD>");
1129        if (annotDiff1.getPrecisionAverage() < annotDiff.getPrecisionAverage())
1130
1131          Out.prln("<P><Font color=blue> "  + annotDiff1.getPrecisionAverage()
1132                + "</Font></P>");
1133        else if (annotDiff1.getPrecisionAverage() > annotDiff.getPrecisionAverage())
1134          Out.prln(
1135             "<P><Font color=red> " + annotDiff1.getPrecisionAverage()
1136             + " </Font></P>");
1137        else
1138          Out.prln(annotDiff1.getPrecisionAverage());
1139
1140        Out.prln("</TD>");
1141
1142        Out.prln("<TD>");
1143        if (annotDiff1.getRecallAverage() < annotDiff.getRecallAverage())
1144          Out.prln("<P><Font color=blue> " + annotDiff1.getRecallAverage()
1145                   + " </Font></P>");
1146        else if (annotDiff1.getRecallAverage() > annotDiff.getRecallAverage())
1147          Out.prln("<P><Font color=red> " + annotDiff1.getRecallAverage()
1148                    + " </Font></P>");
1149        else
1150           Out.prln(annotDiff1.getRecallAverage());
1151
1152        Out.prln("</TD>");
1153
1154        //check the recall now
1155        if ( isVerboseMode ) {
1156          // create error file and start writing
1157
1158          Out.prln("<TD>");
1159          if (annotDiff.getRecallAverage() < threshold) {
1160            printAnnotations(annotDiff, markedDoc, cleanDoc);
1161          }
1162          else {
1163            Out.prln("&nbsp;");
1164          }
1165          Out.prln("</TD>");
1166        }
1167        Out.prln("</TR>");
1168      } // if(isMoreInfoMode && annotDiff1 != null)
1169
1170      if (isMoreInfoMode && errDir != null)
1171        storeAnnotations(annotType, annotDiff, markedDoc, cleanDoc, errFileWriter);
1172    }//for loop through annotation types
1173    Out.prln("</TABLE>");
1174
1175    try {
1176      if(errFileWriter != null)
1177        errFileWriter.close();
1178    }
1179    catch (Exception ex) {
1180      Out.prln("Exception on close of error file " + errFileWriter + ": "
1181               + ex.getMessage());
1182    }
1183  }//evaluateAllThree
1184
1185  protected void evaluateTwoDocs(Document keyDoc, Document respDoc,
1186                                 File errDir)
1187        throws ResourceInstantiationException {
1188
1189    //first start the table and its header
1190    printTableHeader();
1191
1192    // store annotation diff in .err file
1193    FileWriter errFileWriter = null;
1194    if (isMoreInfoMode && errDir != null) {
1195      StringBuffer docName = new StringBuffer(keyDoc.getName());
1196      docName.replace(
1197          keyDoc.getName().lastIndexOf("."),
1198          docName.length(),
1199          ".err");
1200      File errFile = new File(errDir, docName.toString());
1201      try {
1202        errFileWriter = new FileWriter(errFile, false);
1203      }
1204      catch (Exception ex) {
1205        Out.prln("Exception when creating the error file " + errFile + ": "
1206                 + ex.getMessage());
1207        errFileWriter = null;
1208      }
1209    }
1210
1211    for (int jj= 0; jj< annotTypes.size(); jj++) {
1212      String annotType = (String) annotTypes.get(jj);
1213
1214      AnnotationDiff annotDiff=measureDocs(keyDoc, respDoc, annotType);
1215      //we don't have this annotation type in this document
1216      if (annotDiff == null)
1217        continue;
1218
1219      //increase the number of processed documents
1220      docNumber++;
1221      //add precison and recall to the sums
1222      updateStatistics(annotDiff, annotType);
1223
1224      Out.prln("<TR>");
1225      Out.prln("<TD>" + annotType + "</TD>");
1226
1227      if(isMoreInfoMode) {
1228        Out.prln("<TD>" + annotDiff.getCorrectCount() + "</TD>");
1229        Out.prln("<TD>" + annotDiff.getPartiallyCorrectCount() + "</TD>");
1230        Out.prln("<TD>" + annotDiff.getMissingCount() + "</TD>");
1231        Out.prln("<TD>" + annotDiff.getSpuriousCount() + "</TD>");
1232      }
1233
1234      Out.prln("<TD>" + annotDiff.getPrecisionAverage() + "</TD>");
1235      Out.prln("<TD>" + annotDiff.getRecallAverage() + "</TD>");
1236      //check the recall now
1237      if ( isVerboseMode ) {
1238        Out.prln("<TD>");
1239        if (annotDiff.getRecallAverage() < threshold) {
1240          printAnnotations(annotDiff, keyDoc, respDoc);
1241        }
1242        else {
1243          Out.prln("&nbsp;");
1244        }
1245        Out.prln("</TD>");
1246      }
1247      Out.prln("</TR>");
1248
1249      if (isMoreInfoMode && errDir != null)
1250        storeAnnotations(annotType, annotDiff, keyDoc, respDoc, errFileWriter);
1251    }//for loop through annotation types
1252    Out.prln("</TABLE>");
1253
1254    try {
1255      if(errFileWriter != null)
1256        errFileWriter.close();
1257    }
1258    catch (Exception ex) {
1259      Out.prln("Exception on close of error file " + errFileWriter + ": "
1260               + ex.getMessage());
1261    }
1262  }//evaluateTwoDocs
1263
1264  protected void printTableHeader() {
1265    Out.prln("<TABLE BORDER=1");
1266    Out.pr("<TR> <TD><B>Annotation Type</B></TD> ");
1267
1268    if(isMoreInfoMode)
1269     Out.pr("<TD><B>Correct</B></TD> <TD><B>Partially Correct</B></TD> "
1270             + "<TD><B>Missing</B></TD> <TD><B>Spurious<B></TD>");
1271
1272    Out.pr("<TD><B>Precision</B></TD> <TD><B>Recall</B></TD>");
1273
1274    if (isVerboseMode)
1275      Out.pr("<TD><B>Annotations</B></TD>");
1276
1277    Out.prln("</TR>");
1278  }
1279
1280  protected void updateStatistics(AnnotationDiff annotDiff, String annotType){
1281      precisionSum += annotDiff.getPrecisionAverage();
1282      recallSum += annotDiff.getRecallAverage();
1283      fMeasureSum += annotDiff.getFMeasureAverage();
1284      Double oldPrecision = (Double) precisionByType.get(annotType);
1285      if (oldPrecision == null)
1286        precisionByType.put(annotType,
1287                            new Double(annotDiff.getPrecisionAverage()));
1288      else
1289        precisionByType.put(annotType,
1290                            new Double(oldPrecision.doubleValue() +
1291                                       annotDiff.getPrecisionAverage()));
1292      Integer precCount = (Integer) prCountByType.get(annotType);
1293      if (precCount == null)
1294        prCountByType.put(annotType, new Integer(1));
1295      else
1296        prCountByType.put(annotType, new Integer(precCount.intValue() + 1));
1297
1298
1299      Double oldFMeasure = (Double) fMeasureByType.get(annotType);
1300      if (oldFMeasure == null)
1301        fMeasureByType.put(annotType,
1302                         new Double(annotDiff.getFMeasureAverage()));
1303      else
1304        fMeasureByType.put(annotType,
1305                         new Double(oldFMeasure.doubleValue() +
1306                                    annotDiff.getFMeasureAverage()));
1307      Integer fCount = (Integer) fMeasureCountByType.get(annotType);
1308      if (fCount == null)
1309        fMeasureCountByType.put(annotType, new Integer(1));
1310      else
1311        fMeasureCountByType.put(annotType, new Integer(fCount.intValue() + 1));
1312
1313              Double oldRecall = (Double) recallByType.get(annotType);
1314      if (oldRecall == null)
1315        recallByType.put(annotType,
1316                            new Double(annotDiff.getRecallAverage()));
1317      else
1318        recallByType.put(annotType,
1319                            new Double(oldRecall.doubleValue() +
1320                                       annotDiff.getRecallAverage()));
1321      Integer recCount = (Integer) recCountByType.get(annotType);
1322      if (recCount == null)
1323        recCountByType.put(annotType, new Integer(1));
1324      else
1325        recCountByType.put(annotType, new Integer(recCount.intValue() + 1));
1326
1327      //Update the missing, spurious, correct, and partial counts
1328      Long oldMissingNo = (Long) missingByType.get(annotType);
1329      if (oldMissingNo == null)
1330        missingByType.put(annotType, new Long(annotDiff.getMissingCount()));
1331      else
1332        missingByType.put(annotType,
1333                        new Long(oldMissingNo.longValue() +
1334                                  annotDiff.getMissingCount()));
1335
1336      Long oldCorrectNo = (Long) correctByType.get(annotType);
1337      if (oldCorrectNo == null)
1338        correctByType.put(annotType, new Long(annotDiff.getCorrectCount()));
1339      else
1340        correctByType.put(annotType,
1341                        new Long(oldCorrectNo.longValue() +
1342                                  annotDiff.getCorrectCount()));
1343
1344      Long oldPartialNo = (Long) partialByType.get(annotType);
1345      if (oldPartialNo == null)
1346        partialByType.put(annotType, new Long(annotDiff.getPartiallyCorrectCount()));
1347      else
1348        partialByType.put(annotType,
1349                        new Long(oldPartialNo.longValue() +
1350                                  annotDiff.getPartiallyCorrectCount()));
1351
1352      Long oldSpuriousNo = (Long) spurByType.get(annotType);
1353      if (oldSpuriousNo == null)
1354        spurByType.put(annotType, new Long(annotDiff.getSpuriousCount()));
1355      else
1356        spurByType.put(annotType,
1357                        new Long(oldSpuriousNo.longValue() +
1358                                  annotDiff.getSpuriousCount()));
1359  }
1360
1361  /**
1362   * Update statistics for processed documents
1363   * The same procedure as updateStatistics with different hashTables
1364   */
1365  protected void updateStatisticsProc(AnnotationDiff annotDiff, String annotType){
1366    hasProcessed = true;
1367      proc_precisionSum += annotDiff.getPrecisionAverage();
1368      proc_recallSum += annotDiff.getRecallAverage();
1369      proc_fMeasureSum += annotDiff.getFMeasureAverage();
1370      Double oldPrecision = (Double) proc_precisionByType.get(annotType);
1371      if (oldPrecision == null)
1372        proc_precisionByType.put(annotType,
1373                            new Double(annotDiff.getPrecisionAverage()));
1374      else
1375        proc_precisionByType.put(annotType,
1376                            new Double(oldPrecision.doubleValue() +
1377                                       annotDiff.getPrecisionAverage()));
1378      Integer precCount = (Integer) proc_prCountByType.get(annotType);
1379      if (precCount == null)
1380        proc_prCountByType.put(annotType, new Integer(1));
1381      else
1382        proc_prCountByType.put(annotType, new Integer(precCount.intValue() + 1));
1383
1384
1385      Double oldFMeasure = (Double) proc_fMeasureByType.get(annotType);
1386      if (oldFMeasure == null)
1387        proc_fMeasureByType.put(annotType,
1388                         new Double(annotDiff.getFMeasureAverage()));
1389      else
1390        proc_fMeasureByType.put(annotType,
1391                         new Double(oldFMeasure.doubleValue() +
1392                                    annotDiff.getFMeasureAverage()));
1393      Integer fCount = (Integer) proc_fMeasureCountByType.get(annotType);
1394      if (fCount == null)
1395        proc_fMeasureCountByType.put(annotType, new Integer(1));
1396      else
1397        proc_fMeasureCountByType.put(annotType, new Integer(fCount.intValue() + 1));
1398
1399      Double oldRecall = (Double) proc_recallByType.get(annotType);
1400      if (oldRecall == null)
1401        proc_recallByType.put(annotType,
1402                            new Double(annotDiff.getRecallAverage()));
1403      else
1404        proc_recallByType.put(annotType,
1405                            new Double(oldRecall.doubleValue() +
1406                                       annotDiff.getRecallAverage()));
1407      Integer recCount = (Integer) proc_recCountByType.get(annotType);
1408      if (recCount == null)
1409        proc_recCountByType.put(annotType, new Integer(1));
1410      else
1411        proc_recCountByType.put(annotType, new Integer(recCount.intValue() + 1));
1412
1413      //Update the missing, spurious, correct, and partial counts
1414      Long oldMissingNo = (Long) proc_missingByType.get(annotType);
1415      if (oldMissingNo == null)
1416        proc_missingByType.put(annotType, new Long(annotDiff.getMissingCount()));
1417      else
1418        proc_missingByType.put(annotType,
1419                        new Long(oldMissingNo.longValue() +
1420                                  annotDiff.getMissingCount()));
1421
1422      Long oldCorrectNo = (Long) proc_correctByType.get(annotType);
1423      if (oldCorrectNo == null)
1424        proc_correctByType.put(annotType, new Long(annotDiff.getCorrectCount()));
1425      else
1426        proc_correctByType.put(annotType,
1427                        new Long(oldCorrectNo.longValue() +
1428                                  annotDiff.getCorrectCount()));
1429
1430      Long oldPartialNo = (Long) proc_partialByType.get(annotType);
1431      if (oldPartialNo == null)
1432        proc_partialByType.put(annotType, new Long(annotDiff.getPartiallyCorrectCount()));
1433      else
1434        proc_partialByType.put(annotType,
1435                        new Long(oldPartialNo.longValue() +
1436                                  annotDiff.getPartiallyCorrectCount()));
1437
1438      Long oldSpuriousNo = (Long) proc_spurByType.get(annotType);
1439      if (oldSpuriousNo == null)
1440        proc_spurByType.put(annotType, new Long(annotDiff.getSpuriousCount()));
1441      else
1442        proc_spurByType.put(annotType,
1443                        new Long(oldSpuriousNo.longValue() +
1444                                  annotDiff.getSpuriousCount()));
1445  }
1446
1447  public void printStatistics() {
1448
1449    Out.prln("<H2> Statistics </H2>");
1450
1451/*
1452    Out.prln("<H3> Precision </H3>");
1453    if (precisionByType != null && !precisionByType.isEmpty()) {
1454      Iterator iter = precisionByType.keySet().iterator();
1455      while (iter.hasNext()) {
1456        String annotType = (String) iter.next();
1457        Out.prln(annotType + ": "
1458          + ((Double)precisionByType.get(annotType)).doubleValue()
1459              /
1460              ((Integer)prCountByType.get(annotType)).intValue()
1461          + "<P>");
1462      }//while
1463    }
1464    Out.prln("Overall precision: " + getPrecisionAverage() + "<P>");
1465
1466    Out.prln("<H3> Recall </H3>");
1467    if (recallByType != null && !recallByType.isEmpty()) {
1468      Iterator iter = recallByType.keySet().iterator();
1469      while (iter.hasNext()) {
1470        String annotType = (String) iter.next();
1471        Out.prln(annotType + ": "
1472          + ((Double)recallByType.get(annotType)).doubleValue()
1473              /
1474              ((Integer)recCountByType.get(annotType)).intValue()
1475          + "<P>");
1476      }//while
1477    }
1478
1479    Out.prln("Overall recall: " + getRecallAverage()
1480             + "<P>");
1481*/
1482    if (annotTypes == null) {
1483      Out.prln("No types given for evaluation, cannot obtain precision/recall");
1484      return;
1485    }
1486    Out.prln("<table border=1>");
1487    Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Correct</B></TD>" +
1488              "<TD><B>Partially Correct</B></TD> <TD><B>Missing</B></TD>" +
1489              "<TD><B>Spurious</B></TD> <TD><B>Precision</B></TD>" +
1490              "<TD><B>Recall</B></TD> <TD><B>F-Measure</B></TD> </TR>");
1491    String annotType;
1492    for (int i = 0; i < annotTypes.size(); i++) {
1493      annotType = (String) annotTypes.get(i);
1494      printStatsForType(annotType);
1495    }//for
1496    Out.prln("</table>");
1497  } // updateStatisticsProc
1498
1499  protected void printStatsForType(String annotType){
1500    long correct = (correctByType.get(annotType) == null)? 0 :
1501                      ((Long)correctByType.get(annotType)).longValue();
1502    long partial = (partialByType.get(annotType) == null)? 0 :
1503                      ((Long)partialByType.get(annotType)).longValue();
1504    long spurious = (spurByType.get(annotType) == null)? 0 :
1505                      ((Long)spurByType.get(annotType)).longValue();
1506    long missing = (missingByType.get(annotType) == null)? 0:
1507                      ((Long)missingByType.get(annotType)).longValue();
1508    long actual = correct + partial + spurious;
1509    long possible = correct + partial + missing;
1510    //precision strict is correct/actual
1511    //precision is (correct + 0.5 * partially correct)/actual
1512    double precision = (correct + 0.5 * partial) / actual;
1513    //recall strict is correct/possible
1514    double recall = (correct + 0.5*partial)/possible;
1515    //F-measure = ( (beta*beta + 1)*P*R ) / ((beta*beta*P) + R)
1516    double fmeasure =
1517      ((beta*beta + 1)*precision*recall)
1518      /
1519      ((beta*beta*precision) + recall);
1520
1521    long proc_correct=0;
1522    long proc_partial=0;
1523    long proc_spurious=0;
1524    long proc_missing=0;
1525    long proc_actual=0;
1526    long proc_possible=0;
1527    double proc_precision=0;
1528    double proc_recall=0;
1529    double proc_fmeasure=0;
1530
1531    if(hasProcessed) {
1532      // calculate values for processed
1533      proc_correct = (proc_correctByType.get(annotType) == null)? 0 :
1534                        ((Long)proc_correctByType.get(annotType)).longValue();
1535      proc_partial = (proc_partialByType.get(annotType) == null)? 0 :
1536                        ((Long)proc_partialByType.get(annotType)).longValue();
1537      proc_spurious = (proc_spurByType.get(annotType) == null)? 0 :
1538                        ((Long)proc_spurByType.get(annotType)).longValue();
1539      proc_missing = (proc_missingByType.get(annotType) == null)? 0:
1540                        ((Long)proc_missingByType.get(annotType)).longValue();
1541      proc_actual = proc_correct + proc_partial + proc_spurious;
1542      proc_possible = proc_correct + proc_partial + proc_missing;
1543      //precision strict is correct/actual
1544      //precision is (correct + 0.5 * partially correct)/actual
1545      proc_precision = (proc_correct + 0.5*proc_partial)/proc_actual;
1546      //recall strict is correct/possible
1547      proc_recall = (proc_correct + 0.5*proc_partial)/proc_possible;
1548      //F-measure = ( (beta*beta + 1)*P*R ) / ((beta*beta*P) + R)
1549      proc_fmeasure =
1550        ((beta*beta + 1)*proc_precision*proc_recall)
1551        /
1552        ((beta*beta*proc_precision) + proc_recall);
1553    }
1554
1555    // output data
1556    Out.prln("<TR>");
1557    if(hasProcessed)
1558      Out.prln("<TD>" + annotType+ "_new"  + "</TD>");
1559    else
1560      Out.prln("<TD>" + annotType + "</TD>");
1561
1562    Out.prln("<TD>" + correct + "</TD>");
1563    Out.prln("<TD>" + partial + "</TD>");
1564    Out.prln("<TD>" + missing + "</TD>");
1565    Out.prln("<TD>" + spurious + "</TD>");
1566
1567    if(hasProcessed && (precision < proc_precision))
1568      Out.prln("<TD><Font color=red>" + precision + "</TD>");
1569      else if(hasProcessed && (precision > proc_precision))
1570        Out.prln("<TD><Font color=blue>" + precision + "</TD>");
1571        else
1572          Out.prln("<TD>" + precision + "</TD>");
1573    if(hasProcessed && (recall < proc_recall))
1574      Out.prln("<TD><Font color=red>" + recall + "</TD>");
1575      else if(hasProcessed && (recall > proc_recall))
1576        Out.prln("<TD><Font color=blue>" + recall + "</TD>");
1577        else
1578          Out.prln("<TD>" + recall + "</TD>");
1579    Out.prln("<TD>" + fmeasure + "</TD>");
1580    Out.prln("</TR>");
1581
1582    if(hasProcessed) {
1583      // output data
1584      Out.prln("<TR>");
1585      Out.prln("<TD>" + annotType + "_old" + "</TD>");
1586
1587      Out.prln("<TD>" + proc_correct + "</TD>");
1588      Out.prln("<TD>" + proc_partial + "</TD>");
1589      Out.prln("<TD>" + proc_missing + "</TD>");
1590      Out.prln("<TD>" + proc_spurious + "</TD>");
1591
1592      if(precision < proc_precision)
1593        Out.prln("<TD><Font color=red>" + proc_precision + "</TD>");
1594        else if(precision > proc_precision)
1595          Out.prln("<TD><Font color=blue>" + proc_precision + "</TD>");
1596          else
1597            Out.prln("<TD>" + proc_precision + "</TD>");
1598      if(recall < proc_recall)
1599        Out.prln("<TD><Font color=red>" + proc_recall + "</TD>");
1600        else if(recall > proc_recall)
1601          Out.prln("<TD><Font color=blue>" + proc_recall + "</TD>");
1602          else
1603            Out.prln("<TD>" + proc_recall + "</TD>");
1604      Out.prln("<TD>" + proc_fmeasure + "</TD>");
1605      Out.prln("</TR>");
1606    }
1607  }//printStatsForType
1608
1609  protected AnnotationDiff measureDocs(
1610    Document keyDoc, Document respDoc, String annotType)
1611      throws ResourceInstantiationException {
1612
1613    if (keyDoc == null || respDoc == null)
1614      return null;
1615
1616    if (annotSetName != null
1617        && keyDoc.getAnnotations(annotSetName).get(annotType) == null)
1618      return null;
1619    else if ((annotSetName == null || annotSetName.equals(""))
1620        && keyDoc.getAnnotations().get(annotType) == null)
1621      return null;
1622
1623    // create the annotation schema needed for AnnotationDiff
1624    AnnotationSchema annotationSchema = new AnnotationSchema();
1625
1626    // set annotation type
1627    annotationSchema.setAnnotationName(annotType);
1628    // create an annotation diff
1629    AnnotationDiff annotDiff = new AnnotationDiff();
1630    annotDiff.setTextMode(new Boolean(true));
1631    annotDiff.setAnnotationSchema(annotationSchema);
1632    annotDiff.setKeyDocument(keyDoc);
1633    annotDiff.setResponseDocument(respDoc);
1634    annotDiff.setKeyAnnotationSetName(annotSetName);
1635    annotDiff.setResponseAnnotationSetName(outputSetName);
1636    // set feature names set for annotation diff
1637    annotDiff.setKeyFeatureNamesSet(diffFeaturesSet);
1638    annotDiff.init();
1639
1640    return annotDiff;
1641  } // measureDocs
1642
1643  protected void storeAnnotations(String type, AnnotationDiff annotDiff,
1644                  Document keyDoc, Document respDoc, FileWriter errFileWriter) {
1645    if(errFileWriter == null) return; // exit on "no file"
1646
1647    try {
1648      // extract and store annotations
1649      Comparator comp = new OffsetComparator();
1650      TreeSet sortedSet = new TreeSet(comp);
1651      Set missingSet =
1652          annotDiff.getAnnotationsOfType(AnnotationDiff.MISSING_TYPE);
1653      sortedSet.clear();
1654      sortedSet.addAll(missingSet);
1655      storeAnnotations(type+".miss", sortedSet, keyDoc, errFileWriter);
1656      Set spuriousSet =
1657          annotDiff.getAnnotationsOfType(AnnotationDiff.SPURIOUS_TYPE);
1658      sortedSet.clear();
1659      sortedSet.addAll(spuriousSet);
1660      storeAnnotations(type+".spur", sortedSet, respDoc, errFileWriter);
1661      Set partialSet =
1662          annotDiff.getAnnotationsOfType(AnnotationDiff.PARTIALLY_CORRECT_TYPE);
1663      sortedSet.clear();
1664      sortedSet.addAll(partialSet);
1665      storeAnnotations(type+".part", sortedSet, respDoc, errFileWriter);
1666    } catch (Exception ex) {
1667      Out.prln("Exception on close of error file "+errFileWriter+": "
1668               +ex.getMessage());
1669    }
1670  }// storeAnnotations
1671
1672  protected void storeAnnotations(String type, Set set, Document doc,
1673                                  FileWriter file) throws IOException{
1674
1675    if (set == null || set.isEmpty())
1676      return;
1677
1678    Iterator iter = set.iterator();
1679    Annotation ann;
1680    while (iter.hasNext()) {
1681      ann = (Annotation) iter.next();
1682      file.write(type);
1683      file.write(".");
1684      file.write(doc.getContent().toString().substring(
1685          ann.getStartNode().getOffset().intValue(),
1686          ann.getEndNode().getOffset().intValue()));
1687      file.write(".");
1688      file.write(ann.getStartNode().getOffset().toString());
1689      file.write(".");
1690      file.write(ann.getEndNode().getOffset().toString());
1691      file.write("\n");
1692    }//while
1693  }// storeAnnotations
1694
1695  protected void printAnnotations(AnnotationDiff annotDiff,
1696                    Document keyDoc, Document respDoc) {
1697    Out.pr("MISSING ANNOTATIONS in the automatic texts: ");
1698    Set missingSet =
1699      annotDiff.getAnnotationsOfType(AnnotationDiff.MISSING_TYPE);
1700    printAnnotations(missingSet, keyDoc);
1701    Out.prln("<BR>");
1702
1703    Out.pr("SPURIOUS ANNOTATIONS in the automatic texts: ");
1704    Set spuriousSet =
1705      annotDiff.getAnnotationsOfType(AnnotationDiff.SPURIOUS_TYPE);
1706    printAnnotations(spuriousSet, respDoc);
1707    Out.prln("</BR>");
1708
1709    Out.pr("PARTIALLY CORRECT ANNOTATIONS in the automatic texts: ");
1710    Set partialSet =
1711      annotDiff.getAnnotationsOfType(AnnotationDiff.PARTIALLY_CORRECT_TYPE);
1712    printAnnotations(partialSet, respDoc);
1713  }
1714
1715  protected void printAnnotations(Set set, Document doc) {
1716    if (set == null || set.isEmpty())
1717      return;
1718
1719    Iterator iter = set.iterator();
1720    while (iter.hasNext()) {
1721      Annotation ann = (Annotation) iter.next();
1722      Out.prln(
1723        "<B>" +
1724        doc.getContent().toString().substring(
1725          ann.getStartNode().getOffset().intValue(),
1726          ann.getEndNode().getOffset().intValue()) +
1727        "</B>: <I>[" + ann.getStartNode().getOffset() +
1728        "," + ann.getEndNode().getOffset() + "]</I>"
1729//        + "; features" + ann.getFeatures()
1730        );
1731    }//while
1732  }//printAnnotations
1733
1734  /**
1735   * The directory from which we should generate/evaluate the corpus
1736   */
1737  private File startDir;
1738  private File currDir;
1739  private static List annotTypes;
1740
1741  private Controller application = null;
1742  private File applicationFile = null;
1743
1744  //collect the sum of all precisions and recalls of all docs
1745  //and the number of docs, so I can calculate the average for
1746  //the corpus at the end
1747  private double precisionSum = 0;
1748  private double recallSum = 0;
1749  private double fMeasureSum = 0;
1750  private HashMap precisionByType = new HashMap();
1751  private HashMap prCountByType = new HashMap();
1752  private HashMap recallByType = new HashMap();
1753  private HashMap recCountByType = new HashMap();
1754  private HashMap fMeasureByType = new HashMap();
1755  private HashMap fMeasureCountByType = new HashMap();
1756
1757  private HashMap missingByType = new HashMap();
1758  private HashMap spurByType = new HashMap();
1759  private HashMap correctByType = new HashMap();
1760  private HashMap partialByType = new HashMap();
1761
1762  // statistic for processed
1763  static boolean hasProcessed = false;
1764  private double proc_precisionSum = 0;
1765  private double proc_recallSum = 0;
1766  private double proc_fMeasureSum = 0;
1767  private HashMap proc_precisionByType = new HashMap();
1768  private HashMap proc_prCountByType = new HashMap();
1769  private HashMap proc_recallByType = new HashMap();
1770  private HashMap proc_recCountByType = new HashMap();
1771  private HashMap proc_fMeasureByType = new HashMap();
1772  private HashMap proc_fMeasureCountByType = new HashMap();
1773
1774  private HashMap proc_missingByType = new HashMap();
1775  private HashMap proc_spurByType = new HashMap();
1776  private HashMap proc_correctByType = new HashMap();
1777  private HashMap proc_partialByType = new HashMap();
1778
1779  double beta = 1;
1780
1781  private int docNumber = 0;
1782
1783  /**
1784   * If true, the corpus tool will generate the corpus, otherwise it'll
1785   * run in evaluate mode
1786   */
1787  private boolean isGenerateMode = false;
1788
1789  /**
1790   * If true - show annotations for docs below threshold
1791   */
1792  private boolean isVerboseMode = false;
1793
1794  /**
1795   * If true - show more info in document table
1796   */
1797  private boolean isMoreInfoMode = false;
1798
1799  /**
1800   * The list of features used in the AnnotationDiff separated by comma
1801   * Example: "class;inst"
1802   */
1803  private Set diffFeaturesSet;
1804
1805  /**
1806   * If true, the corpus tool will evaluate stored against the human-marked
1807   * documents
1808   */
1809  private boolean isMarkedStored = false;
1810  private boolean isMarkedClean = false;
1811  //whether marked are in a DS, not xml
1812  private boolean isMarkedDS = false;
1813
1814  private String annotSetName = "Key";
1815  private String outputSetName = null;
1816
1817  private double threshold = 0.5;
1818  private Properties configs = new Properties();
1819  private static int corpusWordCount = 0;
1820
1821  private String documentEncoding = "";
1822
1823  /** String to print when wrong command-line args */
1824  private static String usage =
1825    "usage: CorpusBenchmarkTool [-generate|-marked_stored|-marked_clean] "
1826    +"[-verbose] [-moreinfo] directory-name application";
1827
1828}