1   /*
2    *  CorpusBenchmarkTool.java
3    *
4    *  Copyright (c) 1998-2004, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 24/Oct/2001
12   *
13   *  $Id: CorpusBenchmarkTool.java,v 1.51 2004/11/08 15:00:55 valyt Exp $
14   */
15  
16  package gate.util;
17  
18  import java.io.*;
19  import java.util.*;
20  
21  import gate.*;
22  import gate.util.AnnotationDiffer;
23  import gate.creole.*;
24  import gate.persist.PersistenceException;
25  import gate.persist.SerialDataStore;
26  
27  public class CorpusBenchmarkTool {
28    private static final String MARKED_DIR_NAME = "marked";
29    private static final String CLEAN_DIR_NAME = "clean";
30    private static final String CVS_DIR_NAME = "Cvs";
31    private static final String PROCESSED_DIR_NAME = "processed";
32    private static final String ERROR_DIR_NAME = "err";
33  
34    private static final boolean DEBUG = true;
35  
36    public CorpusBenchmarkTool() {}
37  
38    public void initPRs() {
39      try {
40        if (applicationFile == null)
41          Out.prln("Application not set!");
42        Out.prln("App file is: " + applicationFile.getAbsolutePath());
43        application = (Controller) gate.util.persistence.PersistenceManager
44                                     .loadObjectFromFile(applicationFile);
45      } catch (Exception ex) {
46        throw new GateRuntimeException("Corpus Benchmark Tool:"+ex.getMessage());
47      }
48    }//initPRs
49  
50    public void unloadPRs() {
51      //we have nothing to unload if no PRs are loaded
52      if (isMarkedStored)
53        return;
54  
55    }
56  
57    public void execute() {
58  /*
59      Out.prln("Flags Gen Cln Str Vrb Minf: "
60               + isGenerateMode +" "+ isMarkedClean +" "+ isMarkedStored
61               +" "+ isVerboseMode +" "+ isMoreInfoMode);
62  */
63      execute(startDir);
64  /*    if (application != null) {
65        Iterator iter = new ArrayList(application.getPRs()).iterator();
66        while (iter.hasNext())
67          Factory.deleteResource((Resource) iter.next());
68  
69        Factory.deleteResource(application);
70      }*/
71    }
72  
73    public void init() {
74      //first read the corpus_tool.properties file
75      File propFile = new File("corpus_tool.properties");
76      Out.prln(propFile.getAbsolutePath());
77      if (propFile.exists()) {
78        try {
79          InputStream inputStream = new FileInputStream(propFile);
80          this.configs.load(inputStream);
81          String thresholdString = this.configs.getProperty("threshold");
82          if (thresholdString != null && !thresholdString.equals("")) {
83            this.threshold = (new Double(thresholdString)).doubleValue();
84            Out.prln("New threshold is: " + this.threshold + "<P>\n");
85          }
86          String setName = this.configs.getProperty("annotSetName");
87          if (setName != null && !setName.equals("")) {
88            Out.prln("Annotation set in marked docs is: " + setName + " <P>\n");
89            this.annotSetName = setName;
90          }
91          setName = this.configs.getProperty("outputSetName");
92          if (setName != null && !setName.equals("")) {
93            Out.prln("Annotation set in processed docs is: " + setName + " <P>\n");
94            this.outputSetName = setName;
95          }
96          String encodingString = this.configs.getProperty("encoding");
97          if (encodingString != null && !encodingString.equals("")) {
98            this.documentEncoding = encodingString;
99            Out.prln("New encoding is: " + this.documentEncoding + "<P>\n");
100         }
101         String types = this.configs.getProperty("annotTypes");
102         if (types != null && !types.equals("")) {
103           Out.prln("Using annotation types from the properties file. <P>\n");
104           StringTokenizer strTok = new StringTokenizer(types, ";");
105           annotTypes = new ArrayList();
106           while (strTok.hasMoreTokens())
107             annotTypes.add(strTok.nextToken());
108         } else {
109           annotTypes = new ArrayList();
110           annotTypes.add("Organization");
111           annotTypes.add("Person");
112           annotTypes.add("Date");
113           annotTypes.add("Location");
114           annotTypes.add("Address");
115           annotTypes.add("Money");
116           annotTypes.add("Percent");
117           annotTypes.add("GPE");
118           annotTypes.add("Facility");
119         }
120         String features = this.configs.getProperty("annotFeatures");
121         HashSet result = new HashSet();
122         if (features != null && !features.equals("")) {
123           Out.pr("Using annotation features from the properties file. \n");
124           java.util.StringTokenizer tok =
125               new java.util.StringTokenizer(features, ";");
126           String current;
127           while(tok.hasMoreTokens()) {
128             current = tok.nextToken();
129             result.add(current);
130           } // while
131         }
132         diffFeaturesSet = result;
133         Out.prln("Features: "+diffFeaturesSet+" <P>\n");
134 
135       } catch (IOException ex) {
136         //just ignore the file and go on with the defaults
137         this.configs = new Properties();
138       }
139     } else
140       this.configs = new Properties();
141 
142 
143     //we only initialise the PRs if they are going to be used
144     //for processing unprocessed documents
145     if (!this.isMarkedStored)
146       initPRs();
147 
148   }
149 
150   public void execute(File dir) {
151     if (dir == null)
152       return;
153     //first set the current directory to be the given one
154     currDir = dir;
155 
156     File processedDir = null;
157     File cleanDir = null;
158     File markedDir = null;
159     File errorDir = null;
160 
161     ArrayList subDirs = new ArrayList();
162     File[] dirArray = currDir.listFiles();
163     if(dirArray == null) return;
164     for (int i = 0; i < dirArray.length; i++) {
165       if (dirArray[i].isFile() || dirArray[i].getName().equals(CVS_DIR_NAME))
166         continue;
167       if (dirArray[i].getName().equals(CLEAN_DIR_NAME))
168         cleanDir = dirArray[i];
169       else if (dirArray[i].getName().equals(MARKED_DIR_NAME))
170         markedDir = dirArray[i];
171       else if (dirArray[i].getName().equals(PROCESSED_DIR_NAME))
172         processedDir = dirArray[i];
173       else if (dirArray[i].getName().equals(ERROR_DIR_NAME))
174         errorDir = dirArray[i];
175       else
176         subDirs.add(dirArray[i]);
177     }
178 
179     if(cleanDir == null) return;
180     Out.prln("Processing directory: " + currDir + "<P>");
181 
182     if (this.isGenerateMode)
183       generateCorpus(cleanDir, processedDir);
184     else
185       evaluateCorpus(cleanDir, processedDir, markedDir, errorDir);
186 
187     //if no more subdirs left, return
188     if (subDirs.isEmpty())
189       return;
190 
191     //there are more subdirectories to traverse, so iterate through
192     for (int j = 0; j < subDirs.size(); j++)
193       execute((File) subDirs.get(j));
194 
195   }//execute(dir)
196 
197 
198   public static void main(String[] args) throws GateException {
199     Out.prln("<HTML>");
200     Out.prln("<HEAD>");
201     Out.prln("<TITLE> Corpus benchmark tool: ran with args ");
202     for(int argC=0; argC < args.length; ++argC)
203       Out.pr(args[argC]+" ");
204     Out.pr(" on " + new Date() + "</TITLE> </HEAD>");
205     Out.prln("<BODY>");
206     Out.prln("Please wait while GATE tools are initialised. <P>");
207     // initialise GATE
208     Gate.init();
209 
210     CorpusBenchmarkTool corpusTool = new CorpusBenchmarkTool();
211 
212     List inputFiles = null;
213     if(args.length < 1) throw new GateException(usage);
214     int i = 0;
215     while (i < args.length && args[i].startsWith("-")) {
216       if(args[i].equals("-generate")) {
217         Out.prln("Generating the corpus... <P>");
218         corpusTool.setGenerateMode(true);
219       } else if (args[i].equals("-marked_clean")) {
220         Out.prln("Evaluating current grammars against human-annotated...<P>");
221         corpusTool.setMarkedClean(true);
222       } else if (args[i].equals("-marked_stored")) {
223         Out.prln("Evaluating stored documents against human-annotated...<P>");
224         corpusTool.setMarkedStored(true);
225       } else if (args[i].equals("-marked_ds")) {
226         Out.prln("Looking for marked docs in a datastore...<P>");
227         corpusTool.setMarkedDS(true);
228       } else if (args[i].equals("-verbose")) {
229         Out.prln("Running in verbose mode. Will generate annotation " +
230           "information when precision/recall are lower than " +
231           corpusTool.getThreshold() +"<P>");
232         corpusTool.setVerboseMode(true);
233       } else if (args[i].equals("-moreinfo")) {
234         Out.prln("Show more details in document table...<P>");
235         corpusTool.setMoreInfo(true);
236       }
237       i++; //just ignore the option, which we do not recognise
238     }//while
239 
240     String dirName = args[i];
241     File dir = new File(dirName);
242     if (!dir.isDirectory())
243       throw new GateException(usage);
244 
245     //get the last argument which is the application
246     i++;
247     String appName = args[i];
248     File appFile = new File(appName);
249     if (!appFile.isFile())
250       throw new GateException(usage);
251     else
252       corpusTool.setApplicationFile(appFile);
253 
254     corpusTool.init();
255     corpusWordCount = 0;
256 
257     Out.prln("Measuring annotaitions of types: " + CorpusBenchmarkTool.annotTypes + "<P>");
258 
259     corpusTool.setStartDirectory(dir);
260     corpusTool.execute();
261     //if we're not generating the corpus, then print the precision and recall
262     //statistics for the processed corpus
263     if (! corpusTool.getGenerateMode())
264       corpusTool.printStatistics();
265 
266     Out.prln("<BR>Overall average precision: " + corpusTool.getPrecisionAverage());
267     Out.prln("<BR>Overall average recall: " + corpusTool.getRecallAverage());
268     if(corpusWordCount == 0)
269       Out.prln("<BR>No Token annotations to count words in the corpus.");
270     else
271       Out.prln("<BR>Overall word count: " + corpusWordCount);
272 
273 
274     if(hasProcessed) {
275       Out.prln("<P>Old Processed: ");
276       Out.prln("<BR>Overall average precision: "
277                + corpusTool.getPrecisionAverageProc());
278       Out.prln("<BR>Overall average recall: "
279                + corpusTool.getRecallAverageProc());
280     }
281     Out.prln("<BR>Finished! <P>");
282     Out.prln("</BODY>");
283     Out.prln("</HTML>");
284 
285     System.exit(0);
286 
287   }//main
288 
289   public void setGenerateMode(boolean mode) {
290     isGenerateMode = mode;
291   }//setGenerateMode
292 
293   public boolean getGenerateMode() {
294     return isGenerateMode;
295   }//getGenerateMode
296 
297   public boolean getVerboseMode() {
298     return isVerboseMode;
299   }//getVerboseMode
300 
301   public void setVerboseMode(boolean mode) {
302     isVerboseMode = mode;
303   }//setVerboseMode
304 
305   public void setMoreInfo(boolean mode) {
306     isMoreInfoMode = mode;
307   } // setMoreInfo
308 
309   public boolean getMoreInfo() {
310     return isMoreInfoMode;
311   } // getMoreInfo
312 
313   public void setDiffFeaturesList(Set features) {
314     diffFeaturesSet = features;
315   } // setDiffFeaturesList
316 
317   public Set getDiffFeaturesList() {
318     return diffFeaturesSet;
319   } // getDiffFeaturesList
320 
321   public void setMarkedStored(boolean mode) {
322     isMarkedStored = mode;
323   }// setMarkedStored
324 
325 
326   public boolean getMarkedStored() {
327     return isMarkedStored;
328   }// getMarkedStored
329 
330   public void setMarkedClean(boolean mode) {
331     isMarkedClean = mode;
332   }//
333 
334   public boolean getMarkedClean() {
335     return isMarkedClean;
336   }//
337 
338   public void setMarkedDS(boolean mode) {
339     isMarkedDS = mode;
340   }//
341 
342   public boolean getMarkedDS() {
343     return isMarkedDS;
344   }//
345 
346   public void setApplicationFile(File newAppFile) {
347     applicationFile = newAppFile;
348   }
349 
350   /**
351    * Returns the average precision over the entire set of processed documents.
352    * <P>
353    * If the tool has been evaluating the original documents against the
354    * previously-stored automatically annotated ones, then the precision
355    * will be the average precision on those two sets. <P>
356    * If the tool was run in -marked mode, i.e., was evaluating the stored
357    * automatically processed ones against the human-annotated ones, then
358    * the precision will be the average precision on those two sets of documents.
359    */
360   public double getPrecisionAverage() {
361     return (double)precisionSum/docNumber;
362   }
363 
364   /**
365    * Returns the average recall over the entire set of processed documents.
366    * <P>
367    * If the tool has been evaluating the original documents against the
368    * previously-stored automatically annotated ones, then the recall
369    * will be the average recall on those two sets. <P>
370    * If the tool was run in -marked mode, i.e., was evaluating the stored
371    * automatically processed ones against the human-annotated ones, then
372    * the recall will be the average recall on those two sets of documents.
373    */
374   public double getRecallAverage() {
375     return (double)recallSum/docNumber;
376   }
377 
378   /** For processed documents */
379   public double getPrecisionAverageProc() {
380     return (double)proc_precisionSum/docNumber;
381   }
382   public double getRecallAverageProc() {
383     return (double)proc_recallSum/docNumber;
384   }
385 
386 
387   public boolean isGenerateMode() {
388     return isGenerateMode == true;
389   }//isGenerateMode
390 
391   public double getThreshold() {
392     return threshold;
393   }
394 
395   public void setThreshold(double newValue) {
396     threshold = newValue;
397   }
398 
399   public File getStartDirectory() {
400     return startDir;
401   }//getStartDirectory
402 
403   public void setStartDirectory(File dir) {
404     startDir = dir;
405   }//setStartDirectory
406 
407   protected void generateCorpus(File fileDir, File outputDir) {
408     //1. check if we have input files
409     if (fileDir == null)
410       return;
411     //2. create the output directory or clean it up if needed
412     File outDir = outputDir;
413     if (outputDir == null) {
414       outDir = new File(currDir, PROCESSED_DIR_NAME);
415     } else {
416       // get rid of the directory, coz datastore wants it clean
417       if (!Files.rmdir(outDir))
418         Out.prln("cannot delete old output directory: " + outDir);
419     }
420     outDir.mkdir();
421 
422     //create the datastore and process each document
423     try {
424       SerialDataStore sds = new SerialDataStore(outDir.toURL().toString());
425       sds.create();
426       sds.open();
427 
428       File[] files = fileDir.listFiles();
429       for (int i=0; i < files.length; i++) {
430         if (!files[i].isFile())
431           continue;
432         // create a document
433         Out.prln("Processing and storing document: " + files[i].toURL() +"<P>");
434 
435         FeatureMap params = Factory.newFeatureMap();
436         params.put(Document.DOCUMENT_URL_PARAMETER_NAME, files[i].toURL());
437         params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
438 
439         FeatureMap features = Factory.newFeatureMap();
440 //        Gate.setHiddenAttribute(features, true);
441 
442         // create the document
443         Document doc = (Document) Factory.createResource(
444           "gate.corpora.DocumentImpl", params, features
445         );
446 
447         doc.setName(files[i].getName());
448         if (doc == null)
449           continue;
450         processDocument(doc);
451         LanguageResource lr = sds.adopt(doc, null);
452         sds.sync(lr);
453         Factory.deleteResource(doc);
454         Factory.deleteResource(lr);
455       }//for
456       sds.close();
457     } catch (java.net.MalformedURLException ex) {
458       throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
459     } catch (PersistenceException ex1) {
460       throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
461     } catch (ResourceInstantiationException ex2) {
462       throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
463     } catch (gate.security.SecurityException ex3) {
464       throw new GateRuntimeException("CorpusBenchmark: " + ex3.getMessage());
465     }
466   }//generateCorpus
467 
468   protected void evaluateCorpus(File fileDir,
469                     File processedDir, File markedDir,
470                     File errorDir) {
471     //1. check if we have input files and the processed Dir
472     if (fileDir == null || !fileDir.exists())
473       return;
474     if (processedDir == null || !processedDir.exists())
475       //if the user wants evaluation of marked and stored that's not possible
476       if (isMarkedStored) {
477         Out.prln("Cannot evaluate because no processed documents exist.");
478         return;
479       }
480       else
481         isMarkedClean = true;
482 
483     // create the error directory or clean it up if needed
484     File errDir = null;
485     if(isMoreInfoMode) {
486       errDir = errorDir;
487       if (errDir == null) {
488         errDir = new File(currDir, ERROR_DIR_NAME);
489       }
490       else {
491         // get rid of the directory, coz we wants it clean
492         if (!Files.rmdir(errDir))
493           Out.prln("cannot delete old error directory: " + errDir);
494       }
495       Out.prln("Create error directory: " + errDir + "<BR><BR>");
496       errDir.mkdir();
497     }
498 
499     //looked for marked texts only if the directory exists
500     boolean processMarked = markedDir != null && markedDir.exists();
501     if (!processMarked && (isMarkedStored || isMarkedClean)) {
502         Out.prln("Cannot evaluate because no human-annotated documents exist.");
503         return;
504     }
505 
506     if (isMarkedStored) {
507       evaluateMarkedStored(markedDir, processedDir, errDir);
508       return;
509     } else if (isMarkedClean) {
510       evaluateMarkedClean(markedDir, fileDir, errDir);
511       return;
512     }
513 
514     Document persDoc = null;
515     Document cleanDoc = null;
516     Document markedDoc = null;
517 
518     //open the datastore and process each document
519     try {
520       //open the data store
521       DataStore sds = Factory.openDataStore
522                       ("gate.persist.SerialDataStore",
523                        processedDir.toURL().toExternalForm());
524 
525       List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
526       for (int i=0; i < lrIDs.size(); i++) {
527         String docID = (String) lrIDs.get(i);
528 
529         //read the stored document
530         FeatureMap features = Factory.newFeatureMap();
531         features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
532         features.put(DataStore.LR_ID_FEATURE_NAME, docID);
533         FeatureMap hparams = Factory.newFeatureMap();
534 //        Gate.setHiddenAttribute(hparams, true);
535 
536         persDoc = (Document) Factory.createResource(
537                                     "gate.corpora.DocumentImpl",
538                                     features, hparams);
539 
540 
541         if(isMoreInfoMode) {
542           StringBuffer errName = new StringBuffer(persDoc.getName());
543           errName.replace(
544             persDoc.getName().lastIndexOf("."),
545             persDoc.getName().length(),
546             ".err");
547           Out.prln("<H2>" +
548                    "<a href=\"err/" + errName.toString() + "\">"
549                    + persDoc.getName() + "</a>" + "</H2>");
550         } else
551           Out.prln("<H2>" + persDoc.getName() + "</H2>");
552 
553         File cleanDocFile = new File(fileDir, persDoc.getName());
554         //try reading the original document from clean
555         if (! cleanDocFile.exists()) {
556           Out.prln("Warning: Cannot find original document " +
557                    persDoc.getName() + " in " + fileDir);
558         } else {
559           FeatureMap params = Factory.newFeatureMap();
560           params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocFile.toURL());
561           params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
562 
563           // create the document
564           cleanDoc = (Document) Factory.createResource(
565                                   "gate.corpora.DocumentImpl", params, hparams);
566           cleanDoc.setName(persDoc.getName());
567         }
568 
569         //try finding the marked document
570         StringBuffer docName = new StringBuffer(persDoc.getName());
571         if (! isMarkedDS) {
572           docName.replace(
573             persDoc.getName().lastIndexOf("."),
574             docName.length(),
575             ".xml");
576           File markedDocFile = new File(markedDir, docName.toString());
577           if (! processMarked || ! markedDocFile.exists()) {
578             Out.prln("Warning: Cannot find human-annotated document " +
579                      markedDocFile + " in " + markedDir);
580           } else {
581             FeatureMap params = Factory.newFeatureMap();
582             params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL());
583             params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
584 
585             // create the document
586             markedDoc = (Document) Factory.createResource(
587                                      "gate.corpora.DocumentImpl", params, hparams);
588             markedDoc.setName(persDoc.getName());
589           }
590         } else {
591           //open marked from a DS
592           //open the data store
593           DataStore sds1 = Factory.openDataStore
594                           ("gate.persist.SerialDataStore",
595                            markedDir.toURL().toExternalForm());
596 
597           List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
598           boolean found = false;
599           int k = 0;
600           //search for the marked doc with the same name
601           while (k < lrIDs1.size() && !found) {
602             String docID1 = (String) lrIDs1.get(k);
603 
604             //read the stored document
605             FeatureMap features1 = Factory.newFeatureMap();
606             features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
607             features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
608             Document tempDoc = (Document) Factory.createResource(
609                                         "gate.corpora.DocumentImpl",
610                                         features1, hparams);
611             //check whether this is our doc
612             if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")).
613                  endsWith(persDoc.getName())) {
614               found = true;
615               markedDoc = tempDoc;
616             } else k++;
617           }
618         }
619 
620         evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
621         if (persDoc != null)
622           Factory.deleteResource(persDoc);
623         if (cleanDoc != null)
624           Factory.deleteResource(cleanDoc);
625         if (markedDoc != null)
626           Factory.deleteResource(markedDoc);
627 
628       }//for loop through saved docs
629       sds.close();
630     } catch (java.net.MalformedURLException ex) {
631       throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
632     } catch (PersistenceException ex1) {
633       throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
634     } catch (ResourceInstantiationException ex2) {
635       throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
636     }
637 
638   }//evaluateCorpus
639 
640   protected void evaluateMarkedStored(File markedDir, File storedDir, File errDir) {
641     Document persDoc = null;
642     Document cleanDoc = null;
643     Document markedDoc = null;
644 
645     //open the datastore and process each document
646     try {
647       //open the data store
648       DataStore sds = Factory.openDataStore
649                       ("gate.persist.SerialDataStore",
650                        storedDir.toURL().toExternalForm());
651 
652       List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
653       for (int i=0; i < lrIDs.size(); i++) {
654         String docID = (String) lrIDs.get(i);
655 
656         //read the stored document
657         FeatureMap features = Factory.newFeatureMap();
658         features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
659         features.put(DataStore.LR_ID_FEATURE_NAME, docID);
660 
661         FeatureMap hparams = Factory.newFeatureMap();
662 //        Gate.setHiddenAttribute(hparams, true);
663 
664 
665         persDoc = (Document) Factory.createResource(
666                                     "gate.corpora.DocumentImpl",
667                                     features, hparams);
668 
669         if(isMoreInfoMode) {
670           StringBuffer errName = new StringBuffer(persDoc.getName());
671           errName.replace(
672             persDoc.getName().lastIndexOf("."),
673             persDoc.getName().length(),
674             ".err");
675           Out.prln("<H2>" +
676                    "<a href=\"err/" + errName.toString() + "\">"
677                    + persDoc.getName() + "</a>" + "</H2>");
678         } else
679           Out.prln("<H2>" + persDoc.getName() + "</H2>");
680 
681         if (! this.isMarkedDS) { //try finding the marked document as file
682           StringBuffer docName = new StringBuffer(persDoc.getName());
683           docName.replace(
684             persDoc.getName().lastIndexOf("."),
685             docName.length(),
686             ".xml");
687           File markedDocFile = new File(markedDir, docName.toString());
688           if (! markedDocFile.exists()) {
689             Out.prln("Warning: Cannot find human-annotated document " +
690                      markedDocFile + " in " + markedDir);
691           } else {
692             FeatureMap params = Factory.newFeatureMap();
693             params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL());
694             params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
695 
696             // create the document
697             markedDoc = (Document) Factory.createResource(
698                                      "gate.corpora.DocumentImpl", params, hparams);
699             markedDoc.setName(persDoc.getName());
700           }//find marked as file
701         } else {
702           try {
703             //open marked from a DS
704             //open the data store
705             DataStore sds1 = Factory.openDataStore
706                             ("gate.persist.SerialDataStore",
707                              markedDir.toURL().toExternalForm());
708 
709             List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
710             boolean found = false;
711             int k = 0;
712             //search for the marked doc with the same name
713             while (k < lrIDs1.size() && !found) {
714               String docID1 = (String) lrIDs1.get(k);
715 
716               //read the stored document
717               FeatureMap features1 = Factory.newFeatureMap();
718               features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
719               features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
720               Document tempDoc = (Document) Factory.createResource(
721                                           "gate.corpora.DocumentImpl",
722                                           features1, hparams);
723               //check whether this is our doc
724               if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")).
725                    endsWith(persDoc.getName())) {
726                 found = true;
727                 markedDoc = tempDoc;
728               } else k++;
729             }
730           } catch (java.net.MalformedURLException ex) {
731             Out.prln("Error finding marked directory " + markedDir.getAbsolutePath());
732           } catch (gate.persist.PersistenceException ex1) {
733             Out.prln("Error opening marked as a datastore (-marked_ds specified)");
734           } catch (gate.creole.ResourceInstantiationException ex2) {
735             Out.prln("Error opening marked as a datastore (-marked_ds specified)");
736           }
737         }
738 
739         evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
740         if (persDoc != null)
741           Factory.deleteResource(persDoc);
742         if (markedDoc != null)
743           Factory.deleteResource(markedDoc);
744 
745       }//for loop through saved docs
746       sds.close();
747 
748     } catch (java.net.MalformedURLException ex) {
749       throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
750     } catch (PersistenceException ex1) {
751       throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
752     } catch (ResourceInstantiationException ex2) {
753       throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
754     }
755 
756   }//evaluateMarkedStored
757 
758 
759   protected void evaluateMarkedClean(File markedDir, File cleanDir, File errDir) {
760     Document persDoc = null;
761     Document cleanDoc = null;
762     Document markedDoc = null;
763 
764     File[] cleanDocs = cleanDir.listFiles();
765     for (int i = 0; i< cleanDocs.length; i++) {
766       if (!cleanDocs[i].isFile())
767         continue;
768 
769       //try reading the original document from clean
770       FeatureMap params = Factory.newFeatureMap();
771       try {
772         params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocs[i].toURL());
773       } catch (java.net.MalformedURLException ex) {
774         Out.prln("Cannot create document from file: " +
775           cleanDocs[i].getAbsolutePath());
776         continue;
777       }
778       params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
779 
780       FeatureMap hparams = Factory.newFeatureMap();
781 //      Gate.setHiddenAttribute(hparams, true);
782 
783       // create the document
784       try {
785         cleanDoc = (Document) Factory.createResource(
786                               "gate.corpora.DocumentImpl", params, hparams, cleanDocs[i].getName());
787       } catch (gate.creole.ResourceInstantiationException ex) {
788         Out.prln("Cannot create document from file: " +
789           cleanDocs[i].getAbsolutePath());
790         continue;
791       }
792 
793       if(isMoreInfoMode) {
794         StringBuffer errName = new StringBuffer(cleanDocs[i].getName());
795         errName.replace(
796           cleanDocs[i].getName().lastIndexOf("."),
797           cleanDocs[i].getName().length(),
798           ".err");
799         Out.prln("<H2>" +
800                  "<a href=\"err/" + errName.toString() + "\">"
801                  + cleanDocs[i].getName() + "</a>" + "</H2>");
802       } else
803         Out.prln("<H2>" + cleanDocs[i].getName() + "</H2>");
804 
805       //try finding the marked document
806       if (! isMarkedDS) {
807         StringBuffer docName = new StringBuffer(cleanDoc.getName());
808         docName.replace(
809           cleanDoc.getName().lastIndexOf("."),
810           docName.length(),
811           ".xml");
812         File markedDocFile = new File(markedDir, docName.toString());
813         if (! markedDocFile.exists()) {
814           Out.prln("Warning: Cannot find human-annotated document " +
815                    markedDocFile + " in " + markedDir);
816           continue;
817         } else {
818           params = Factory.newFeatureMap();
819           try {
820             params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL());
821           } catch (java.net.MalformedURLException ex) {
822             Out.prln("Cannot create document from file: " +
823               markedDocFile.getAbsolutePath());
824             continue;
825           }
826           params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
827 
828           // create the document
829           try {
830             markedDoc = (Document) Factory.createResource(
831                                    "gate.corpora.DocumentImpl", params,
832                                    hparams, cleanDoc.getName());
833           } catch (gate.creole.ResourceInstantiationException ex) {
834             Out.prln("Cannot create document from file: " +
835               markedDocFile.getAbsolutePath());
836             continue;
837           }
838 
839         }//if markedDoc exists
840       } else {
841         try {
842           //open marked from a DS
843           //open the data store
844           DataStore sds1 = Factory.openDataStore
845                           ("gate.persist.SerialDataStore",
846                            markedDir.toURL().toExternalForm());
847 
848           List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
849           boolean found = false;
850           int k = 0;
851           //search for the marked doc with the same name
852           while (k < lrIDs1.size() && !found) {
853             String docID1 = (String) lrIDs1.get(k);
854 
855             //read the stored document
856             FeatureMap features1 = Factory.newFeatureMap();
857             features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
858             features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
859             Document tempDoc = (Document) Factory.createResource(
860                                         "gate.corpora.DocumentImpl",
861                                         features1, hparams);
862             //check whether this is our doc
863             if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")).
864                  endsWith(cleanDoc.getName())) {
865               found = true;
866               markedDoc = tempDoc;
867             } else k++;
868           }
869         } catch (java.net.MalformedURLException ex) {
870           Out.prln("Error finding marked directory " + markedDir.getAbsolutePath());
871         } catch (gate.persist.PersistenceException ex1) {
872           Out.prln("Error opening marked as a datastore (-marked_ds specified)");
873         } catch (gate.creole.ResourceInstantiationException ex2) {
874           Out.prln("Error opening marked as a datastore (-marked_ds specified)");
875         }
876       } //if using a DS for marked
877 
878       try {
879         evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
880       } catch (gate.creole.ResourceInstantiationException ex) {
881         ex.printStackTrace();
882         Out.prln("Evaluate failed on document: " + cleanDoc.getName());
883       }
884       if (persDoc != null)
885         Factory.deleteResource(persDoc);
886       if (cleanDoc != null)
887         Factory.deleteResource(cleanDoc);
888       if (markedDoc != null)
889         Factory.deleteResource(markedDoc);
890 
891     }//for loop through clean docs
892 
893 
894   }//evaluateMarkedClean
895 
896   protected void processDocument(Document doc) {
897     try {
898       if (application instanceof CorpusController) {
899         Corpus tempCorpus = Factory.newCorpus("temp");
900         tempCorpus.add(doc);
901         ((CorpusController)application).setCorpus(tempCorpus);
902         application.execute();
903         Factory.deleteResource(tempCorpus);
904         tempCorpus = null;
905       } else {
906         Iterator iter = application.getPRs().iterator();
907         while (iter.hasNext())
908           ((ProcessingResource) iter.next()).setParameterValue("document", doc);
909         application.execute();
910       }
911     } catch (ResourceInstantiationException ex) {
912       throw new RuntimeException("Error executing application: "
913                                     + ex.getMessage());
914     } catch (ExecutionException ex) {
915       throw new RuntimeException("Error executing application: "
916                                     + ex.getMessage());
917     }
918   }
919 
920   protected void evaluateDocuments(Document persDoc,
921                     Document cleanDoc, Document markedDoc,
922                     File errDir)
923                         throws ResourceInstantiationException {
924     if (cleanDoc == null && markedDoc == null)
925       return;
926 
927     //we've got no types to compare
928     if (annotTypes == null || annotTypes.isEmpty())
929       return;
930 
931     if (cleanDoc != null && !isMarkedStored) {
932 
933       processDocument(cleanDoc);
934 
935 
936       int wordCount = countWords(cleanDoc);
937       if(wordCount == 0)
938         Out.prln("<BR>No Token annotations to count words in the document.");
939       else
940         Out.prln("<BR>Word count: " + wordCount);
941       corpusWordCount += wordCount;
942 
943       if(!isMarkedClean)
944         evaluateAllThree(persDoc, cleanDoc, markedDoc, errDir);
945       else
946         evaluateTwoDocs(markedDoc, cleanDoc, errDir);
947 
948     } else
949       evaluateTwoDocs(markedDoc, persDoc, errDir);
950 
951   }
952 
953   /**
954    * Count all Token.kind=word annotations in the document
955    */
956   protected int countWords(Document annotDoc) {
957     int count = 0;
958 
959     if (annotDoc == null) return 0;
960     // check for Token in outputSetName
961     AnnotationSet tokens = annotDoc.getAnnotations(outputSetName).get("Token");
962     if (tokens == null) return 0;
963 
964     Iterator it = tokens.iterator();
965     Annotation currAnnotation;
966     while (it.hasNext()) {
967       currAnnotation = (Annotation) it.next();
968       Object feature = currAnnotation.getFeatures().get("kind");
969       if(feature != null && "word".equalsIgnoreCase((String)feature)) ++count;
970     } // while
971 
972     return count;
973   }
974 
975   protected void evaluateAllThree(Document persDoc,
976                                   Document cleanDoc, Document markedDoc,
977                                   File errDir)
978                                   throws ResourceInstantiationException {
979     //first start the table and its header
980     printTableHeader();
981 
982     // store annotation diff in .err file
983     Writer errWriter = null;
984     if (isMoreInfoMode && errDir != null) {
985       StringBuffer docName = new StringBuffer(cleanDoc.getName());
986       docName.replace(
987           cleanDoc.getName().lastIndexOf("."),
988           docName.length(),
989           ".err");
990       File errFile = new File(errDir, docName.toString());
991       String encoding = ((gate.corpora.DocumentImpl)cleanDoc).getEncoding();
992       try {
993         errWriter = new FileWriter(errFile, false);
994         /*
995         if(encoding == null) {
996           errWriter = new OutputStreamWriter(
997               new FileOutputStream(errFile, false));
998         } else {
999           errWriter = new OutputStreamWriter(
1000              new FileOutputStream(errFile, false), encoding);
1001        }*/
1002      }
1003      catch (Exception ex) {
1004        Out.prln("Exception when creating the error file " + errFile + ": "
1005                 + ex.getMessage());
1006        errWriter = null;
1007      }
1008    }
1009
1010    for (int jj= 0; jj< annotTypes.size(); jj++) {
1011      String annotType = (String) annotTypes.get(jj);
1012
1013      AnnotationDiffer annotDiffer = measureDocs(markedDoc, cleanDoc, annotType);
1014      //we don't have this annotation type in this document
1015      if (annotDiffer == null)
1016        continue;
1017
1018      //increase the number of processed documents
1019      docNumber++;
1020      //add precison and recall to the sums
1021      updateStatistics(annotDiffer, annotType);
1022
1023      AnnotationDiffer annotDiffer1 =
1024        measureDocs(markedDoc, persDoc, annotType);
1025
1026      Out.prln("<TR>");
1027
1028      if(isMoreInfoMode && annotDiffer1 != null
1029         && (annotDiffer1.getPrecisionAverage() != annotDiffer.getPrecisionAverage()
1030         || annotDiffer1.getRecallAverage() != annotDiffer.getRecallAverage())
1031         )
1032        Out.prln("<TD> " + annotType + "_new"+ "</TD>");
1033      else
1034        Out.prln("<TD> " + annotType + "</TD>");
1035
1036      if (isMoreInfoMode) {
1037        if(annotDiffer1 != null) updateStatisticsProc(annotDiffer1, annotType);
1038
1039        Out.prln("<TD>" + annotDiffer.getCorrectMatches() + "</TD>");
1040        Out.prln("<TD>" + annotDiffer.getPartiallyCorrectMatches() + "</TD>");
1041        Out.prln("<TD>" + annotDiffer.getMissing() + "</TD>");
1042        Out.prln("<TD>" + annotDiffer.getSpurious() + "</TD>");
1043      }
1044
1045      Out.prln("<TD>");
1046
1047      //check the precision first
1048      if (annotDiffer1 != null) {
1049
1050        if (annotDiffer1.getPrecisionAverage()
1051              < annotDiffer.getPrecisionAverage()) {
1052            Out.prln("<P><Font color=blue> ");
1053            Out.prln(annotDiffer.getPrecisionAverage());
1054
1055            if(!isMoreInfoMode) {
1056              Out.pr("<BR>Precision increase on human-marked from ");
1057              Out.pr(annotDiffer1.getPrecisionAverage() + " to ");
1058              Out.prln(annotDiffer.getPrecisionAverage());
1059            }
1060            Out.prln(" </Font></P>");
1061          }
1062        else if (annotDiffer1.getPrecisionAverage()
1063               > annotDiffer.getPrecisionAverage()) {
1064          Out.prln("<P><Font color=red> ");
1065          Out.prln(annotDiffer.getPrecisionAverage());
1066
1067          if(!isMoreInfoMode) {
1068            Out.pr("<BR>Precision decrease on human-marked from ");
1069            Out.pr(annotDiffer1.getPrecisionAverage() + " to ");
1070            Out.prln(annotDiffer.getPrecisionAverage());
1071          }
1072          Out.prln(" </Font></P>");
1073        }
1074        else
1075          Out.prln("<P> " + (double) annotDiffer.getPrecisionAverage() + " </P>");
1076      }
1077      else
1078        Out.prln("<P> " + annotDiffer.getPrecisionAverage() + " </P>");
1079
1080      Out.prln("</TD>");
1081
1082      Out.prln("<TD>");
1083
1084      //check the recall now
1085      if (annotDiffer1 != null) {
1086
1087        if (annotDiffer1.getRecallAverage() < annotDiffer.getRecallAverage()) {
1088          Out.prln("<P><Font color=blue> ");
1089          Out.prln(annotDiffer.getRecallAverage());
1090
1091          if(!isMoreInfoMode) {
1092            Out.pr("<BR>Recall increase on human-marked from ");
1093            Out.pr(annotDiffer1.getRecallAverage() + " to ");
1094            Out.prln(annotDiffer.getRecallAverage());
1095          }
1096          Out.prln(" </Font></P>");
1097        }
1098        else if (annotDiffer1.getRecallAverage() > annotDiffer.getRecallAverage()) {
1099          Out.prln("<P><Font color=red> ");
1100          Out.prln(annotDiffer.getRecallAverage());
1101
1102          if(!isMoreInfoMode) {
1103            Out.pr("<BR>Recall decrease on human-marked from ");
1104            Out.pr(annotDiffer1.getRecallAverage() + " to ");
1105            Out.prln(annotDiffer.getRecallAverage());
1106          }
1107          Out.prln(" </Font></P>");
1108        }
1109        else
1110          Out.prln("<P> " + annotDiffer.getRecallAverage() + " </P>");
1111      } else
1112        Out.prln("<P> " + annotDiffer.getRecallAverage() + " </P>");
1113
1114
1115      Out.prln("</TD>");
1116
1117      //check the recall now
1118      if ( isVerboseMode ) {
1119        Out.prln("<TD>");
1120        if (annotDiffer.getRecallAverage() < threshold) {
1121          printAnnotations(annotDiffer, markedDoc, cleanDoc);
1122        }
1123        else {
1124          Out.prln("&nbsp;");
1125        }
1126        Out.prln("</TD>");
1127      }
1128
1129      Out.prln("</TR>");
1130
1131      // show one more table line for processed document
1132      if(isMoreInfoMode && annotDiffer1 != null
1133         && (annotDiffer1.getPrecisionAverage() != annotDiffer.getPrecisionAverage()
1134         || annotDiffer1.getRecallAverage() != annotDiffer.getRecallAverage())
1135         ) {
1136
1137        Out.prln("<TR>");
1138        Out.prln("<TD> " + annotType + "_old" + "</TD>");
1139
1140        Out.prln("<TD>" + annotDiffer1.getCorrectMatches() + "</TD>");
1141        Out.prln("<TD>" + annotDiffer1.getPartiallyCorrectMatches() + "</TD>");
1142        Out.prln("<TD>" + annotDiffer1.getMissing() + "</TD>");
1143        Out.prln("<TD>" + annotDiffer1.getSpurious() + "</TD>");
1144
1145        Out.prln("<TD>");
1146        if (annotDiffer1.getPrecisionAverage() < annotDiffer.getPrecisionAverage())
1147
1148          Out.prln("<P><Font color=blue> "  + annotDiffer1.getPrecisionAverage()
1149                + "</Font></P>");
1150        else if (annotDiffer1.getPrecisionAverage() > annotDiffer.getPrecisionAverage())
1151          Out.prln(
1152             "<P><Font color=red> " + annotDiffer1.getPrecisionAverage()
1153             + " </Font></P>");
1154        else
1155          Out.prln(annotDiffer1.getPrecisionAverage());
1156
1157        Out.prln("</TD>");
1158
1159        Out.prln("<TD>");
1160        if (annotDiffer1.getRecallAverage() < annotDiffer.getRecallAverage())
1161          Out.prln("<P><Font color=blue> " + annotDiffer1.getRecallAverage()
1162                   + " </Font></P>");
1163        else if (annotDiffer1.getRecallAverage() > annotDiffer.getRecallAverage())
1164          Out.prln("<P><Font color=red> " + annotDiffer1.getRecallAverage()
1165                    + " </Font></P>");
1166        else
1167           Out.prln(annotDiffer1.getRecallAverage());
1168
1169        Out.prln("</TD>");
1170
1171        //check the recall now
1172        if ( isVerboseMode ) {
1173          // create error file and start writing
1174
1175          Out.prln("<TD>");
1176          if (annotDiffer.getRecallAverage() < threshold) {
1177            printAnnotations(annotDiffer, markedDoc, cleanDoc);
1178          }
1179          else {
1180            Out.prln("&nbsp;");
1181          }
1182          Out.prln("</TD>");
1183        }
1184        Out.prln("</TR>");
1185      } // if(isMoreInfoMode && annotDiff1 != null)
1186
1187      if (isMoreInfoMode && errDir != null)
1188        storeAnnotations(annotType, annotDiffer, markedDoc, cleanDoc, errWriter);
1189    }//for loop through annotation types
1190    Out.prln("</TABLE>");
1191
1192    try {
1193      if(errWriter != null)
1194        errWriter.close();
1195    }
1196    catch (Exception ex) {
1197      Out.prln("Exception on close of error file " + errWriter + ": "
1198               + ex.getMessage());
1199    }
1200  }//evaluateAllThree
1201
1202  protected void evaluateTwoDocs(Document keyDoc, Document respDoc,
1203                                 File errDir)
1204        throws ResourceInstantiationException {
1205
1206    //first start the table and its header
1207    printTableHeader();
1208
1209    // store annotation diff in .err file
1210    Writer errWriter = null;
1211    if (isMoreInfoMode && errDir != null) {
1212      StringBuffer docName = new StringBuffer(keyDoc.getName());
1213      docName.replace(
1214          keyDoc.getName().lastIndexOf("."),
1215          docName.length(),
1216          ".err");
1217      File errFile = new File(errDir, docName.toString());
1218      String encoding = ((gate.corpora.DocumentImpl)keyDoc).getEncoding();
1219      try {
1220        errWriter = new FileWriter(errFile, false);
1221        /*
1222        if(encoding == null) {
1223          errWriter = new OutputStreamWriter(
1224              new FileOutputStream(errFile, false));
1225        } else {
1226          errWriter = new OutputStreamWriter(
1227              new FileOutputStream(errFile, false), encoding);
1228        }*/
1229      }
1230      catch (Exception ex) {
1231        Out.prln("Exception when creating the error file " + errFile + ": "
1232                 + ex.getMessage());
1233        errWriter = null;
1234      }
1235    }
1236
1237    for (int jj= 0; jj< annotTypes.size(); jj++) {
1238      String annotType = (String) annotTypes.get(jj);
1239
1240      AnnotationDiffer annotDiff = measureDocs(keyDoc, respDoc, annotType);
1241      //we don't have this annotation type in this document
1242      if (annotDiff == null)
1243        continue;
1244
1245      //increase the number of processed documents
1246      docNumber++;
1247      //add precison and recall to the sums
1248      updateStatistics(annotDiff, annotType);
1249
1250      Out.prln("<TR>");
1251      Out.prln("<TD>" + annotType + "</TD>");
1252
1253      if(isMoreInfoMode) {
1254        Out.prln("<TD>" + annotDiff.getCorrectMatches() + "</TD>");
1255        Out.prln("<TD>" + annotDiff.getPartiallyCorrectMatches() + "</TD>");
1256        Out.prln("<TD>" + annotDiff.getMissing() + "</TD>");
1257        Out.prln("<TD>" + annotDiff.getSpurious() + "</TD>");
1258      }
1259
1260      Out.prln("<TD>" + annotDiff.getPrecisionAverage() + "</TD>");
1261      Out.prln("<TD>" + annotDiff.getRecallAverage() + "</TD>");
1262      //check the recall now
1263      if ( isVerboseMode ) {
1264        Out.prln("<TD>");
1265        if (annotDiff.getRecallAverage() < threshold) {
1266          printAnnotations(annotDiff, keyDoc, respDoc);
1267        }
1268        else {
1269          Out.prln("&nbsp;");
1270        }
1271        Out.prln("</TD>");
1272      }
1273      Out.prln("</TR>");
1274
1275      if (isMoreInfoMode && errDir != null)
1276        storeAnnotations(annotType, annotDiff, keyDoc, respDoc, errWriter);
1277    }//for loop through annotation types
1278    Out.prln("</TABLE>");
1279
1280    try {
1281      if(errWriter != null)
1282        errWriter.close();
1283    }
1284    catch (Exception ex) {
1285      Out.prln("Exception on close of error file " + errWriter + ": "
1286               + ex.getMessage());
1287    }
1288  }//evaluateTwoDocs
1289
1290  protected void printTableHeader() {
1291    Out.prln("<TABLE BORDER=1");
1292    Out.pr("<TR> <TD><B>Annotation Type</B></TD> ");
1293
1294    if(isMoreInfoMode)
1295     Out.pr("<TD><B>Correct</B></TD> <TD><B>Partially Correct</B></TD> "
1296             + "<TD><B>Missing</B></TD> <TD><B>Spurious<B></TD>");
1297
1298    Out.pr("<TD><B>Precision</B></TD> <TD><B>Recall</B></TD>");
1299
1300    if (isVerboseMode)
1301      Out.pr("<TD><B>Annotations</B></TD>");
1302
1303    Out.prln("</TR>");
1304  }
1305
1306  protected void updateStatistics(AnnotationDiffer annotDiffer, String annotType){
1307    double precisionAverage = ((double)((double)annotDiffer.getPrecisionLenient() + annotDiffer.getPrecisionStrict()) /
1308(double)(2.0));
1309    if(precisionAverage != Double.NaN)
1310      precisionSum += precisionAverage;
1311
1312    double recallAverage = ((double)(annotDiffer.getRecallLenient() + annotDiffer.getRecallStrict()) / (double) (2.0));
1313    if(recallAverage != Double.NaN)
1314      recallSum += recallAverage;
1315
1316    double fMeasureAverage = ((double) (annotDiffer.getFMeasureLenient(1.0) + annotDiffer.getFMeasureStrict(1.0)) /
1317(double) (2.0));
1318    if(fMeasureAverage != Double.NaN)
1319      fMeasureSum += fMeasureAverage;
1320
1321    Double oldPrecision = (Double) precisionByType.get(annotType);
1322    if (oldPrecision == null)
1323        precisionByType.put(annotType, new Double(precisionAverage));
1324    else
1325        precisionByType.put(annotType, new Double(oldPrecision.doubleValue() + precisionAverage));
1326
1327    Integer precCount = (Integer) prCountByType.get(annotType);
1328    if (precCount == null)
1329        prCountByType.put(annotType, new Integer(1));
1330    else
1331       prCountByType.put(annotType, new Integer(precCount.intValue() + 1));
1332
1333
1334    Double oldFMeasure = (Double) fMeasureByType.get(annotType);
1335    if (oldFMeasure == null)
1336       fMeasureByType.put(annotType, new Double(fMeasureAverage));
1337    else
1338       fMeasureByType.put(annotType, new Double(oldFMeasure.doubleValue() + fMeasureAverage));
1339
1340    Integer fCount = (Integer) fMeasureCountByType.get(annotType);
1341    if (fCount == null)
1342       fMeasureCountByType.put(annotType, new Integer(1));
1343    else
1344       fMeasureCountByType.put(annotType, new Integer(fCount.intValue() + 1));
1345
1346    Double oldRecall = (Double) recallByType.get(annotType);
1347    if (oldRecall == null)
1348       recallByType.put(annotType, new Double(recallAverage));
1349    else
1350       recallByType.put(annotType, new Double(oldRecall.doubleValue() + recallAverage));
1351
1352    Integer recCount = (Integer) recCountByType.get(annotType);
1353    if (recCount == null)
1354       recCountByType.put(annotType, new Integer(1));
1355    else
1356       recCountByType.put(annotType, new Integer(recCount.intValue() + 1));
1357
1358    //Update the missing, spurious, correct, and partial counts
1359    Long oldMissingNo = (Long) missingByType.get(annotType);
1360    if (oldMissingNo == null)
1361       missingByType.put(annotType, new Long(annotDiffer.getMissing()));
1362    else
1363       missingByType.put(annotType, new Long(oldMissingNo.longValue() + annotDiffer.getMissing()));
1364
1365    Long oldCorrectNo = (Long) correctByType.get(annotType);
1366    if (oldCorrectNo == null)
1367       correctByType.put(annotType, new Long(annotDiffer.getCorrectMatches()));
1368    else
1369       correctByType.put(annotType, new Long(oldCorrectNo.longValue() + annotDiffer.getCorrectMatches()));
1370
1371    Long oldPartialNo = (Long) partialByType.get(annotType);
1372    if (oldPartialNo == null)
1373       partialByType.put(annotType, new Long(annotDiffer.getPartiallyCorrectMatches()));
1374    else
1375       partialByType.put(annotType, new Long(oldPartialNo.longValue() + annotDiffer.getPartiallyCorrectMatches()));
1376
1377    Long oldSpuriousNo = (Long) spurByType.get(annotType);
1378    if (oldSpuriousNo == null)
1379       spurByType.put(annotType, new Long(annotDiffer.getSpurious()));
1380    else
1381       spurByType.put(annotType, new Long(oldSpuriousNo.longValue() + annotDiffer.getSpurious()));
1382  }
1383
1384  /**
1385   * Update statistics for processed documents
1386   * The same procedure as updateStatistics with different hashTables
1387   */
1388  protected void updateStatisticsProc(AnnotationDiffer annotDiffer, String annotType){
1389    hasProcessed = true;
1390    double precisionAverage = ((double)(annotDiffer.getPrecisionLenient() + annotDiffer.getPrecisionStrict()) /
1391(double)(2.0));
1392    if(precisionAverage != Double.NaN)
1393      proc_precisionSum += precisionAverage;
1394
1395    double recallAverage = ((double)(annotDiffer.getRecallLenient() + annotDiffer.getRecallStrict()) / (double) (2.0));
1396    if(recallAverage != Double.NaN)
1397      proc_recallSum += recallAverage;
1398
1399    double fMeasureAverage = ((double) (annotDiffer.getFMeasureLenient(1.0) + annotDiffer.getFMeasureStrict(1.0)) /
1400(double) (2.0));
1401    if(fMeasureAverage != Double.NaN)
1402      proc_fMeasureSum += fMeasureAverage;
1403
1404    Double oldPrecision = (Double) proc_precisionByType.get(annotType);
1405    if (oldPrecision == null)
1406        proc_precisionByType.put(annotType, new Double(precisionAverage));
1407      else
1408        proc_precisionByType.put(annotType,
1409                            new Double(oldPrecision.doubleValue() +
1410                                       precisionAverage));
1411      Integer precCount = (Integer) proc_prCountByType.get(annotType);
1412      if (precCount == null)
1413        proc_prCountByType.put(annotType, new Integer(1));
1414      else
1415        proc_prCountByType.put(annotType, new Integer(precCount.intValue() + 1));
1416
1417
1418      Double oldFMeasure = (Double) proc_fMeasureByType.get(annotType);
1419      if (oldFMeasure == null)
1420        proc_fMeasureByType.put(annotType,
1421                         new Double(fMeasureAverage));
1422      else
1423        proc_fMeasureByType.put(annotType,
1424                         new Double(oldFMeasure.doubleValue() +
1425                                    fMeasureAverage));
1426      Integer fCount = (Integer) proc_fMeasureCountByType.get(annotType);
1427      if (fCount == null)
1428        proc_fMeasureCountByType.put(annotType, new Integer(1));
1429      else
1430        proc_fMeasureCountByType.put(annotType, new Integer(fCount.intValue() + 1));
1431
1432      Double oldRecall = (Double) proc_recallByType.get(annotType);
1433      if (oldRecall == null)
1434        proc_recallByType.put(annotType,
1435                            new Double(recallAverage));
1436      else
1437        proc_recallByType.put(annotType,
1438                            new Double(oldRecall.doubleValue() +
1439                                       recallAverage));
1440      Integer recCount = (Integer) proc_recCountByType.get(annotType);
1441      if (recCount == null)
1442        proc_recCountByType.put(annotType, new Integer(1));
1443      else
1444        proc_recCountByType.put(annotType, new Integer(recCount.intValue() + 1));
1445
1446      //Update the missing, spurious, correct, and partial counts
1447      Long oldMissingNo = (Long) proc_missingByType.get(annotType);
1448      if (oldMissingNo == null)
1449        proc_missingByType.put(annotType, new Long(annotDiffer.getMissing()));
1450      else
1451        proc_missingByType.put(annotType,
1452                        new Long(oldMissingNo.longValue() +
1453                                  annotDiffer.getMissing()));
1454
1455      Long oldCorrectNo = (Long) proc_correctByType.get(annotType);
1456      if (oldCorrectNo == null)
1457        proc_correctByType.put(annotType, new Long(annotDiffer.getCorrectMatches()));
1458      else
1459        proc_correctByType.put(annotType,
1460                        new Long(oldCorrectNo.longValue() +
1461                                  annotDiffer.getCorrectMatches()));
1462
1463      Long oldPartialNo = (Long) proc_partialByType.get(annotType);
1464      if (oldPartialNo == null)
1465        proc_partialByType.put(annotType, new Long(annotDiffer.getPartiallyCorrectMatches()));
1466      else
1467        proc_partialByType.put(annotType,
1468                        new Long(oldPartialNo.longValue() +
1469                                  annotDiffer.getPartiallyCorrectMatches()));
1470
1471      Long oldSpuriousNo = (Long) proc_spurByType.get(annotType);
1472      if (oldSpuriousNo == null)
1473        proc_spurByType.put(annotType, new Long(annotDiffer.getSpurious()));
1474      else
1475        proc_spurByType.put(annotType,
1476                        new Long(oldSpuriousNo.longValue() +
1477                                  annotDiffer.getSpurious()));
1478  }
1479
1480  public void printStatistics() {
1481
1482    Out.prln("<H2> Statistics </H2>");
1483
1484/*
1485    Out.prln("<H3> Precision </H3>");
1486    if (precisionByType != null && !precisionByType.isEmpty()) {
1487      Iterator iter = precisionByType.keySet().iterator();
1488      while (iter.hasNext()) {
1489        String annotType = (String) iter.next();
1490        Out.prln(annotType + ": "
1491          + ((Double)precisionByType.get(annotType)).doubleValue()
1492              /
1493              ((Integer)prCountByType.get(annotType)).intValue()
1494          + "<P>");
1495      }//while
1496    }
1497    Out.prln("Overall precision: " + getPrecisionAverage() + "<P>");
1498
1499    Out.prln("<H3> Recall </H3>");
1500    if (recallByType != null && !recallByType.isEmpty()) {
1501      Iterator iter = recallByType.keySet().iterator();
1502      while (iter.hasNext()) {
1503        String annotType = (String) iter.next();
1504        Out.prln(annotType + ": "
1505          + ((Double)recallByType.get(annotType)).doubleValue()
1506              /
1507              ((Integer)recCountByType.get(annotType)).intValue()
1508          + "<P>");
1509      }//while
1510    }
1511
1512    Out.prln("Overall recall: " + getRecallAverage()
1513             + "<P>");
1514*/
1515    if (annotTypes == null) {
1516      Out.prln("No types given for evaluation, cannot obtain precision/recall");
1517      return;
1518    }
1519    Out.prln("<table border=1>");
1520    Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Correct</B></TD>" +
1521              "<TD><B>Partially Correct</B></TD> <TD><B>Missing</B></TD>" +
1522              "<TD><B>Spurious</B></TD> <TD><B>Precision</B></TD>" +
1523              "<TD><B>Recall</B></TD> <TD><B>F-Measure</B></TD> </TR>");
1524    String annotType;
1525    for (int i = 0; i < annotTypes.size(); i++) {
1526      annotType = (String) annotTypes.get(i);
1527      printStatsForType(annotType);
1528    }//for
1529    Out.prln("</table>");
1530  } // updateStatisticsProc
1531
1532  protected void printStatsForType(String annotType){
1533    long correct = (correctByType.get(annotType) == null)? 0 :
1534                      ((Long)correctByType.get(annotType)).longValue();
1535    long partial = (partialByType.get(annotType) == null)? 0 :
1536                      ((Long)partialByType.get(annotType)).longValue();
1537    long spurious = (spurByType.get(annotType) == null)? 0 :
1538                      ((Long)spurByType.get(annotType)).longValue();
1539    long missing = (missingByType.get(annotType) == null)? 0:
1540                      ((Long)missingByType.get(annotType)).longValue();
1541    long actual = correct + partial + spurious;
1542    long possible = correct + partial + missing;
1543    //precision strict is correct/actual
1544    //precision is (correct + 0.5 * partially correct)/actual
1545
1546    double precision = (correct + 0.5 * partial) / actual;
1547    //recall strict is correct/possible
1548    double recall = (correct + 0.5*partial)/possible;
1549    //F-measure = ( (beta*beta + 1)*P*R ) / ((beta*beta*P) + R)
1550    double fmeasure =
1551      ((beta*beta + 1)*precision*recall)
1552      /
1553      ((beta*beta*precision) + recall);
1554
1555
1556    long proc_correct=0;
1557    long proc_partial=0;
1558    long proc_spurious=0;
1559    long proc_missing=0;
1560    long proc_actual=0;
1561    long proc_possible=0;
1562    double proc_precision=0;
1563    double proc_recall=0;
1564    double proc_fmeasure=0;
1565
1566    if(hasProcessed) {
1567      // calculate values for processed
1568      proc_correct = (proc_correctByType.get(annotType) == null)? 0 :
1569                        ((Long)proc_correctByType.get(annotType)).longValue();
1570      proc_partial = (proc_partialByType.get(annotType) == null)? 0 :
1571                        ((Long)proc_partialByType.get(annotType)).longValue();
1572      proc_spurious = (proc_spurByType.get(annotType) == null)? 0 :
1573                        ((Long)proc_spurByType.get(annotType)).longValue();
1574      proc_missing = (proc_missingByType.get(annotType) == null)? 0:
1575                        ((Long)proc_missingByType.get(annotType)).longValue();
1576      proc_actual = proc_correct + proc_partial + proc_spurious;
1577      proc_possible = proc_correct + proc_partial + proc_missing;
1578      //precision strict is correct/actual
1579      //precision is (correct + 0.5 * partially correct)/actual
1580      proc_precision = (proc_correct + 0.5*proc_partial)/proc_actual;
1581      //recall strict is correct/possible
1582      proc_recall = (proc_correct + 0.5*proc_partial)/proc_possible;
1583      //F-measure = ( (beta*beta + 1)*P*R ) / ((beta*beta*P) + R)
1584      proc_fmeasure =
1585        ((beta*beta + 1)*proc_precision*proc_recall)
1586        /
1587        ((beta*beta*proc_precision) + proc_recall);
1588
1589    }
1590
1591    // output data
1592    Out.prln("<TR>");
1593    if(hasProcessed)
1594      Out.prln("<TD>" + annotType+ "_new"  + "</TD>");
1595    else
1596      Out.prln("<TD>" + annotType + "</TD>");
1597
1598    Out.prln("<TD>" + correct + "</TD>");
1599    Out.prln("<TD>" + partial + "</TD>");
1600    Out.prln("<TD>" + missing + "</TD>");
1601    Out.prln("<TD>" + spurious + "</TD>");
1602
1603    String strPrec = (isMoreInfoMode)?
1604        avgPrint(precision, 4)
1605        :Double.toString(precision);
1606    String strRec = (isMoreInfoMode)?
1607        avgPrint(recall, 4)
1608        :Double.toString(recall);
1609    String strFmes = (isMoreInfoMode)?
1610        avgPrint(fmeasure, 4)
1611        :Double.toString(fmeasure);
1612
1613    if(hasProcessed && (precision < proc_precision))
1614      Out.prln("<TD><Font color=red>" + strPrec + "</TD>");
1615      else if(hasProcessed && (precision > proc_precision))
1616        Out.prln("<TD><Font color=blue>" + strPrec + "</TD>");
1617        else
1618          Out.prln("<TD>" + strPrec + "</TD>");
1619    if(hasProcessed && (recall < proc_recall))
1620      Out.prln("<TD><Font color=red>" + strRec + "</TD>");
1621      else if(hasProcessed && (recall > proc_recall))
1622        Out.prln("<TD><Font color=blue>" + strRec + "</TD>");
1623        else
1624          Out.prln("<TD>" + strRec + "</TD>");
1625    Out.prln("<TD>" + strFmes + "</TD>");
1626    Out.prln("</TR>");
1627
1628    if(hasProcessed) {
1629      // output data
1630      Out.prln("<TR>");
1631      Out.prln("<TD>" + annotType + "_old" + "</TD>");
1632
1633      Out.prln("<TD>" + proc_correct + "</TD>");
1634      Out.prln("<TD>" + proc_partial + "</TD>");
1635      Out.prln("<TD>" + proc_missing + "</TD>");
1636      Out.prln("<TD>" + proc_spurious + "</TD>");
1637
1638      String strProcPrec = (isMoreInfoMode)?
1639          avgPrint(proc_precision, 4)
1640          :Double.toString(proc_precision);
1641      String strProcRec = (isMoreInfoMode)?
1642          avgPrint(proc_recall, 4)
1643          :Double.toString(proc_recall);
1644      String strProcFmes = (isMoreInfoMode)?
1645          avgPrint(proc_fmeasure, 4)
1646          :Double.toString(proc_fmeasure);
1647
1648      if(precision < proc_precision)
1649        Out.prln("<TD><Font color=red>" + strProcPrec + "</TD>");
1650        else if(precision > proc_precision)
1651          Out.prln("<TD><Font color=blue>" + strProcPrec + "</TD>");
1652          else
1653            Out.prln("<TD>" + strProcPrec + "</TD>");
1654      if(recall < proc_recall)
1655        Out.prln("<TD><Font color=red>" + strProcRec + "</TD>");
1656        else if(recall > proc_recall)
1657          Out.prln("<TD><Font color=blue>" + strProcRec + "</TD>");
1658          else
1659            Out.prln("<TD>" + strProcRec + "</TD>");
1660      Out.prln("<TD>" + strProcFmes + "</TD>");
1661      Out.prln("</TR>");
1662    }
1663  }//printStatsForType
1664
1665  //** Print @param value with @param count digits after decimal point */
1666  protected String avgPrint(double value, int count) {
1667    double newvalue;
1668    double power = Math.pow(10, count);
1669    newvalue = Math.round( value * power )/ power;
1670    return Double.toString(newvalue);
1671  }
1672
1673
1674  private double precisionSumCalc = 0;
1675  private double recallSumCalc = 0;
1676  private double fMeasureSumCalc = 0;
1677
1678  public double getPrecisionAverageCalc() {
1679    return precisionSumCalc;
1680  }
1681
1682  public double getRecallAverageCalc() {
1683    return recallSumCalc;
1684  }
1685
1686  public double getFmeasureAverageCalc() {
1687    return fMeasureSumCalc;
1688  }
1689
1690  protected void calculateAvgTotal() {
1691    long correct, partial, spurious, missing;
1692    long correctSum, partialSum, spuriousSum, missingSum;
1693
1694    if (annotTypes == null) {
1695      return;
1696    }
1697    correctSum = partialSum = spuriousSum = missingSum = 0;
1698
1699    String annotType;
1700    for (int i = 0; i < annotTypes.size(); i++) {
1701      annotType = (String) annotTypes.get(i);
1702      correct = (correctByType.get(annotType) == null)? 0 :
1703                        ((Long)correctByType.get(annotType)).longValue();
1704      partial = (partialByType.get(annotType) == null)? 0 :
1705                        ((Long)partialByType.get(annotType)).longValue();
1706      spurious = (spurByType.get(annotType) == null)? 0 :
1707                        ((Long)spurByType.get(annotType)).longValue();
1708      missing = (missingByType.get(annotType) == null)? 0:
1709                        ((Long)missingByType.get(annotType)).longValue();
1710      correctSum += correct;
1711      partialSum += partial;
1712      spuriousSum += spurious;
1713      missingSum += missing;
1714    }//for
1715
1716    long actual = correctSum + partialSum + spuriousSum;
1717    long possible = correctSum + partialSum + missingSum;
1718
1719    if(actual == 0) {
1720      precisionSumCalc = 0;
1721    }
1722    else {
1723      precisionSumCalc = (correctSum + 0.5 * partialSum) / actual;
1724    }
1725
1726    if(possible == 0) {
1727      recallSumCalc = 0;
1728    }
1729    else {
1730      recallSumCalc = (correctSum + 0.5 * partialSum) / actual;
1731    }
1732
1733    if(precisionSumCalc == 0 && recallSumCalc == 0) {
1734      fMeasureSumCalc = 0;
1735    }
1736    else {
1737      fMeasureSumCalc =
1738        ((beta*beta + 1)*precisionSumCalc*recallSumCalc)
1739        /
1740        ((beta*beta*precisionSumCalc) + recallSumCalc);
1741
1742    }
1743  } // calculateAvgTotal
1744
1745  protected AnnotationDiffer measureDocs(
1746    Document keyDoc, Document respDoc, String annotType)
1747      throws ResourceInstantiationException {
1748
1749    if (keyDoc == null || respDoc == null)
1750      return null;
1751
1752    if (annotSetName != null
1753        && keyDoc.getAnnotations(annotSetName).get(annotType) == null)
1754      return null;
1755    else if ((annotSetName == null || annotSetName.equals(""))
1756        && keyDoc.getAnnotations().get(annotType) == null)
1757      return null;
1758
1759    // create an annotation diff
1760    AnnotationDiffer annotDiffer = new AnnotationDiffer();
1761    // set the feature names set for annotation differ
1762    annotDiffer.setSignificantFeaturesSet(diffFeaturesSet);
1763    // we need to find the sets
1764    AnnotationSet keys, responses;
1765    if(annotSetName == null || annotSetName.equals("")) {
1766      keys = keyDoc.getAnnotations().get(annotType);
1767      responses = respDoc.getAnnotations().get(annotType);
1768    } else {
1769      keys = keyDoc.getAnnotations(annotSetName).get(annotType);
1770      responses = respDoc.getAnnotations(outputSetName).get(annotType);
1771    }
1772
1773    // we have annotation sets so call the annotationDiffer
1774    List pairings = annotDiffer.calculateDiff(keys,responses);
1775    return annotDiffer;
1776  } // measureDocs
1777
1778  protected void storeAnnotations(String type, AnnotationDiffer annotDiffer,
1779                  Document keyDoc, Document respDoc, Writer errFileWriter) {
1780    if(errFileWriter == null) return; // exit on "no file"
1781
1782    try {
1783      // extract and store annotations
1784      Comparator comp = new OffsetComparator();
1785      TreeSet sortedSet = new TreeSet(comp);
1786      Set missingSet =
1787          annotDiffer.getAnnotationsOfType(AnnotationDiffer.MISSING_TYPE);
1788      sortedSet.clear();
1789      sortedSet.addAll(missingSet);
1790      storeAnnotations(type+".miss", sortedSet, keyDoc, errFileWriter);
1791      Set spuriousSet =
1792          annotDiffer.getAnnotationsOfType(AnnotationDiffer.SPURIOUS_TYPE);
1793      sortedSet.clear();
1794      sortedSet.addAll(spuriousSet);
1795      storeAnnotations(type+".spur", sortedSet, respDoc, errFileWriter);
1796      Set partialSet =
1797          annotDiffer.getAnnotationsOfType(AnnotationDiffer.PARTIALLY_CORRECT_TYPE);
1798      sortedSet.clear();
1799      sortedSet.addAll(partialSet);
1800      storeAnnotations(type+".part", sortedSet, respDoc, errFileWriter);
1801    } catch (Exception ex) {
1802      Out.prln("Exception on close of error file "+errFileWriter+": "
1803               +ex.getMessage());
1804    }
1805  }// storeAnnotations
1806
1807  protected void storeAnnotations(String type, Set set, Document doc,
1808                                  Writer file) throws IOException{
1809
1810    if (set == null || set.isEmpty())
1811      return;
1812
1813    Iterator iter = set.iterator();
1814    Annotation ann;
1815    while (iter.hasNext()) {
1816      ann = (Annotation) iter.next();
1817      file.write(type);
1818      file.write(".");
1819      file.write(doc.getContent().toString().substring(
1820          ann.getStartNode().getOffset().intValue(),
1821          ann.getEndNode().getOffset().intValue()));
1822      file.write(".");
1823      file.write(ann.getStartNode().getOffset().toString());
1824      file.write(".");
1825      file.write(ann.getEndNode().getOffset().toString());
1826      file.write("\n");
1827    }//while
1828  }// storeAnnotations
1829
1830  protected void printAnnotations(AnnotationDiffer annotDiff,
1831                    Document keyDoc, Document respDoc) {
1832    Out.pr("MISSING ANNOTATIONS in the automatic texts: ");
1833    Set missingSet =
1834      annotDiff.getAnnotationsOfType(AnnotationDiffer.MISSING_TYPE);
1835    printAnnotations(missingSet, keyDoc);
1836    Out.prln("<BR>");
1837
1838    Out.pr("SPURIOUS ANNOTATIONS in the automatic texts: ");
1839    Set spuriousSet =
1840      annotDiff.getAnnotationsOfType(AnnotationDiffer.SPURIOUS_TYPE);
1841    printAnnotations(spuriousSet, respDoc);
1842    Out.prln("</BR>");
1843
1844    Out.pr("PARTIALLY CORRECT ANNOTATIONS in the automatic texts: ");
1845    Set partialSet =
1846      annotDiff.getAnnotationsOfType(AnnotationDiffer.PARTIALLY_CORRECT_TYPE);
1847    printAnnotations(partialSet, respDoc);
1848  }
1849
1850  protected void printAnnotations(Set set, Document doc) {
1851    if (set == null || set.isEmpty())
1852      return;
1853
1854    Iterator iter = set.iterator();
1855    while (iter.hasNext()) {
1856      Annotation ann = (Annotation) iter.next();
1857      Out.prln(
1858        "<B>" +
1859        doc.getContent().toString().substring(
1860          ann.getStartNode().getOffset().intValue(),
1861          ann.getEndNode().getOffset().intValue()) +
1862        "</B>: <I>[" + ann.getStartNode().getOffset() +
1863        "," + ann.getEndNode().getOffset() + "]</I>"
1864//        + "; features" + ann.getFeatures()
1865        );
1866    }//while
1867  }//printAnnotations
1868
  /**
   * The directory from which we should generate/evaluate the corpus
   */
  private File startDir;
  // NOTE(review): presumably the directory currently being worked on -
  // confirm against its uses
  private File currDir;
  // the annotation types to score; elements are read as Strings by
  // calculateAvgTotal(). Static, so shared by all instances of the tool.
  private static List annotTypes;

  // the controller that initPRs() loads from applicationFile
  private Controller application = null;
  private File applicationFile = null;

  //collect the sum of all precisions and recalls of all docs
  //and the number of docs, so I can calculate the average for
  //the corpus at the end
  private double precisionSum = 0.0;
  private double recallSum = 0.0;
  private double fMeasureSum = 0.0;
  // NOTE(review): presumably per-type running sums and the matching
  // per-type counts - confirm where they are filled in
  private HashMap precisionByType = new HashMap();
  private HashMap prCountByType = new HashMap();
  private HashMap recallByType = new HashMap();
  private HashMap recCountByType = new HashMap();
  private HashMap fMeasureByType = new HashMap();
  private HashMap fMeasureCountByType = new HashMap();

  // annotation counts keyed by annotation type; the values are read back
  // as Long, and a missing entry is treated as 0
  private HashMap missingByType = new HashMap();
  private HashMap spurByType = new HashMap();
  private HashMap correctByType = new HashMap();
  private HashMap partialByType = new HashMap();

  // statistic for processed
  // (the same figures as above, kept separately under the proc_ prefix;
  // when hasProcessed is true the statistics table gets an extra "_old"
  // row per type that is filled from them)
  static boolean hasProcessed = false;
  private double proc_precisionSum = 0;
  private double proc_recallSum = 0;
  private double proc_fMeasureSum = 0;
  private HashMap proc_precisionByType = new HashMap();
  private HashMap proc_prCountByType = new HashMap();
  private HashMap proc_recallByType = new HashMap();
  private HashMap proc_recCountByType = new HashMap();
  private HashMap proc_fMeasureByType = new HashMap();
  private HashMap proc_fMeasureCountByType = new HashMap();

  private HashMap proc_missingByType = new HashMap();
  private HashMap proc_spurByType = new HashMap();
  private HashMap proc_correctByType = new HashMap();
  private HashMap proc_partialByType = new HashMap();

  // the beta of the F-measure formula
  // ((beta*beta + 1)*P*R) / ((beta*beta*P) + R)
  double beta = 1;

  // NOTE(review): presumably the number of documents handled so far -
  // confirm against its uses
  private int docNumber = 0;

  /**
   * If true, the corpus tool will generate the corpus, otherwise it'll
   * run in evaluate mode
   */
  private boolean isGenerateMode = false;

  /**
   * If true - show annotations for docs below threshold
   */
  private boolean isVerboseMode = false;

  /**
   * If true - show more info in document table
   */
  private boolean isMoreInfoMode = false;

  /**
   * The set of feature names handed to the AnnotationDiffer as its
   * significant features (see measureDocs).
   * NOTE(review): this comment used to describe a list "separated by
   * comma" while giving the example "class;inst" - confirm the actual
   * separator at the place where the set is built.
   */
  private Set diffFeaturesSet;

  /**
   * If true, the corpus tool will evaluate stored against the human-marked
   * documents
   */
  private boolean isMarkedStored = false;
  private boolean isMarkedClean = false;
  //whether marked are in a DS, not xml
  private boolean isMarkedDS = false;

  // name of the annotation set the key annotations are read from in
  // measureDocs; null or "" selects the default annotation set
  private String annotSetName = "Key";
  // name of the annotation set the responses are read from in measureDocs
  // when annotSetName is neither null nor ""
  private String outputSetName = null;

  // NOTE(review): presumably the score below which verbose mode shows a
  // document's annotations (see isVerboseMode) - confirm
  private double threshold = 0.5;
  private Properties configs = new Properties();
  private static int corpusWordCount = 0;

  private String documentEncoding = "";

  /** String to print when wrong command-line args */
  private static String usage =
    "usage: CorpusBenchmarkTool [-generate|-marked_stored|-marked_clean] "
    +"[-verbose] [-moreinfo] directory-name application";
1962
1963}
1964