1   /*
2    *  CorpusBenchmarkTool.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 24/Oct/2001
12   *
13   *  $Id: CorpusBenchmarkTool.java,v 1.27 2002/07/02 17:50:44 kalina Exp $
14   */
15  
16  package gate.util;
17  
18  import java.util.*;
19  import java.io.*;
20  
21  import gate.*;
22  import gate.creole.*;
23  import gate.util.*;
24  import gate.persist.*;
25  import gate.creole.tokeniser.*;
26  import gate.creole.gazetteer.*;
27  import gate.creole.splitter.*;
28  import gate.creole.orthomatcher.*;
29  import gate.creole.annotransfer.*;
30  import gate.annotation.*;
31  
32  public class CorpusBenchmarkTool {
33    private static final String MARKED_DIR_NAME = "marked"; // sub-directory that execute(File) hands on as the human-annotated ("marked") documents
34    private static final String CLEAN_DIR_NAME = "clean"; // sub-directory that execute(File) hands on as the unprocessed input documents
35    private static final String CVS_DIR_NAME = "Cvs"; // sub-directory name skipped during traversal; NOTE(review): CVS admin dirs are normally named "CVS" - confirm the intended case
36    private static final String PROCESSED_DIR_NAME = "processed"; // sub-directory holding the datastore of processed documents (created by generateCorpus)
37  
38    private static final boolean DEBUG = true; // debug switch; not read anywhere in the reviewed part of this file
39  
40    public CorpusBenchmarkTool() {}
41  
42    public void initPRs() {
43      try {
44        if (applicationFile == null)
45          Out.prln("Application not set!");
46        Out.prln("App file is: " + applicationFile.getAbsolutePath());
47        application = (Controller) gate.util.persistence.PersistenceManager
48                                     .loadObjectFromFile(applicationFile);
49      } catch (Exception ex) {
50        throw new GateRuntimeException("Corpus Benchmark Tool:"+ex.getMessage());
51      }
52    }//initPRs
53  
54    public void unloadPRs() {
55      //we have nothing to unload if no PRs are loaded
56      if (isMarkedStored)
57        return;
58  
59    }
60  
61    public void execute() {
62      execute(startDir);
63      if (application != null) {
64        Iterator iter = new ArrayList(application.getPRs()).iterator();
65        while (iter.hasNext())
66          Factory.deleteResource((Resource) iter.next());
67        Factory.deleteResource(application);
68      }
69    }
70  
71    public void init() {
72      //first read the corpus_tool.properties file
73      File propFile = new File("corpus_tool.properties");
74      Out.prln(propFile.getAbsolutePath());
75      if (propFile.exists()) {
76        try {
77          InputStream inputStream = new FileInputStream(propFile);
78          this.configs.load(inputStream);
79          String thresholdString = this.configs.getProperty("threshold");
80          if (thresholdString != null && !thresholdString.equals("")) {
81            this.threshold = (new Double(thresholdString)).doubleValue();
82            Out.prln("New threshold is: " + this.threshold + "<P>\n");
83          }
84          String setName = this.configs.getProperty("annotSetName");
85          if (setName != null && !setName.equals("")) {
86            Out.prln("Annotation set in marked docs is: " + setName + " <P>\n");
87            this.annotSetName = setName;
88          }
89          setName = this.configs.getProperty("outputSetName");
90          if (setName != null && !setName.equals("")) {
91            Out.prln("Annotation set in processed docs is: " + setName + " <P>\n");
92            this.outputSetName = setName;
93          }
94          String types = this.configs.getProperty("annotTypes");
95          if (types != null && !types.equals("")) {
96            Out.prln("Using annotation types from the properties file. <P>\n");
97            StringTokenizer strTok = new StringTokenizer(types, ";");
98            annotTypes = new ArrayList();
99            while (strTok.hasMoreTokens())
100             annotTypes.add(strTok.nextToken());
101         } else {
102           annotTypes = new ArrayList();
103           annotTypes.add("Organization");
104           annotTypes.add("Person");
105           annotTypes.add("Date");
106           annotTypes.add("Location");
107           annotTypes.add("Address");
108           annotTypes.add("Money");
109           annotTypes.add("Percent");
110           annotTypes.add("GPE");
111           annotTypes.add("Facility");
112         }
113 
114       } catch (IOException ex) {
115         //just ignore the file and go on with the defaults
116         this.configs = new Properties();
117       }
118     } else
119       this.configs = new Properties();
120 
121 
122     //we only initialise the PRs if they are going to be used
123     //for processing unprocessed documents
124     if (!this.isMarkedStored)
125       initPRs();
126 
127   }
128 
129   public void execute(File dir) {
130     if (dir == null)
131       return;
132     //first set the current directory to be the given one
133     currDir = dir;
134     Out.prln("Processing directory: " + currDir + "<P>");
135 
136     File processedDir = null;
137     File cleanDir = null;
138     File markedDir = null;
139 
140     ArrayList subDirs = new ArrayList();
141     File[] dirArray = currDir.listFiles();
142     for (int i = 0; i < dirArray.length; i++) {
143       if (dirArray[i].isFile() || dirArray[i].getName().equals(CVS_DIR_NAME))
144         continue;
145       if (dirArray[i].getName().equals(CLEAN_DIR_NAME))
146         cleanDir = dirArray[i];
147       else if (dirArray[i].getName().equals(MARKED_DIR_NAME))
148         markedDir = dirArray[i];
149       else if (dirArray[i].getName().equals(PROCESSED_DIR_NAME))
150         processedDir = dirArray[i];
151       else
152         subDirs.add(dirArray[i]);
153     }
154 
155     if (this.isGenerateMode)
156       generateCorpus(cleanDir, processedDir);
157     else
158       evaluateCorpus(cleanDir, processedDir, markedDir);
159 
160     //if no more subdirs left, return
161     if (subDirs.isEmpty())
162       return;
163 
164     //there are more subdirectories to traverse, so iterate through
165     for (int j = 0; j < subDirs.size(); j++)
166       execute((File) subDirs.get(j));
167 
168   }//execute(dir)
169 
170 
171   public static void main(String[] args) throws GateException {
172     Out.prln("<HTML>");
173     Out.prln("<HEAD>");
174     Out.prln("<TITLE> Corpus benchmark tool: ran with args " +
175             args.toString() + " on " +
176             new Date() + "</TITLE> </HEAD>");
177     Out.prln("<BODY>");
178     Out.prln("Please wait while GATE tools are initialised. <P>");
179     // initialise GATE
180     Gate.init();
181 
182     CorpusBenchmarkTool corpusTool = new CorpusBenchmarkTool();
183 
184     List inputFiles = null;
185     if(args.length < 1) throw new GateException(usage);
186     int i = 0;
187     while (i < args.length && args[i].startsWith("-")) {
188       if(args[i].equals("-generate")) {
189         Out.prln("Generating the corpus... <P>");
190         corpusTool.setGenerateMode(true);
191       } else if (args[i].equals("-marked_clean")) {
192         Out.prln("Evaluating current grammars against human-annotated...<P>");
193         corpusTool.setMarkedClean(true);
194       } else if (args[i].equals("-marked_stored")) {
195         Out.prln("Evaluating stored documents against human-annotated...<P>");
196         corpusTool.setMarkedStored(true);
197       } else if (args[i].equals("-marked_ds")) {
198         Out.prln("Looking for marked docs in a datastore...<P>");
199         corpusTool.setMarkedDS(true);
200       } else if (args[i].equals("-verbose")) {
201         Out.prln("Running in verbose mode. Will generate annotation " +
202           "information when precision/recall are lower than " +
203           corpusTool.getThreshold() +"<P>");
204         corpusTool.setVerboseMode(true);
205       }
206       i++; //just ignore the option, which we do not recognise
207     }//while
208 
209     String dirName = args[i];
210     File dir = new File(dirName);
211     if (!dir.isDirectory())
212       throw new GateException(usage);
213 
214     //get the last argument which is the application
215     i++;
216     String appName = args[i];
217     File appFile = new File(appName);
218     if (!appFile.isFile())
219       throw new GateException(usage);
220     else
221       corpusTool.setApplicationFile(appFile);
222 
223     corpusTool.init();
224 
225     Out.prln("Measuring annotaitions of types: " + corpusTool.annotTypes + "<P>");
226 
227     corpusTool.setStartDirectory(dir);
228     corpusTool.execute();
229 
230     //if we're not generating the corpus, then print the precision and recall
231     //statistics for the processed corpus
232     if (! corpusTool.getGenerateMode())
233       corpusTool.printStatistics();
234 
235     Out.prln("Finished! <P>");
236     Out.prln("</BODY>");
237     Out.prln("</HTML>");
238 
239     System.exit(0);
240 
241   }//main
242 
243   public void setGenerateMode(boolean mode) {
244     isGenerateMode = mode;
245   }//setGenerateMode
246 
247   public boolean getGenerateMode() {
248     return isGenerateMode;
249   }//getGenerateMode
250 
251   public boolean getVerboseMode() {
252     return isVerboseMode;
253   }//getVerboseMode
254 
255   public void setVerboseMode(boolean mode) {
256     isVerboseMode = mode;
257   }//setVerboseMode
258 
259   public void setMarkedStored(boolean mode) {
260     isMarkedStored = mode;
261   }//
262 
263   public boolean getMarkedStored() {
264     return isMarkedStored;
265   }//
266 
267   public void setMarkedClean(boolean mode) {
268     isMarkedClean = mode;
269   }//
270 
271   public boolean getMarkedClean() {
272     return isMarkedClean;
273   }//
274 
275   public void setMarkedDS(boolean mode) {
276     isMarkedDS = mode;
277   }//
278 
279   public boolean getMarkedDS() {
280     return isMarkedDS;
281   }//
282 
283   public void setApplicationFile(File newAppFile) {
284     applicationFile = newAppFile;
285   }
286 
287   /**
288    * Returns the average precision over the entire set of processed documents.
289    * <P>
290    * If the tool has been evaluating the original documents against the
291    * previously-stored automatically annotated ones, then the precision
292    * will be the average precision on those two sets. <P>
293    * If the tool was run in -marked mode, i.e., was evaluating the stored
294    * automatically processed ones against the human-annotated ones, then
295    * the precision will be the average precision on those two sets of documents.
296    */
297   public double getPrecisionAverage() {
298     return precisionSum/docNumber;
299   }
300 
301   /**
302    * Returns the average recall over the entire set of processed documents.
303    * <P>
304    * If the tool has been evaluating the original documents against the
305    * previously-stored automatically annotated ones, then the recall
306    * will be the average recall on those two sets. <P>
307    * If the tool was run in -marked mode, i.e., was evaluating the stored
308    * automatically processed ones against the human-annotated ones, then
309    * the recall will be the average recall on those two sets of documents.
310    */
311   public double getRecallAverage() {
312     return recallSum/docNumber;
313   }
314 
315   public boolean isGenerateMode() {
316     return isGenerateMode == true;
317   }//isGenerateMode
318 
319   public double getThreshold() {
320     return threshold;
321   }
322 
323   public void setThreshold(double newValue) {
324     threshold = newValue;
325   }
326 
327   public File getStartDirectory() {
328     return startDir;
329   }//getStartDirectory
330 
331   public void setStartDirectory(File dir) {
332     startDir = dir;
333   }//setStartDirectory
334 
335   protected void generateCorpus(File fileDir, File outputDir) {
336     //1. check if we have input files
337     if (fileDir == null)
338       return;
339     //2. create the output directory or clean it up if needed
340     File outDir = outputDir;
341     if (outputDir == null) {
342       outDir = new File(currDir, PROCESSED_DIR_NAME);
343     } else {
344       // get rid of the directory, coz datastore wants it clean
345       if (!Files.rmdir(outDir))
346         Out.prln("cannot delete old output directory: " + outDir);
347     }
348     outDir.mkdir();
349 
350     //create the datastore and process each document
351     try {
352       SerialDataStore sds = new SerialDataStore(outDir.toURL().toString());
353       sds.create();
354       sds.open();
355 
356       File[] files = fileDir.listFiles();
357       for (int i=0; i < files.length; i++) {
358         if (!files[i].isFile())
359           continue;
360         // create a document
361         Out.prln("Processing and storing document: " + files[i].toURL() +"<P>");
362 
363         FeatureMap params = Factory.newFeatureMap();
364         params.put(Document.DOCUMENT_URL_PARAMETER_NAME, files[i].toURL());
365         params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
366 
367         // create the document
368         Document doc = (Document) Factory.createResource(
369           "gate.corpora.DocumentImpl", params
370         );
371 
372         doc.setName(files[i].getName());
373         if (doc == null)
374           continue;
375         processDocument(doc);
376         LanguageResource lr = sds.adopt(doc, null);
377         sds.sync(lr);
378         Factory.deleteResource(doc);
379         Factory.deleteResource(lr);
380       }//for
381       sds.close();
382     } catch (java.net.MalformedURLException ex) {
383       throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
384     } catch (PersistenceException ex1) {
385       throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
386     } catch (ResourceInstantiationException ex2) {
387       throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
388     } catch (gate.security.SecurityException ex3) {
389       throw new GateRuntimeException("CorpusBenchmark: " + ex3.getMessage());
390     }
391 
392   }//generateCorpus
393 
394   protected void evaluateCorpus(File fileDir,
395                     File processedDir, File markedDir) {
396     //1. check if we have input files and the processed Dir
397     if (fileDir == null || !fileDir.exists())
398       return;
399     if (processedDir == null || !processedDir.exists())
400       //if the user wants evaluation of marked and stored that's not possible
401       if (isMarkedStored) {
402         Out.prln("Cannot evaluate because no processed documents exist.");
403         return;
404       }
405       else
406         isMarkedClean = true;
407 
408     //looked for marked texts only if the directory exists
409     boolean processMarked = markedDir != null && markedDir.exists();
410     if (!processMarked && (isMarkedStored || isMarkedClean)) {
411         Out.prln("Cannot evaluate because no human-annotated documents exist.");
412         return;
413     }
414 
415     if (isMarkedStored) {
416       evaluateMarkedStored(markedDir, processedDir);
417       return;
418     } else if (isMarkedClean) {
419       evaluateMarkedClean(markedDir, fileDir);
420       return;
421     }
422 
423     Document persDoc = null;
424     Document cleanDoc = null;
425     Document markedDoc = null;
426 
427     //open the datastore and process each document
428     try {
429       //open the data store
430       DataStore sds = Factory.openDataStore
431                       ("gate.persist.SerialDataStore",
432                        processedDir.toURL().toExternalForm());
433 
434       List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
435       for (int i=0; i < lrIDs.size(); i++) {
436         String docID = (String) lrIDs.get(i);
437 
438         //read the stored document
439         FeatureMap features = Factory.newFeatureMap();
440         features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
441         features.put(DataStore.LR_ID_FEATURE_NAME, docID);
442         persDoc = (Document) Factory.createResource(
443                                     "gate.corpora.DocumentImpl",
444                                     features);
445 
446         Out.prln("<H2>" + persDoc.getName() + "</H2>");
447 
448         File cleanDocFile = new File(fileDir, persDoc.getName());
449         //try reading the original document from clean
450         if (! cleanDocFile.exists()) {
451           Out.prln("Warning: Cannot find original document " +
452                    persDoc.getName() + " in " + fileDir);
453         } else {
454           FeatureMap params = Factory.newFeatureMap();
455           params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocFile.toURL());
456           params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
457 
458           // create the document
459           cleanDoc = (Document) Factory.createResource(
460                                   "gate.corpora.DocumentImpl", params);
461           cleanDoc.setName(persDoc.getName());
462         }
463 
464         //try finding the marked document
465         StringBuffer docName = new StringBuffer(persDoc.getName());
466         if (! isMarkedDS) {
467           docName.replace(
468             persDoc.getName().lastIndexOf("."),
469             docName.length(),
470             ".xml");
471           File markedDocFile = new File(markedDir, docName.toString());
472           if (! processMarked || ! markedDocFile.exists()) {
473             Out.prln("Warning: Cannot find human-annotated document " +
474                      markedDocFile + " in " + markedDir);
475           } else {
476             FeatureMap params = Factory.newFeatureMap();
477             params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL());
478             params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
479 
480             // create the document
481             markedDoc = (Document) Factory.createResource(
482                                      "gate.corpora.DocumentImpl", params);
483             markedDoc.setName(persDoc.getName());
484           }
485         } else {
486           //open marked from a DS
487           //open the data store
488           DataStore sds1 = Factory.openDataStore
489                           ("gate.persist.SerialDataStore",
490                            markedDir.toURL().toExternalForm());
491 
492           List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
493           boolean found = false;
494           int k = 0;
495           //search for the marked doc with the same name
496           while (k < lrIDs1.size() && !found) {
497             String docID1 = (String) lrIDs1.get(k);
498 
499             //read the stored document
500             FeatureMap features1 = Factory.newFeatureMap();
501             features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
502             features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
503             Document tempDoc = (Document) Factory.createResource(
504                                         "gate.corpora.DocumentImpl",
505                                         features1);
506             //check whether this is our doc
507             if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")).
508                  endsWith(persDoc.getName())) {
509               found = true;
510               markedDoc = tempDoc;
511             } else k++;
512           }
513         }
514 
515         evaluateDocuments(persDoc, cleanDoc, markedDoc);
516         if (persDoc != null)
517           Factory.deleteResource(persDoc);
518         if (cleanDoc != null)
519           Factory.deleteResource(cleanDoc);
520         if (markedDoc != null)
521           Factory.deleteResource(markedDoc);
522 
523       }//for loop through saved docs
524       sds.close();
525     } catch (java.net.MalformedURLException ex) {
526       throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
527     } catch (PersistenceException ex1) {
528       throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
529     } catch (ResourceInstantiationException ex2) {
530       throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
531     }
532 
533   }//evaluateCorpus
534 
535   protected void evaluateMarkedStored(File markedDir, File storedDir) {
536     Document persDoc = null;
537     Document cleanDoc = null;
538     Document markedDoc = null;
539 
540     //open the datastore and process each document
541     try {
542       //open the data store
543       DataStore sds = Factory.openDataStore
544                       ("gate.persist.SerialDataStore",
545                        storedDir.toURL().toExternalForm());
546 
547       List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
548       for (int i=0; i < lrIDs.size(); i++) {
549         String docID = (String) lrIDs.get(i);
550 
551         //read the stored document
552         FeatureMap features = Factory.newFeatureMap();
553         features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
554         features.put(DataStore.LR_ID_FEATURE_NAME, docID);
555         persDoc = (Document) Factory.createResource(
556                                     "gate.corpora.DocumentImpl",
557                                     features);
558 
559         Out.prln("<H2>" + persDoc.getName() + "</H2>");
560 
561         if (! this.isMarkedDS) { //try finding the marked document as file
562           StringBuffer docName = new StringBuffer(persDoc.getName());
563           docName.replace(
564             persDoc.getName().lastIndexOf("."),
565             docName.length(),
566             ".xml");
567           File markedDocFile = new File(markedDir, docName.toString());
568           if (! markedDocFile.exists()) {
569             Out.prln("Warning: Cannot find human-annotated document " +
570                      markedDocFile + " in " + markedDir);
571           } else {
572             FeatureMap params = Factory.newFeatureMap();
573             params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL());
574             params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
575 
576             // create the document
577             markedDoc = (Document) Factory.createResource(
578                                      "gate.corpora.DocumentImpl", params);
579             markedDoc.setName(persDoc.getName());
580           }//find marked as file
581         } else {
582           try {
583             //open marked from a DS
584             //open the data store
585             DataStore sds1 = Factory.openDataStore
586                             ("gate.persist.SerialDataStore",
587                              markedDir.toURL().toExternalForm());
588 
589             List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
590             boolean found = false;
591             int k = 0;
592             //search for the marked doc with the same name
593             while (k < lrIDs1.size() && !found) {
594               String docID1 = (String) lrIDs1.get(k);
595 
596               //read the stored document
597               FeatureMap features1 = Factory.newFeatureMap();
598               features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
599               features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
600               Document tempDoc = (Document) Factory.createResource(
601                                           "gate.corpora.DocumentImpl",
602                                           features1);
603               //check whether this is our doc
604               if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")).
605                    endsWith(persDoc.getName())) {
606                 found = true;
607                 markedDoc = tempDoc;
608               } else k++;
609             }
610           } catch (java.net.MalformedURLException ex) {
611             Out.prln("Error finding marked directory " + markedDir.getAbsolutePath());
612           } catch (gate.persist.PersistenceException ex1) {
613             Out.prln("Error opening marked as a datastore (-marked_ds specified)");
614           } catch (gate.creole.ResourceInstantiationException ex2) {
615             Out.prln("Error opening marked as a datastore (-marked_ds specified)");
616           }
617         }
618 
619         evaluateDocuments(persDoc, cleanDoc, markedDoc);
620         if (persDoc != null)
621           Factory.deleteResource(persDoc);
622         if (markedDoc != null)
623           Factory.deleteResource(markedDoc);
624 
625       }//for loop through saved docs
626       sds.close();
627 
628     } catch (java.net.MalformedURLException ex) {
629       throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
630     } catch (PersistenceException ex1) {
631       throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
632     } catch (ResourceInstantiationException ex2) {
633       throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
634     }
635 
636   }//evaluateMarkedStored
637 
638 
639   protected void evaluateMarkedClean(File markedDir, File cleanDir) {
640     Document persDoc = null;
641     Document cleanDoc = null;
642     Document markedDoc = null;
643 
644     File[] cleanDocs = cleanDir.listFiles();
645     for (int i = 0; i< cleanDocs.length; i++) {
646       if (!cleanDocs[i].isFile())
647         continue;
648 
649       //try reading the original document from clean
650       FeatureMap params = Factory.newFeatureMap();
651       try {
652         params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocs[i].toURL());
653       } catch (java.net.MalformedURLException ex) {
654         Out.prln("Cannot create document from file: " +
655           cleanDocs[i].getAbsolutePath());
656         continue;
657       }
658       params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
659 
660       // create the document
661       try {
662         cleanDoc = (Document) Factory.createResource(
663                               "gate.corpora.DocumentImpl", params,
664                               null, cleanDocs[i].getName());
665       } catch (gate.creole.ResourceInstantiationException ex) {
666         Out.prln("Cannot create document from file: " +
667           cleanDocs[i].getAbsolutePath());
668         continue;
669       }
670 
671       Out.prln("<TD>" + cleanDocs[i].getName() + "</TD>");
672 
673       //try finding the marked document
674       if (! isMarkedDS) {
675         StringBuffer docName = new StringBuffer(cleanDoc.getName());
676         docName.replace(
677           cleanDoc.getName().lastIndexOf("."),
678           docName.length(),
679           ".xml");
680         File markedDocFile = new File(markedDir, docName.toString());
681         if (! markedDocFile.exists()) {
682           Out.prln("Warning: Cannot find human-annotated document " +
683                    markedDocFile + " in " + markedDir);
684           continue;
685         } else {
686           params = Factory.newFeatureMap();
687           try {
688             params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL());
689           } catch (java.net.MalformedURLException ex) {
690             Out.prln("Cannot create document from file: " +
691               markedDocFile.getAbsolutePath());
692             continue;
693           }
694           params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
695 
696           // create the document
697           try {
698             markedDoc = (Document) Factory.createResource(
699                                    "gate.corpora.DocumentImpl", params,
700                                    null, cleanDoc.getName());
701           } catch (gate.creole.ResourceInstantiationException ex) {
702             Out.prln("Cannot create document from file: " +
703               markedDocFile.getAbsolutePath());
704             continue;
705           }
706 
707         }//if markedDoc exists
708       } else {
709         try {
710           //open marked from a DS
711           //open the data store
712           DataStore sds1 = Factory.openDataStore
713                           ("gate.persist.SerialDataStore",
714                            markedDir.toURL().toExternalForm());
715 
716           List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
717           boolean found = false;
718           int k = 0;
719           //search for the marked doc with the same name
720           while (k < lrIDs1.size() && !found) {
721             String docID1 = (String) lrIDs1.get(k);
722 
723             //read the stored document
724             FeatureMap features1 = Factory.newFeatureMap();
725             features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
726             features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
727             Document tempDoc = (Document) Factory.createResource(
728                                         "gate.corpora.DocumentImpl",
729                                         features1);
730             //check whether this is our doc
731             if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")).
732                  endsWith(cleanDoc.getName())) {
733               found = true;
734               markedDoc = tempDoc;
735             } else k++;
736           }
737         } catch (java.net.MalformedURLException ex) {
738           Out.prln("Error finding marked directory " + markedDir.getAbsolutePath());
739         } catch (gate.persist.PersistenceException ex1) {
740           Out.prln("Error opening marked as a datastore (-marked_ds specified)");
741         } catch (gate.creole.ResourceInstantiationException ex2) {
742           Out.prln("Error opening marked as a datastore (-marked_ds specified)");
743         }
744       } //if using a DS for marked
745 
746       try {
747         evaluateDocuments(persDoc, cleanDoc, markedDoc);
748       } catch (gate.creole.ResourceInstantiationException ex) {
749 ex.printStackTrace();
750         Out.prln("Evaluate failed on document: " + cleanDoc.getName());
751       }
752       if (persDoc != null)
753         Factory.deleteResource(persDoc);
754       if (cleanDoc != null)
755         Factory.deleteResource(cleanDoc);
756       if (markedDoc != null)
757         Factory.deleteResource(markedDoc);
758 
759     }//for loop through clean docs
760 
761 
762   }//evaluateMarkedClean
763 
764   protected void processDocument(Document doc) {
765     try {
766       if (application instanceof CorpusController) {
767         Corpus tempCorpus = Factory.newCorpus("temp");
768         tempCorpus.add(doc);
769         ((CorpusController)application).setCorpus(tempCorpus);
770         application.execute();
771         Factory.deleteResource(tempCorpus);
772         tempCorpus = null;
773       } else {
774         Iterator iter = application.getPRs().iterator();
775         while (iter.hasNext())
776           ((ProcessingResource) iter.next()).setParameterValue("document", doc);
777         application.execute();
778       }
779     } catch (ResourceInstantiationException ex) {
780       throw new RuntimeException("Error executing application: "
781                                     + ex.getMessage());
782     } catch (ExecutionException ex) {
783       throw new RuntimeException("Error executing application: "
784                                     + ex.getMessage());
785     }
786   }
787 
788   protected void evaluateDocuments(Document persDoc,
789                     Document cleanDoc, Document markedDoc)
790                         throws ResourceInstantiationException {
791     if (cleanDoc == null && markedDoc == null)
792       return;
793 
794     //we've got no types to compare
795     if (annotTypes == null || annotTypes.isEmpty())
796       return;
797 
798     if (cleanDoc != null && !isMarkedStored) {
799 
800       processDocument(cleanDoc);
801 
802       if(!isMarkedClean)
803         evaluateAllThree(persDoc, cleanDoc, markedDoc);
804       else
805         evaluateTwoDocs(markedDoc, cleanDoc);
806 
807     } else
808       evaluateTwoDocs(markedDoc, persDoc);
809 
810   }
811 
812   protected void evaluateAllThree(Document persDoc,
813                                   Document cleanDoc, Document markedDoc)
814                                   throws ResourceInstantiationException {
815     //first start the table and its header
816     printTableHeader();
817     for (int jj= 0; jj< annotTypes.size(); jj++) {
818       String annotType = (String) annotTypes.get(jj);
819 
820       AnnotationDiff annotDiff=measureDocs(markedDoc, cleanDoc, annotType);
821       //we don't have this annotation type in this document
822       if (annotDiff == null)
823         continue;
824       Out.prln("<TR>");
825 
826       //increase the number of processed documents
827       docNumber++;
828       //add precison and recall to the sums
829       updateStatistics(annotDiff, annotType);
830 
831       Out.prln("<TD> Annotation type: " + annotType + "</TD>");
832 
833       AnnotationDiff annotDiff1 =
834         measureDocs(markedDoc, persDoc, annotType);
835 
836       Out.prln("<TD>" + annotDiff.getPrecisionAverage());
837       //check the precision first
838       if (annotDiff1 != null &&
839           annotDiff!= null &&
840           annotDiff1.getPrecisionAverage()<annotDiff.getPrecisionAverage()
841          )
842         Out.prln("<P> Precision increase on human-marked from " +
843                  annotDiff1.getPrecisionAverage() + " to " +
844                  annotDiff.getPrecisionAverage() + "</P>");
845       else if (annotDiff1 != null
846                && annotDiff != null
847                && annotDiff1.getPrecisionAverage()
848                    > annotDiff.getPrecisionAverage())
849         Out.prln("<P> Precision decrease on human-marked from " +
850                  annotDiff1.getPrecisionAverage() + " to " +
851                  annotDiff.getPrecisionAverage() + "</P>");
852       Out.prln("</TD>");
853 
854       Out.prln("<TD>" + annotDiff.getRecallAverage());
855       //check the recall now
856       if (annotDiff1 != null &&
857           annotDiff!= null &&
858           annotDiff1.getRecallAverage()<annotDiff.getRecallAverage()
859          )
860         Out.prln("<P> Recall increase on human-marked from " +
861                  annotDiff1.getRecallAverage() + " to " +
862                  annotDiff.getRecallAverage() + "</P>");
863       else if (annotDiff1 != null
864                && annotDiff != null
865                && annotDiff1.getRecallAverage()
866                    > annotDiff.getRecallAverage())
867         Out.prln("<P> Recall decrease on human-marked from " +
868                  annotDiff1.getRecallAverage() + " to " +
869                  annotDiff.getRecallAverage() + "</P>");
870 
871       Out.prln("</TD>");
872 
873       //check the recall now
874       if ( isVerboseMode
875            &&
876            ((annotDiff.getRecallAverage() < threshold
877              ||
878              annotDiff.getRecallAverage() < threshold)
879            )
880          )
881         printAnnotations(annotDiff, markedDoc, cleanDoc);
882 
883 
884       Out.prln("</TR>");
885     }//for loop through annotation types
886     Out.prln("</TABLE>");
887 
888   }//evaluateAllThree
889 
890   protected void evaluateTwoDocs(Document keyDoc, Document respDoc)
891         throws ResourceInstantiationException {
892 
893     //first start the table and its header
894     printTableHeader();
895     for (int jj= 0; jj< annotTypes.size(); jj++) {
896       String annotType = (String) annotTypes.get(jj);
897 
898       AnnotationDiff annotDiff=measureDocs(keyDoc, respDoc, annotType);
899       //we don't have this annotation type in this document
900       if (annotDiff == null)
901         continue;
902       Out.prln("<TR>");
903 
904       //increase the number of processed documents
905       docNumber++;
906       //add precison and recall to the sums
907       updateStatistics(annotDiff, annotType);
908 
909       Out.prln("<TD>" + annotType + "</TD>");
910 
911       Out.prln("<TD>" + annotDiff.getPrecisionAverage() + "</TD>");
912       Out.prln("<TD>" + annotDiff.getRecallAverage() + "</TD>");
913       //check the recall now
914       if ( isVerboseMode
915            &&
916            ((annotDiff.getRecallAverage() < threshold
917              ||
918              annotDiff.getRecallAverage() < threshold)
919            )
920          )
921         printAnnotations(annotDiff, keyDoc, respDoc);
922 
923       Out.prln("</TR>");
924     }//for loop through annotation types
925     Out.prln("</TABLE>");
926 
927   }//evaluateTwoDocs
928 
929   protected void printTableHeader() {
930     Out.prln("<TABLE BORDER=1");
931     if (isVerboseMode)
932       Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Precision</B></TD> "
933               + "<TD><B>Recall</B></TD> <TD><B>Annotations<B></TD>");
934     else
935       Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Precision</B></TD> "
936               + "<TD><B>Recall</B></TD>");
937   }
938 
939   protected void updateStatistics(AnnotationDiff annotDiff, String annotType){
940       precisionSum += annotDiff.getPrecisionAverage();
941       recallSum += annotDiff.getRecallAverage();
942       fMeasureSum += annotDiff.getFMeasureAverage();
943       Double oldPrecision = (Double) precisionByType.get(annotType);
944       if (oldPrecision == null)
945         precisionByType.put(annotType,
946                             new Double(annotDiff.getPrecisionAverage()));
947       else
948         precisionByType.put(annotType,
949                             new Double(oldPrecision.doubleValue() +
950                                        annotDiff.getPrecisionAverage()));
951       Integer precCount = (Integer) prCountByType.get(annotType);
952       if (precCount == null)
953         prCountByType.put(annotType, new Integer(1));
954       else
955         prCountByType.put(annotType, new Integer(precCount.intValue() + 1));
956 
957 
958       Double oldFMeasure = (Double) fMeasureByType.get(annotType);
959       if (oldFMeasure == null)
960         fMeasureByType.put(annotType,
961                          new Double(annotDiff.getFMeasureAverage()));
962       else
963         fMeasureByType.put(annotType,
964                          new Double(oldFMeasure.doubleValue() +
965                                     annotDiff.getFMeasureAverage()));
966       Integer fCount = (Integer) fMeasureCountByType.get(annotType);
967       if (fCount == null)
968         fMeasureCountByType.put(annotType, new Integer(1));
969       else
970         fMeasureCountByType.put(annotType, new Integer(fCount.intValue() + 1));
971 
972               Double oldRecall = (Double) recallByType.get(annotType);
973       if (oldRecall == null)
974         recallByType.put(annotType,
975                             new Double(annotDiff.getRecallAverage()));
976       else
977         recallByType.put(annotType,
978                             new Double(oldRecall.doubleValue() +
979                                        annotDiff.getRecallAverage()));
980       Integer recCount = (Integer) recCountByType.get(annotType);
981       if (recCount == null)
982         recCountByType.put(annotType, new Integer(1));
983       else
984         recCountByType.put(annotType, new Integer(recCount.intValue() + 1));
985   }
986 
987   protected void printStatistics() {
988 
989     Out.prln("<H2> Statistics </H2>");
990     Out.prln("<H3> Precision </H3>");
991     if (precisionByType != null && !precisionByType.isEmpty()) {
992       Iterator iter = precisionByType.keySet().iterator();
993       while (iter.hasNext()) {
994         String annotType = (String) iter.next();
995         Out.prln(annotType + ": "
996           + ((Double)precisionByType.get(annotType)).doubleValue()
997               /
998               ((Integer)prCountByType.get(annotType)).intValue()
999           + "<P>");
1000      }//while
1001    }
1002    Out.prln("Overall precision: " + getPrecisionAverage() + "<P>");
1003
1004    Out.prln("<H3> Recall </H3>");
1005    if (recallByType != null && !recallByType.isEmpty()) {
1006      Iterator iter = recallByType.keySet().iterator();
1007      while (iter.hasNext()) {
1008        String annotType = (String) iter.next();
1009        Out.prln(annotType + ": "
1010          + ((Double)recallByType.get(annotType)).doubleValue()
1011              /
1012              ((Integer)recCountByType.get(annotType)).intValue()
1013          + "<P>");
1014      }//while
1015    }
1016
1017    Out.prln("Overall recall: " + getRecallAverage()
1018             + "<P>");
1019
1020    Out.prln("<H3> F-Measure </H3>");
1021    if (fMeasureByType != null && !fMeasureByType.isEmpty()) {
1022      Iterator iter = fMeasureByType.keySet().iterator();
1023      while (iter.hasNext()) {
1024        String annotType = (String) iter.next();
1025        Out.prln(annotType + ": "
1026          + ((Double)fMeasureByType.get(annotType)).doubleValue()
1027              /
1028              ((Integer)fMeasureCountByType.get(annotType)).intValue()
1029          + "<P>");
1030      }//while
1031    }
1032
1033    Out.prln("Overall average fMeasure: " + fMeasureSum/docNumber
1034             + "<P>");
1035
1036  }
1037
1038  protected AnnotationDiff measureDocs(
1039    Document keyDoc, Document respDoc, String annotType)
1040      throws ResourceInstantiationException {
1041
1042    if (keyDoc == null || respDoc == null)
1043      return null;
1044
1045    if (annotSetName != null
1046        && keyDoc.getAnnotations(annotSetName).get(annotType) == null)
1047      return null;
1048    else if ((annotSetName == null || annotSetName.equals(""))
1049        && keyDoc.getAnnotations().get(annotType) == null)
1050      return null;
1051
1052    // create the annotation schema needed for AnnotationDiff
1053    AnnotationSchema annotationSchema = new AnnotationSchema();
1054
1055    // set annotation type
1056    annotationSchema.setAnnotationName(annotType);
1057    // create an annotation diff
1058    AnnotationDiff annotDiff = new AnnotationDiff();
1059    annotDiff.setAnnotationSchema(annotationSchema);
1060    annotDiff.setKeyDocument(keyDoc);
1061    annotDiff.setResponseDocument(respDoc);
1062    annotDiff.setKeyAnnotationSetName(annotSetName);
1063    annotDiff.setResponseAnnotationSetName(outputSetName);
1064    annotDiff.setKeyFeatureNamesSet(new HashSet());
1065    annotDiff.setTextMode(new Boolean(true));
1066    annotDiff.init();
1067
1068    return annotDiff;
1069  }
1070
1071  protected void printAnnotations(AnnotationDiff annotDiff,
1072                    Document keyDoc, Document respDoc) {
1073    Out.prln("<TD>");
1074    Out.pr("MISSING ANNOTATIONS in the automatic texts: ");
1075    Set missingSet =
1076      annotDiff.getAnnotationsOfType(AnnotationDiff.MISSING_TYPE);
1077    printAnnotations(missingSet, keyDoc);
1078    Out.prln("<BR>");
1079
1080    Out.pr("SPURIOUS ANNOTATIONS in the automatic texts: ");
1081    Set spuriousSet =
1082      annotDiff.getAnnotationsOfType(AnnotationDiff.SPURIOUS_TYPE);
1083    printAnnotations(spuriousSet, respDoc);
1084    Out.prln("</BR>");
1085
1086    Out.pr("PARTIALLY CORRECT ANNOTATIONS in the automatic texts: ");
1087    Set partialSet =
1088      annotDiff.getAnnotationsOfType(AnnotationDiff.PARTIALLY_CORRECT_TYPE);
1089    printAnnotations(partialSet, respDoc);
1090    Out.prln("</TD>");
1091
1092  }
1093
1094  protected void printAnnotations(Set set, Document doc) {
1095    if (set == null || set.isEmpty())
1096      return;
1097
1098    Iterator iter = set.iterator();
1099    while (iter.hasNext()) {
1100      Annotation ann = (Annotation) iter.next();
1101      Out.prln(
1102        "<B>" +
1103        doc.getContent().toString().substring(
1104          ann.getStartNode().getOffset().intValue(),
1105          ann.getEndNode().getOffset().intValue()) +
1106        "</B>: <I>[" + ann.getStartNode().getOffset() +
1107        "," + ann.getEndNode().getOffset() + "]</I>"
1108//        + "; features" + ann.getFeatures()
1109        );
1110    }//while
1111  }
1112
1113  /**
1114   * The directory from which we should generate/evaluate the corpus
1115   */
1116  private File startDir;
1117  private File currDir;
1118  private static List annotTypes;
1119
1120  private Controller application = null;
1121  private File applicationFile = null;
1122
1123  //collect the sum of all precisions and recalls of all docs
1124  //and the number of docs, so I can calculate the average for
1125  //the corpus at the end
1126  private double precisionSum = 0;
1127  private double recallSum = 0;
1128  private double fMeasureSum = 0;
1129  private HashMap precisionByType = new HashMap();
1130  private HashMap prCountByType = new HashMap();
1131  private HashMap recallByType = new HashMap();
1132  private HashMap recCountByType = new HashMap();
1133  private HashMap fMeasureByType = new HashMap();
1134  private HashMap fMeasureCountByType = new HashMap();
1135  private int docNumber = 0;
1136
1137  /**
1138   * If true, the corpus tool will generate the corpus, otherwise it'll
1139   * run in evaluate mode
1140   */
1141  private boolean isGenerateMode = false;
1142  private boolean isVerboseMode = false;
1143
1144  /**
1145   * If true, the corpus tool will evaluate stored against the human-marked
1146   * documents
1147   */
1148  private boolean isMarkedStored = false;
1149  private boolean isMarkedClean = false;
1150  //whether marked are in a DS, not xml
1151  private boolean isMarkedDS = false;
1152
1153  private String annotSetName = "Key";
1154  private String outputSetName = null;
1155
1156  private double threshold = 0.5;
1157  private Properties configs = new Properties();
1158
1159  /** String to print when wrong command-line args */
1160  private static String usage =
1161    "usage: CorpusBenchmarkTool [-generate|-marked_stored|-marked_clean] [-verbose] directory-name application";
1162
1163}