CorpusBenchmarkTool.java
0001 /*
0002  *  CorpusBenchmarkTool.java
0003  *
0004  *  Copyright (c) 1995-2012, The University of Sheffield. See the file
0005  *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
0006  *
0007  *  This file is part of GATE (see http://gate.ac.uk/), and is free
0008  *  software, licenced under the GNU Library General Public License,
0009  *  Version 2, June 1991 (in the distribution as file licence.html,
0010  *  and also available at http://gate.ac.uk/gate/licence.html).
0011  *
0012  *  Kalina Bontcheva, 24/Oct/2001
0013  *
0014  *  $Id: CorpusBenchmarkTool.java 17889 2014-04-21 10:39:34Z markagreenwood $
0015  */
0016 
0017 package gate.util;
0018 
0019 import gate.Annotation;
0020 import gate.AnnotationSet;
0021 import gate.Controller;
0022 import gate.Corpus;
0023 import gate.CorpusController;
0024 import gate.DataStore;
0025 import gate.Document;
0026 import gate.Factory;
0027 import gate.FeatureMap;
0028 import gate.Gate;
0029 import gate.LanguageResource;
0030 import gate.ProcessingResource;
0031 import gate.creole.ExecutionException;
0032 import gate.creole.ResourceInstantiationException;
0033 import gate.persist.PersistenceException;
0034 import gate.persist.SerialDataStore;
0035 
0036 import java.io.File;
0037 import java.io.FileInputStream;
0038 import java.io.FileWriter;
0039 import java.io.IOException;
0040 import java.io.InputStream;
0041 import java.io.Writer;
0042 import java.util.ArrayList;
0043 import java.util.Comparator;
0044 import java.util.Date;
0045 import java.util.HashMap;
0046 import java.util.HashSet;
0047 import java.util.Iterator;
0048 import java.util.List;
0049 import java.util.Map;
0050 import java.util.Properties;
0051 import java.util.Set;
0052 import java.util.StringTokenizer;
0053 import java.util.TreeSet;
0054 
0055 public class CorpusBenchmarkTool {
0056   private static final String MARKED_DIR_NAME = "marked";
0057   private static final String CLEAN_DIR_NAME = "clean";
0058   private static final String CVS_DIR_NAME = "Cvs";
0059   private static final String PROCESSED_DIR_NAME = "processed";
0060   private static final String ERROR_DIR_NAME = "err";
0061 
0062   public CorpusBenchmarkTool() {}
0063 
0064   public void initPRs() {
0065     if (applicationFile == null)
0066       throw new GateRuntimeException("Application not set!");
0067     
0068     try {
0069       Out.prln("App file is: " + applicationFile.getAbsolutePath());
0070       application = (Controllergate.util.persistence.PersistenceManager
0071                     .loadObjectFromFile(applicationFile);
0072     }
0073     catch (Exception ex) {
0074       throw new GateRuntimeException("Corpus Benchmark Tool:" + ex.getMessage(), ex);
0075     }
0076   //initPRs
0077 
0078   public void unloadPRs() {
0079     //we have nothing to unload if no PRs are loaded
0080     if (isMarkedStored)
0081       return;
0082 
0083   }
0084 
0085   public void execute() {
0086     execute(startDir);
0087     if (application != null) {
0088       javax.swing.SwingUtilities.invokeLater(new Runnable() {
0089         @Override
0090         public void run() {
0091 
0092           Iterator<ProcessingResource> iter = new ArrayList<ProcessingResource>(application.getPRs()).iterator();
0093           while (iter.hasNext())
0094             Factory.deleteResource(iter.next());
0095 
0096           Factory.deleteResource(application);
0097         }
0098       });
0099     }
0100   }
0101 
0102   public void init() {
0103     //first read the corpus_tool.properties file from the reference dir
0104     File propFile = new File(getStartDirectory(),"corpus_tool.properties");
0105     if (!propFile.exists())
0106       propFile = new File("corpus_tool.properties");    
0107     Out.prln("Loading properties from " + propFile.getAbsolutePath());
0108     if (propFile.exists()) {
0109       try {
0110         InputStream inputStream = new FileInputStream(propFile);
0111         this.configs.load(inputStream);
0112         String thresholdString = this.configs.getProperty("threshold");
0113         if (thresholdString != null && !thresholdString.equals("")) {
0114           thresholdString=thresholdString.trim();
0115           this.threshold = (new Double(thresholdString)).doubleValue();
0116           Out.prln("New threshold is: " this.threshold + "<P>\n");
0117         }
0118         String setName = this.configs.getProperty("annotSetName");
0119         if (setName != null && !setName.equals("")) {
0120           setName=setName.trim();
0121           Out.prln("Annotation set in marked docs is: " + setName + " <P>\n");
0122           this.annotSetName = setName;
0123         }
0124         setName = this.configs.getProperty("outputSetName");
0125         if (setName != null && !setName.equals("")) {
0126           setName=setName.trim();
0127           Out.prln("Annotation set in processed docs is: " + setName + " <P>\n");
0128           this.outputSetName = setName;
0129         }
0130         String encodingString = this.configs.getProperty("encoding");
0131         if (encodingString != null && !encodingString.equals("")) {
0132           encodingString=encodingString.trim();
0133           this.documentEncoding = encodingString;
0134           Out.prln("New encoding is: " this.documentEncoding + "<P>\n");
0135         }
0136         String types = this.configs.getProperty("annotTypes");
0137         if (types != null && !types.equals("")) {
0138           types=types.trim();
0139           Out.prln("Using annotation types from the properties file. <P>\n");
0140           StringTokenizer strTok = new StringTokenizer(types, ";");
0141           annotTypes = new ArrayList<String>();
0142           while (strTok.hasMoreTokens())
0143             annotTypes.add(strTok.nextToken());
0144         }
0145         else {
0146           annotTypes = new ArrayList<String>();
0147           annotTypes.add("Organization");
0148           annotTypes.add("Person");
0149           annotTypes.add("Date");
0150           annotTypes.add("Location");
0151           annotTypes.add("Address");
0152           annotTypes.add("Money");
0153           annotTypes.add("Percent");
0154           annotTypes.add("GPE");
0155           annotTypes.add("Facility");
0156         }
0157         String features = this.configs.getProperty("annotFeatures");
0158         Set<String> result = new HashSet<String>();
0159         if (features != null && !features.equals("")) {
0160           features=features.trim();
0161           Out.pr("Using annotation features from the properties file. \n");
0162           java.util.StringTokenizer tok =
0163               new java.util.StringTokenizer(features, ";");
0164           String current;
0165           while (tok.hasMoreTokens()) {
0166             current = tok.nextToken();
0167             result.add(current);
0168           // while
0169         }
0170         diffFeaturesSet = result;
0171         Out.prln("Features: " + diffFeaturesSet + " <P>\n");
0172 
0173       }
0174       catch (IOException ex) {
0175         throw new GateRuntimeException("Error loading " + propFile.getAbsolutePath(), ex);
0176       }
0177     }
0178     else {
0179       Err.prln(propFile.getAbsolutePath() " does not exist, using default settings");
0180       this.configs = new Properties();
0181     }
0182 
0183     //we only initialise the PRs if they are going to be used
0184     //for processing unprocessed documents
0185     if (!this.isMarkedStored)
0186       initPRs();
0187 
0188   }
0189 
0190   public void execute(File dir) {
0191     if (dir == null)
0192       return;
0193     //first set the current directory to be the given one
0194     currDir = dir;
0195 
0196     File processedDir = null;
0197     File cleanDir = null;
0198     File markedDir = null;
0199     File errorDir = null;
0200 
0201     List<File> subDirs = new ArrayList<File>();
0202     File[] dirArray = currDir.listFiles();
0203     if (dirArray == null)return;
0204     for (int i = 0; i < dirArray.length; i++) {
0205       if (dirArray[i].isFile() || dirArray[i].getName().equals(CVS_DIR_NAME))
0206         continue;
0207       if (dirArray[i].getName().equals(CLEAN_DIR_NAME))
0208         cleanDir = dirArray[i];
0209       else if (dirArray[i].getName().equals(MARKED_DIR_NAME))
0210         markedDir = dirArray[i];
0211       else if (dirArray[i].getName().equals(PROCESSED_DIR_NAME))
0212         processedDir = dirArray[i];
0213       else if (dirArray[i].getName().equals(ERROR_DIR_NAME))
0214         errorDir = dirArray[i];
0215       else
0216         subDirs.add(dirArray[i]);
0217     }
0218 
0219     if (cleanDir == null)return;
0220     Out.prln("Processing directory: " + currDir + "<P>");
0221 
0222     if (this.isGenerateMode)
0223       generateCorpus(cleanDir, processedDir);
0224     else
0225       evaluateCorpus(cleanDir, processedDir, markedDir, errorDir);
0226 
0227       //if no more subdirs left, return
0228     if (subDirs.isEmpty())
0229       return;
0230 
0231     //there are more subdirectories to traverse, so iterate through
0232     for (int j = 0; j < subDirs.size(); j++)
0233       execute(subDirs.get(j));
0234 
0235   //execute(dir)
0236 
0237   public static void main(String[] argsthrows GateException {
0238     Out.prln("<HTML>");
0239     Out.prln("<HEAD>");
0240     Out.prln("<TITLE> Corpus benchmark tool: ran with args ");
0241     for (int argC = 0; argC < args.length; ++argC)
0242       Out.pr(args[argC" ");
0243     Out.pr(" on " new Date() "</TITLE> </HEAD>");
0244     Out.prln("<BODY>");
0245     Out.prln("Please wait while GATE tools are initialised. <P>");
0246     // initialise GATE
0247     Gate.init();
0248 
0249     CorpusBenchmarkTool corpusTool = new CorpusBenchmarkTool();
0250 
0251     if (args.length < 1)throw new GateException(usage);
0252     int i = 0;
0253     while (i < args.length && args[i].startsWith("-")) {
0254       if (args[i].equals("-generate")) {
0255         Out.prln("Generating the corpus... <P>");
0256         corpusTool.setGenerateMode(true);
0257       }
0258       else if (args[i].equals("-marked_clean")) {
0259         Out.prln("Evaluating current grammars against human-annotated...<P>");
0260         corpusTool.setMarkedClean(true);
0261       }
0262       else if (args[i].equals("-marked_stored")) {
0263         Out.prln("Evaluating stored documents against human-annotated...<P>");
0264         corpusTool.setMarkedStored(true);
0265       }
0266       else if (args[i].equals("-marked_ds")) {
0267         Out.prln("Looking for marked docs in a datastore...<P>");
0268         corpusTool.setMarkedDS(true);
0269       }
0270       else if (args[i].equals("-verbose")) {
0271         Out.prln("Running in verbose mode. Will generate annotation " +
0272                  "information when precision/recall are lower than " +
0273                  corpusTool.getThreshold() "<P>");
0274         corpusTool.setVerboseMode(true);
0275       }
0276       else if (args[i].equals("-moreinfo")) {
0277         Out.prln("Show more details in document table...<P>");
0278         corpusTool.setMoreInfo(true);
0279       }
0280       i++; //just ignore the option, which we do not recognise
0281     //while
0282 
0283     String dirName = args[i];
0284     File dir = new File(dirName);
0285     if (!dir.isDirectory())
0286       throw new GateException(usage);
0287 
0288     //get the last argument which is the application
0289     i++;
0290     String appName = args[i];
0291     File appFile = new File(appName);
0292     if (!appFile.isFile())
0293       throw new GateException(usage);
0294     else
0295       corpusTool.setApplicationFile(appFile);
0296 
0297     corpusTool.init();
0298     corpusWordCount = 0;
0299 
0300     Out.prln("Measuring annotaitions of types: " +
0301              CorpusBenchmarkTool.annotTypes + "<P>");
0302 
0303     corpusTool.setStartDirectory(dir);
0304     corpusTool.execute();
0305     //if we're not generating the corpus, then print the precision and recall
0306     //statistics for the processed corpus
0307     if (!corpusTool.getGenerateMode())
0308       corpusTool.printStatistics();
0309 
0310     Out.prln("<BR>Overall average precision: " + corpusTool.getPrecisionAverage());
0311     Out.prln("<BR>Overall average recall: " + corpusTool.getRecallAverage());
0312     Out.prln("<BR>Overall average fMeasure: " + corpusTool.getFMeasureAverage());
0313     if (corpusWordCount == 0)
0314       Out.prln("<BR>No Token annotations to count words in the corpus.");
0315     else
0316       Out.prln("<BR>Overall word count: " + corpusWordCount);
0317 
0318     if (hasProcessed) {
0319       Out.prln("<P>Old Processed: ");
0320       Out.prln("<BR>Overall average precision: "
0321                + corpusTool.getPrecisionAverageProc());
0322       Out.prln("<BR>Overall average recall: "
0323                + corpusTool.getRecallAverageProc());
0324       Out.prln("<BR>Overall average fMeasure: "
0325                + corpusTool.getFMeasureAverageProc());
0326     }
0327     Out.prln("<BR>Finished! <P>");
0328     Out.prln("</BODY>");
0329     Out.prln("</HTML>");
0330 
0331     System.exit(0);
0332 
0333   //main
0334 
0335   public void setGenerateMode(boolean mode) {
0336     isGenerateMode = mode;
0337   //setGenerateMode
0338 
0339   public boolean getGenerateMode() {
0340     return isGenerateMode;
0341   //getGenerateMode
0342 
0343   public boolean getVerboseMode() {
0344     return isVerboseMode;
0345   //getVerboseMode
0346 
0347   public void setVerboseMode(boolean mode) {
0348     isVerboseMode = mode;
0349   //setVerboseMode
0350 
0351   public void setMoreInfo(boolean mode) {
0352     isMoreInfoMode = mode;
0353   // setMoreInfo
0354 
0355   public boolean getMoreInfo() {
0356     return isMoreInfoMode;
0357   // getMoreInfo
0358 
0359   public void setDiffFeaturesList(Set<String> features) {
0360     diffFeaturesSet = features;
0361   // setDiffFeaturesList
0362 
0363   public Set<String> getDiffFeaturesList() {
0364     return diffFeaturesSet;
0365   // getDiffFeaturesList
0366 
0367   public void setMarkedStored(boolean mode) {
0368     isMarkedStored = mode;
0369   // setMarkedStored
0370 
0371   public boolean getMarkedStored() {
0372     return isMarkedStored;
0373   // getMarkedStored
0374 
0375   public void setMarkedClean(boolean mode) {
0376     isMarkedClean = mode;
0377   //
0378 
0379   public boolean getMarkedClean() {
0380     return isMarkedClean;
0381   //
0382 
0383   public void setMarkedDS(boolean mode) {
0384     isMarkedDS = mode;
0385   //
0386 
0387   public boolean getMarkedDS() {
0388     return isMarkedDS;
0389   //
0390 
0391   public void setApplicationFile(File newAppFile) {
0392     applicationFile = newAppFile;
0393   }
0394 
0395   /**
0396    * Returns the average precision over the entire set of processed documents.
0397    <P>
0398    * If the tool has been evaluating the original documents against the
0399    * previously-stored automatically annotated ones, then the precision
0400    * will be the average precision on those two sets. <P>
0401    * If the tool was run in -marked mode, i.e., was evaluating the stored
0402    * automatically processed ones against the human-annotated ones, then
0403    * the precision will be the average precision on those two sets of documents.
0404    */
0405   public double getPrecisionAverage() {
0406     return precisionSum / docNumber;
0407   }
0408 
0409   /**
0410    * Returns the average recall over the entire set of processed documents.
0411    <P>
0412    * If the tool has been evaluating the original documents against the
0413    * previously-stored automatically annotated ones, then the recall
0414    * will be the average recall on those two sets. <P>
0415    * If the tool was run in -marked mode, i.e., was evaluating the stored
0416    * automatically processed ones against the human-annotated ones, then
0417    * the recall will be the average recall on those two sets of documents.
0418    */
0419   public double getRecallAverage() {
0420     return recallSum / docNumber;
0421   }
0422 
0423   public double getFMeasureAverage() {
0424     return fMeasureSum / docNumber;
0425   }
0426 
0427   /** For processed documents */
0428   public double getPrecisionAverageProc() {
0429     return proc_precisionSum / docNumber;
0430   }
0431 
0432   public double getRecallAverageProc() {
0433     return proc_recallSum / docNumber;
0434   }
0435 
0436   public double getFMeasureAverageProc() {
0437     return proc_fMeasureSum / docNumber;
0438   }
0439 
0440   public boolean isGenerateMode() {
0441     return isGenerateMode == true;
0442   //isGenerateMode
0443 
0444   public double getThreshold() {
0445     return threshold;
0446   }
0447 
0448   public void setThreshold(double newValue) {
0449     threshold = newValue;
0450   }
0451 
0452   public File getStartDirectory() {
0453     return startDir;
0454   //getStartDirectory
0455 
0456   public void setStartDirectory(File dir) {
0457     startDir = dir;
0458   //setStartDirectory
0459 
0460   protected void generateCorpus(File fileDir, File outputDir) {
0461     //1. check if we have input files
0462     if (fileDir == null)
0463       return;
0464     //2. create the output directory or clean it up if needed
0465     File outDir = outputDir;
0466     if (outputDir == null) {
0467       outDir = new File(currDir, PROCESSED_DIR_NAME);
0468     }
0469     else {
0470       // get rid of the directory, coz datastore wants it clean
0471       if (!Files.rmdir(outDir))
0472         Out.prln("cannot delete old output directory: " + outDir);
0473     }
0474     outDir.mkdir();
0475 
0476     //create the datastore and process each document
0477     try {
0478       SerialDataStore sds = new SerialDataStore(outDir.toURI().toURL().toString());
0479       sds.create();
0480       sds.open();
0481 
0482       File[] files = fileDir.listFiles();
0483       for (int i = 0; i < files.length; i++) {
0484         if (!files[i].isFile())
0485           continue;
0486         // create a document
0487         Out.prln("Processing and storing document: " + files[i].toURI().toURL() "<P>");
0488 
0489         FeatureMap params = Factory.newFeatureMap();
0490         params.put(Document.DOCUMENT_URL_PARAMETER_NAME, files[i].toURI().toURL());
0491         params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
0492 
0493         FeatureMap features = Factory.newFeatureMap();
0494 //        Gate.setHiddenAttribute(features, true);
0495 
0496         // create the document
0497         final Document doc = (DocumentFactory.createResource(
0498             "gate.corpora.DocumentImpl", params, features
0499             );
0500 
0501         doc.setName(files[i].getName());
0502         
0503         processDocument(doc);
0504         final LanguageResource lr = sds.adopt(doc);
0505         sds.sync(lr);
0506         javax.swing.SwingUtilities.invokeLater(new Runnable() {
0507           @Override
0508           public void run() {
0509             Factory.deleteResource(doc);
0510             Factory.deleteResource(lr);
0511           }
0512         });
0513       //for
0514       sds.close();
0515     }
0516     catch (java.net.MalformedURLException ex) {
0517       throw (GateRuntimeException)
0518         new GateRuntimeException("CorpusBenchmark: " + ex.getMessage())
0519         .initCause(ex);
0520     }
0521     catch (PersistenceException ex1) {
0522       throw (GateRuntimeException)
0523         new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage())
0524         .initCause(ex1);
0525     }
0526     catch (ResourceInstantiationException ex2) {
0527       throw (GateRuntimeException)
0528         new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage())
0529         .initCause(ex2);
0530     }
0531   //generateCorpus
0532 
0533   protected void evaluateCorpus(File fileDir,
0534                                 File processedDir, File markedDir,
0535                                 File errorDir) {
0536     //1. check if we have input files and the processed Dir
0537     if (fileDir == null || !fileDir.exists())
0538       return;
0539     if (processedDir == null || !processedDir.exists())
0540 
0541       //if the user wants evaluation of marked and stored that's not possible
0542       if (isMarkedStored) {
0543         Out.prln("Cannot evaluate because no processed documents exist.");
0544         return;
0545       }
0546       else
0547         isMarkedClean = true;
0548 
0549         // create the error directory or clean it up if needed
0550     File errDir = null;
0551     if (isMoreInfoMode) {
0552       errDir = errorDir;
0553       if (errDir == null) {
0554         errDir = new File(currDir, ERROR_DIR_NAME);
0555       }
0556       else {
0557         // get rid of the directory, coz we wants it clean
0558         if (!Files.rmdir(errDir))
0559           Out.prln("cannot delete old error directory: " + errDir);
0560       }
0561       Out.prln("Create error directory: " + errDir + "<BR><BR>");
0562       errDir.mkdir();
0563     }
0564 
0565     //looked for marked texts only if the directory exists
0566     boolean processMarked = markedDir != null && markedDir.exists();
0567     if (!processMarked && (isMarkedStored || isMarkedClean)) {
0568       Out.prln("Cannot evaluate because no human-annotated documents exist.");
0569       return;
0570     }
0571 
0572     if (isMarkedStored) {
0573       evaluateMarkedStored(markedDir, processedDir, errDir);
0574       return;
0575     }
0576     else if (isMarkedClean) {
0577       evaluateMarkedClean(markedDir, fileDir, errDir);
0578       return;
0579     }
0580 
0581     Document persDoc = null;
0582     Document cleanDoc = null;
0583     Document markedDoc = null;
0584 
0585     //open the datastore and process each document
0586     try {
0587       //open the data store
0588       DataStore sds = Factory.openDataStore
0589                       ("gate.persist.SerialDataStore",
0590                        processedDir.toURI().toURL().toExternalForm());
0591 
0592       List<String> lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
0593       for (int i = 0; i < lrIDs.size(); i++) {
0594         String docID = lrIDs.get(i);
0595 
0596         //read the stored document
0597         FeatureMap features = Factory.newFeatureMap();
0598         features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
0599         features.put(DataStore.LR_ID_FEATURE_NAME, docID);
0600         FeatureMap hparams = Factory.newFeatureMap();
0601 //        Gate.setHiddenAttribute(hparams, true);
0602 
0603         persDoc = (DocumentFactory.createResource(
0604             "gate.corpora.DocumentImpl",
0605             features, hparams);
0606 
0607         if (isMoreInfoMode) {
0608           StringBuffer errName = new StringBuffer(persDoc.getName());
0609           errName.replace(
0610               persDoc.getName().lastIndexOf("."),
0611               persDoc.getName().length(),
0612               ".err");
0613           Out.prln("<H2>" +
0614                    "<a href=\"err/" + errName.toString() "\">"
0615                    + persDoc.getName() "</a>" "</H2>");
0616         }
0617         else
0618           Out.prln("<H2>" + persDoc.getName() "</H2>");
0619 
0620         File cleanDocFile = new File(fileDir, persDoc.getName());
0621         //try reading the original document from clean
0622         if (!cleanDocFile.exists()) {
0623           Out.prln("Warning: Cannot find original document " +
0624                    persDoc.getName() " in " + fileDir);
0625         }
0626         else {
0627           FeatureMap params = Factory.newFeatureMap();
0628           params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocFile.toURI().toURL());
0629           params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME,
0630                      documentEncoding);
0631 
0632           // create the document
0633           cleanDoc = (DocumentFactory.createResource(
0634               "gate.corpora.DocumentImpl", params, hparams);
0635           cleanDoc.setName(persDoc.getName());
0636         }
0637 
0638         //try finding the marked document
0639         StringBuffer docName = new StringBuffer(persDoc.getName());
0640         if (!isMarkedDS) {
0641           docName.replace(
0642               persDoc.getName().lastIndexOf("."),
0643               docName.length(),
0644               ".xml");
0645           File markedDocFile = new File(markedDir, docName.toString());
0646           if (!processMarked || !markedDocFile.exists()) {
0647             Out.prln("Warning: Cannot find human-annotated document " +
0648                      markedDocFile + " in " + markedDir);
0649           }
0650           else {
0651             FeatureMap params = Factory.newFeatureMap();
0652             params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
0653                        markedDocFile.toURI().toURL());
0654             params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME,
0655                        documentEncoding);
0656 
0657             // create the document
0658             markedDoc = (DocumentFactory.createResource(
0659                 "gate.corpora.DocumentImpl", params, hparams);
0660             markedDoc.setName(persDoc.getName());
0661           }
0662         }
0663         else {
0664           //open marked from a DS
0665           //open the data store
0666           DataStore sds1 = Factory.openDataStore
0667                            ("gate.persist.SerialDataStore",
0668                             markedDir.toURI().toURL().toExternalForm());
0669 
0670           List<String> lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
0671           boolean found = false;
0672           int k = 0;
0673           //search for the marked doc with the same name
0674           while (k < lrIDs1.size() && !found) {
0675             String docID1 = lrIDs1.get(k);
0676 
0677             //read the stored document
0678             FeatureMap features1 = Factory.newFeatureMap();
0679             features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
0680             features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
0681             Document tempDoc = (DocumentFactory.createResource(
0682                 "gate.corpora.DocumentImpl",
0683                 features1, hparams);
0684             //check whether this is our doc
0685             if ( ( (StringtempDoc.getFeatures().get("gate.SourceURL")).
0686                 endsWith(persDoc.getName())) {
0687               found = true;
0688               markedDoc = tempDoc;
0689             }
0690             else k++;
0691           }
0692         }
0693 
0694         evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
0695 
0696         if (persDoc != null) {
0697           final gate.Document pd = persDoc;
0698           javax.swing.SwingUtilities.invokeLater(new Runnable() {
0699             @Override
0700             public void run() {
0701               Factory.deleteResource(pd);
0702             }
0703           });
0704         }
0705         if (cleanDoc != null) {
0706           final gate.Document cd = cleanDoc;
0707           javax.swing.SwingUtilities.invokeLater(new Runnable() {
0708             @Override
0709             public void run() {
0710               Factory.deleteResource(cd);
0711             }
0712           });
0713         }
0714         if (markedDoc != null) {
0715           final gate.Document md = markedDoc;
0716           javax.swing.SwingUtilities.invokeLater(new Runnable() {
0717             @Override
0718             public void run() {
0719               Factory.deleteResource(md);
0720             }
0721           });
0722         }
0723 
0724       //for loop through saved docs
0725       sds.close();
0726     }
0727     catch (java.net.MalformedURLException ex) {
0728       throw (GateRuntimeException)
0729         new GateRuntimeException("CorpusBenchmark: " + ex.getMessage())
0730         .initCause(ex);
0731     }
0732     catch (PersistenceException ex1) {
0733       throw (GateRuntimeException)
0734         new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage())
0735         .initCause(ex1);
0736     }
0737     catch (ResourceInstantiationException ex2) {
0738       throw (GateRuntimeException)
0739         new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage())
0740         .initCause(ex2);
0741     }
0742 
0743   //evaluateCorpus
0744 
0745   protected void evaluateMarkedStored(File markedDir, File storedDir,
0746                                       File errDir) {
0747     Document persDoc = null;
0748     Document cleanDoc = null;
0749     Document markedDoc = null;
0750 
0751     //open the datastore and process each document
0752     try {
0753       //open the data store
0754       DataStore sds = Factory.openDataStore
0755                       ("gate.persist.SerialDataStore",
0756                        storedDir.toURI().toURL().toExternalForm());
0757 
0758       List<String> lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
0759       for (int i = 0; i < lrIDs.size(); i++) {
0760         String docID = lrIDs.get(i);
0761 
0762         //read the stored document
0763         FeatureMap features = Factory.newFeatureMap();
0764         features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
0765         features.put(DataStore.LR_ID_FEATURE_NAME, docID);
0766 
0767         FeatureMap hparams = Factory.newFeatureMap();
0768 //        Gate.setHiddenAttribute(hparams, true);
0769 
0770         persDoc = (DocumentFactory.createResource(
0771             "gate.corpora.DocumentImpl",
0772             features, hparams);
0773 
0774         if (isMoreInfoMode) {
0775           StringBuffer errName = new StringBuffer(persDoc.getName());
0776           errName.replace(
0777               persDoc.getName().lastIndexOf("."),
0778               persDoc.getName().length(),
0779               ".err");
0780           Out.prln("<H2>" +
0781                    "<a href=\"err/" + errName.toString() "\">"
0782                    + persDoc.getName() "</a>" "</H2>");
0783         }
0784         else
0785           Out.prln("<H2>" + persDoc.getName() "</H2>");
0786 
0787         if (!this.isMarkedDS) { //try finding the marked document as file
0788           StringBuffer docName = new StringBuffer(persDoc.getName());
0789           docName.replace(
0790               persDoc.getName().lastIndexOf("."),
0791               docName.length(),
0792               ".xml");
0793           File markedDocFile = new File(markedDir, docName.toString());
0794           if (!markedDocFile.exists()) {
0795             Out.prln("Warning: Cannot find human-annotated document " +
0796                      markedDocFile + " in " + markedDir);
0797           }
0798           else {
0799             FeatureMap params = Factory.newFeatureMap();
0800             params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
0801                        markedDocFile.toURI().toURL());
0802             params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME,
0803                        documentEncoding);
0804 
0805             // create the document
0806             markedDoc = (DocumentFactory.createResource(
0807                 "gate.corpora.DocumentImpl", params, hparams);
0808             markedDoc.setName(persDoc.getName());
0809           //find marked as file
0810         }
0811         else {
0812           try {
0813             //open marked from a DS
0814             //open the data store
0815             DataStore sds1 = Factory.openDataStore
0816                              ("gate.persist.SerialDataStore",
0817                               markedDir.toURI().toURL().toExternalForm());
0818 
0819             List<String> lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
0820             boolean found = false;
0821             int k = 0;
0822             //search for the marked doc with the same name
0823             while (k < lrIDs1.size() && !found) {
0824               String docID1 = lrIDs1.get(k);
0825 
0826               //read the stored document
0827               FeatureMap features1 = Factory.newFeatureMap();
0828               features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
0829               features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
0830               Document tempDoc = (DocumentFactory.createResource(
0831                   "gate.corpora.DocumentImpl",
0832                   features1, hparams);
0833               //check whether this is our doc
0834               if ( ( (StringtempDoc.getFeatures().get("gate.SourceURL")).
0835                   endsWith(persDoc.getName())) {
0836                 found = true;
0837                 markedDoc = tempDoc;
0838               }
0839               else k++;
0840             }
0841           }
0842           catch (java.net.MalformedURLException ex) {
0843             Out.prln("Error finding marked directory " +
0844                      markedDir.getAbsolutePath());
0845           }
0846           catch (gate.persist.PersistenceException ex1) {
0847             Out.prln(
0848                 "Error opening marked as a datastore (-marked_ds specified)");
0849           }
0850           catch (gate.creole.ResourceInstantiationException ex2) {
0851             Out.prln(
0852                 "Error opening marked as a datastore (-marked_ds specified)");
0853           }
0854         }
0855 
0856         evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
0857         if (persDoc != null) {
0858           final gate.Document pd = persDoc;
0859           javax.swing.SwingUtilities.invokeLater(new Runnable() {
0860             @Override
0861             public void run() {
0862               Factory.deleteResource(pd);
0863             }
0864           });
0865         }
0866         if (markedDoc != null) {
0867           final gate.Document md = markedDoc;
0868           javax.swing.SwingUtilities.invokeLater(new Runnable() {
0869             @Override
0870             public void run() {
0871               Factory.deleteResource(md);
0872             }
0873           });
0874         }
0875 
0876       //for loop through saved docs
0877       sds.close();
0878 
0879     }
0880     catch (java.net.MalformedURLException ex) {
0881       throw (GateRuntimeException)
0882         new GateRuntimeException("CorpusBenchmark: " + ex.getMessage())
0883         .initCause(ex);
0884     }
0885     catch (PersistenceException ex1) {
0886       throw (GateRuntimeException)
0887         new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage())
0888         .initCause(ex1);
0889     }
0890     catch (ResourceInstantiationException ex2) {
0891       throw (GateRuntimeException)
0892         new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage())
0893         .initCause(ex2);
0894     }
0895 
0896   //evaluateMarkedStored
0897 
0898   protected void evaluateMarkedClean(File markedDir, File cleanDir, File errDir) {
0899     Document persDoc = null;
0900     Document cleanDoc = null;
0901     Document markedDoc = null;
0902 
0903     File[] cleanDocs = cleanDir.listFiles();
0904     for (int i = 0; i < cleanDocs.length; i++) {
0905       if (!cleanDocs[i].isFile())
0906         continue;
0907 
0908       //try reading the original document from clean
0909       FeatureMap params = Factory.newFeatureMap();
0910       try {
0911         params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocs[i].toURI().toURL());
0912       }
0913       catch (java.net.MalformedURLException ex) {
0914         Out.prln("Cannot create document from file: " +
0915                  cleanDocs[i].getAbsolutePath());
0916         continue;
0917       }
0918       //params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
0919       params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
0920 
0921       FeatureMap hparams = Factory.newFeatureMap();
0922 //      Gate.setHiddenAttribute(hparams, true);
0923 
0924       // create the document
0925       try {
0926         cleanDoc = (DocumentFactory.createResource(
0927             "gate.corpora.DocumentImpl", params, hparams, cleanDocs[i].getName());
0928       }
0929       catch (gate.creole.ResourceInstantiationException ex) {
0930         Out.prln("Cannot create document from file: " +
0931                  cleanDocs[i].getAbsolutePath());
0932         continue;
0933       }
0934 
0935       if (isMoreInfoMode) {
0936         StringBuffer errName = new StringBuffer(cleanDocs[i].getName());
0937         errName.replace(
0938             cleanDocs[i].getName().lastIndexOf("."),
0939             cleanDocs[i].getName().length(),
0940             ".err");
0941         Out.prln("<H2>" +
0942                  "<a href=\"err/" + errName.toString() "\">"
0943                  + cleanDocs[i].getName() "</a>" "</H2>");
0944       }
0945       else
0946         Out.prln("<H2>" + cleanDocs[i].getName() "</H2>");
0947 
0948         //try finding the marked document
0949       if (!isMarkedDS) {
0950         StringBuffer docName = new StringBuffer(cleanDoc.getName());
0951         docName.replace(
0952             cleanDoc.getName().lastIndexOf("."),
0953             docName.length(),
0954             ".xml");
0955         File markedDocFile = new File(markedDir, docName.toString());
0956         if (!markedDocFile.exists()) {
0957           Out.prln("Warning: Cannot find human-annotated document " +
0958                    markedDocFile + " in " + markedDir);
0959           continue;
0960         }
0961         else {
0962           params = Factory.newFeatureMap();
0963           try {
0964             params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
0965                        markedDocFile.toURI().toURL());
0966           }
0967           catch (java.net.MalformedURLException ex) {
0968             Out.prln("Cannot create document from file: " +
0969                      markedDocFile.getAbsolutePath());
0970             continue;
0971           }
0972           //params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
0973           params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
0974 
0975           // create the document
0976           try {
0977             markedDoc = (DocumentFactory.createResource(
0978                 "gate.corpora.DocumentImpl", params,
0979                 hparams, cleanDoc.getName());
0980           }
0981           catch (gate.creole.ResourceInstantiationException ex) {
0982             Out.prln("Cannot create document from file: " +
0983                      markedDocFile.getAbsolutePath());
0984             continue;
0985           }
0986 
0987         //if markedDoc exists
0988       }
0989       else {
0990         try {
0991           //open marked from a DS
0992           //open the data store
0993           DataStore sds1 = Factory.openDataStore
0994                            ("gate.persist.SerialDataStore",
0995                             markedDir.toURI().toURL().toExternalForm());
0996 
0997           List<String> lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
0998           boolean found = false;
0999           int k = 0;
1000           //search for the marked doc with the same name
1001           while (k < lrIDs1.size() && !found) {
1002             String docID1 = lrIDs1.get(k);
1003 
1004             //read the stored document
1005             FeatureMap features1 = Factory.newFeatureMap();
1006             features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
1007             features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
1008             Document tempDoc = (DocumentFactory.createResource(
1009                 "gate.corpora.DocumentImpl",
1010                 features1, hparams);
1011             //check whether this is our doc
1012             if ( ( (StringtempDoc.getFeatures().get("gate.SourceURL")).
1013                 endsWith(cleanDoc.getName())) {
1014               found = true;
1015               markedDoc = tempDoc;
1016             }
1017             else k++;
1018           }
1019         }
1020         catch (java.net.MalformedURLException ex) {
1021           Out.prln("Error finding marked directory " +
1022                    markedDir.getAbsolutePath());
1023         }
1024         catch (gate.persist.PersistenceException ex1) {
1025           Out.prln("Error opening marked as a datastore (-marked_ds specified)");
1026         }
1027         catch (gate.creole.ResourceInstantiationException ex2) {
1028           Out.prln("Error opening marked as a datastore (-marked_ds specified)");
1029         }
1030       //if using a DS for marked
1031 
1032       try {
1033         evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
1034       }
1035       catch (gate.creole.ResourceInstantiationException ex) {
1036         ex.printStackTrace();
1037         Out.prln("Evaluate failed on document: " + cleanDoc.getName());
1038       }
1039       if (persDoc != null) {
1040         final gate.Document pd = persDoc;
1041         javax.swing.SwingUtilities.invokeLater(new Runnable() {
1042           @Override
1043           public void run() {
1044             Factory.deleteResource(pd);
1045           }
1046         });
1047       }
1048       if (cleanDoc != null) {
1049         final gate.Document cd = cleanDoc;
1050         javax.swing.SwingUtilities.invokeLater(new Runnable() {
1051           @Override
1052           public void run() {
1053             Factory.deleteResource(cd);
1054           }
1055         });
1056       }
1057       if (markedDoc != null) {
1058         final gate.Document md = markedDoc;
1059         javax.swing.SwingUtilities.invokeLater(new Runnable() {
1060           @Override
1061           public void run() {
1062             Factory.deleteResource(md);
1063           }
1064         });
1065       }
1066 
1067     //for loop through clean docs
1068 
1069   //evaluateMarkedClean
1070 
1071   protected void processDocument(Document doc) {
1072     try {
1073       if (application instanceof CorpusController) {
1074         Corpus tempCorpus = Factory.newCorpus("temp");
1075         tempCorpus.add(doc);
1076         ( (CorpusControllerapplication).setCorpus(tempCorpus);
1077         application.execute();
1078         Factory.deleteResource(tempCorpus);
1079         tempCorpus = null;
1080       }
1081       else {
1082         Iterator<ProcessingResource> iter = application.getPRs().iterator();
1083         while (iter.hasNext())
1084           iter.next().setParameterValue("document", doc);
1085         application.execute();
1086       }
1087     }
1088     catch (ResourceInstantiationException ex) {
1089       throw (RuntimeException)
1090         new RuntimeException("Error executing application: "
1091                                  + ex.getMessage())
1092         .initCause(ex);
1093     }
1094     catch (ExecutionException ex) {
1095       throw (RuntimeException)
1096         new RuntimeException("Error executing application: "
1097                                  + ex.getMessage())
1098         .initCause(ex);
1099     }
1100   }
1101 
1102   protected void evaluateDocuments(Document persDoc,
1103                                    Document cleanDoc, Document markedDoc,
1104                                    File errDirthrows
1105       ResourceInstantiationException {
1106     if (cleanDoc == null && markedDoc == null)
1107       return;
1108 
1109     //we've got no types to compare
1110     if (annotTypes == null || annotTypes.isEmpty())
1111       return;
1112 
1113     if (cleanDoc != null && !isMarkedStored) {
1114 
1115       processDocument(cleanDoc);
1116 
1117       int wordCount = countWords(cleanDoc);
1118       if (wordCount == 0)
1119         Out.prln("<BR>No Token annotations to count words in the document.");
1120       else
1121         Out.prln("<BR>Word count: " + wordCount);
1122       corpusWordCount += wordCount;
1123 
1124       if (!isMarkedClean)
1125         evaluateAllThree(persDoc, cleanDoc, markedDoc, errDir);
1126       else
1127         evaluateTwoDocs(markedDoc, cleanDoc, errDir);
1128 
1129     }
1130     else
1131       evaluateTwoDocs(markedDoc, persDoc, errDir);
1132 
1133   }
1134 
1135   /**
1136    * Count all Token.kind=word annotations in the document
1137    */
1138   protected int countWords(Document annotDoc) {
1139     int count = 0;
1140 
1141     if (annotDoc == null)return 0;
1142     // check for Token in outputSetName
1143     AnnotationSet tokens = annotDoc.getAnnotations(outputSetName).get("Token");
1144     if (tokens == null)return 0;
1145 
1146     Iterator<Annotation> it = tokens.iterator();
1147     Annotation currAnnotation;
1148     while (it.hasNext()) {
1149       currAnnotation = it.next();
1150       Object feature = currAnnotation.getFeatures().get("kind");
1151       if (feature != null && "word".equalsIgnoreCase( (Stringfeature))++count;
1152     // while
1153 
1154     return count;
1155   }
1156 
1157   protected void evaluateAllThree(Document persDoc,
1158                                   Document cleanDoc, Document markedDoc,
1159                                   File errDirthrows
1160       ResourceInstantiationException {
1161     //first start the table and its header
1162     printTableHeader();
1163 
1164     // store annotation diff in .err file
1165     Writer errWriter = null;
1166     if (isMoreInfoMode && errDir != null) {
1167       StringBuffer docName = new StringBuffer(cleanDoc.getName());
1168       docName.replace(
1169           cleanDoc.getName().lastIndexOf("."),
1170           docName.length(),
1171           ".err");
1172       File errFile = new File(errDir, docName.toString());
1173       //String encoding = ( (gate.corpora.DocumentImpl) cleanDoc).getEncoding();
1174       try {
1175         errWriter = new FileWriter(errFile, false);
1176         /*
1177                  if(encoding == null) {
1178           errWriter = new OutputStreamWriter(
1179               new FileOutputStream(errFile, false));
1180                  } else {
1181           errWriter = new OutputStreamWriter(
1182               new FileOutputStream(errFile, false), encoding);
1183                  }*/
1184       }
1185       catch (Exception ex) {
1186         Out.prln("Exception when creating the error file " + errFile + ": "
1187                  + ex.getMessage());
1188         errWriter = null;
1189       }
1190     }
1191 
1192     for (int jj = 0; jj < annotTypes.size(); jj++) {
1193       String annotType = annotTypes.get(jj);
1194 
1195       AnnotationDiffer annotDiffer = measureDocs(markedDoc, cleanDoc, annotType);
1196       //we don't have this annotation type in this document
1197       if (annotDiffer == null)
1198         continue;
1199 
1200       //increase the number of processed documents
1201       docNumber++;
1202       //add precison and recall to the sums
1203       updateStatistics(annotDiffer, annotType);
1204 
1205       AnnotationDiffer annotDiffer1 =
1206           measureDocs(markedDoc, persDoc, annotType);
1207 
1208       Out.prln("<TR>");
1209 
1210       if (isMoreInfoMode && annotDiffer1 != null
1211           &&
1212           (annotDiffer1.getPrecisionAverage() != annotDiffer.getPrecisionAverage()
1213            || annotDiffer1.getRecallAverage() != annotDiffer.getRecallAverage())
1214           )
1215         Out.prln("<TD> " + annotType + "_new" "</TD>");
1216       else
1217         Out.prln("<TD> " + annotType + "</TD>");
1218 
1219       if (isMoreInfoMode) {
1220         if (annotDiffer1 != nullupdateStatisticsProc(annotDiffer1, annotType);
1221 
1222         Out.prln("<TD>" + annotDiffer.getCorrectMatches() "</TD>");
1223         Out.prln("<TD>" + annotDiffer.getPartiallyCorrectMatches() "</TD>");
1224         Out.prln("<TD>" + annotDiffer.getMissing() "</TD>");
1225         Out.prln("<TD>" + annotDiffer.getSpurious() "</TD>");
1226       }
1227 
1228       Out.prln("<TD>");
1229 
1230       //check the precision first
1231       if (annotDiffer1 != null) {
1232 
1233         if (annotDiffer1.getPrecisionAverage()
1234             < annotDiffer.getPrecisionAverage()) {
1235           Out.prln("<P><Font color=blue> ");
1236           Out.prln(annotDiffer.getPrecisionAverage());
1237 
1238           if (!isMoreInfoMode) {
1239             Out.pr("<BR>Precision increase on human-marked from ");
1240             Out.pr(annotDiffer1.getPrecisionAverage() " to ");
1241             Out.prln(annotDiffer.getPrecisionAverage());
1242           }
1243           Out.prln(" </Font></P>");
1244         }
1245         else if (annotDiffer1.getPrecisionAverage()
1246                  > annotDiffer.getPrecisionAverage()) {
1247           Out.prln("<P><Font color=red> ");
1248           Out.prln(annotDiffer.getPrecisionAverage());
1249 
1250           if (!isMoreInfoMode) {
1251             Out.pr("<BR>Precision decrease on human-marked from ");
1252             Out.pr(annotDiffer1.getPrecisionAverage() " to ");
1253             Out.prln(annotDiffer.getPrecisionAverage());
1254           }
1255           Out.prln(" </Font></P>");
1256         }
1257         else
1258           Out.prln("<P> " + annotDiffer.getPrecisionAverage() +
1259                    " </P>");
1260       }
1261       else
1262         Out.prln("<P> " + annotDiffer.getPrecisionAverage() " </P>");
1263 
1264       Out.prln("</TD>");
1265 
1266       Out.prln("<TD>");
1267 
1268       //check the recall now
1269       if (annotDiffer1 != null) {
1270 
1271         if (annotDiffer1.getRecallAverage() < annotDiffer.getRecallAverage()) {
1272           Out.prln("<P><Font color=blue> ");
1273           Out.prln(annotDiffer.getRecallAverage());
1274 
1275           if (!isMoreInfoMode) {
1276             Out.pr("<BR>Recall increase on human-marked from ");
1277             Out.pr(annotDiffer1.getRecallAverage() " to ");
1278             Out.prln(annotDiffer.getRecallAverage());
1279           }
1280           Out.prln(" </Font></P>");
1281         }
1282         else if (annotDiffer1.getRecallAverage() > annotDiffer.getRecallAverage()) {
1283           Out.prln("<P><Font color=red> ");
1284           Out.prln(annotDiffer.getRecallAverage());
1285 
1286           if (!isMoreInfoMode) {
1287             Out.pr("<BR>Recall decrease on human-marked from ");
1288             Out.pr(annotDiffer1.getRecallAverage() " to ");
1289             Out.prln(annotDiffer.getRecallAverage());
1290           }
1291           Out.prln(" </Font></P>");
1292         }
1293         else
1294           Out.prln("<P> " + annotDiffer.getRecallAverage() " </P>");
1295       }
1296       else
1297         Out.prln("<P> " + annotDiffer.getRecallAverage() " </P>");
1298 
1299       Out.prln("</TD>");
1300 
1301       //check the recall now
1302       if (isVerboseMode) {
1303         Out.prln("<TD>");
1304         if (annotDiffer.getRecallAverage() < threshold
1305             || annotDiffer.getPrecisionAverage() < threshold) {
1306           printAnnotations(annotDiffer, markedDoc, cleanDoc);
1307         }
1308         else {
1309           Out.prln("&nbsp;");
1310         }
1311         Out.prln("</TD>");
1312       }
1313 
1314       Out.prln("</TR>");
1315 
1316       // show one more table line for processed document
1317       if (isMoreInfoMode && annotDiffer1 != null
1318           &&
1319           (annotDiffer1.getPrecisionAverage() != annotDiffer.getPrecisionAverage()
1320            || annotDiffer1.getRecallAverage() != annotDiffer.getRecallAverage())
1321           ) {
1322 
1323         Out.prln("<TR>");
1324         Out.prln("<TD> " + annotType + "_old" "</TD>");
1325 
1326         Out.prln("<TD>" + annotDiffer1.getCorrectMatches() "</TD>");
1327         Out.prln("<TD>" + annotDiffer1.getPartiallyCorrectMatches() "</TD>");
1328         Out.prln("<TD>" + annotDiffer1.getMissing() "</TD>");
1329         Out.prln("<TD>" + annotDiffer1.getSpurious() "</TD>");
1330 
1331         Out.prln("<TD>");
1332         if (annotDiffer1.getPrecisionAverage() <
1333             annotDiffer.getPrecisionAverage())
1334 
1335           Out.prln("<P><Font color=blue> " + annotDiffer1.getPrecisionAverage()
1336                    "</Font></P>");
1337         else if (annotDiffer1.getPrecisionAverage() >
1338                  annotDiffer.getPrecisionAverage())
1339           Out.prln(
1340               "<P><Font color=red> " + annotDiffer1.getPrecisionAverage()
1341               " </Font></P>");
1342         else
1343           Out.prln(annotDiffer1.getPrecisionAverage());
1344 
1345         Out.prln("</TD>");
1346 
1347         Out.prln("<TD>");
1348         if (annotDiffer1.getRecallAverage() < annotDiffer.getRecallAverage())
1349           Out.prln("<P><Font color=blue> " + annotDiffer1.getRecallAverage()
1350                    " </Font></P>");
1351         else if (annotDiffer1.getRecallAverage() > annotDiffer.getRecallAverage())
1352           Out.prln("<P><Font color=red> " + annotDiffer1.getRecallAverage()
1353                    " </Font></P>");
1354         else
1355           Out.prln(annotDiffer1.getRecallAverage());
1356 
1357         Out.prln("</TD>");
1358 
1359         //check the recall now
1360         if (isVerboseMode) {
1361           // create error file and start writing
1362 
1363           Out.prln("<TD>");
1364         if (annotDiffer.getRecallAverage() < threshold
1365             || annotDiffer.getPrecisionAverage() < threshold) {
1366             printAnnotations(annotDiffer, markedDoc, cleanDoc);
1367           }
1368           else {
1369             Out.prln("&nbsp;");
1370           }
1371           Out.prln("</TD>");
1372         }
1373         Out.prln("</TR>");
1374       // if(isMoreInfoMode && annotDiff1 != null)
1375 
1376       if (isMoreInfoMode && errDir != null)
1377         storeAnnotations(annotType, annotDiffer, markedDoc, cleanDoc, errWriter);
1378     //for loop through annotation types
1379     Out.prln("</TABLE>");
1380 
1381     try {
1382       if (errWriter != null)
1383         errWriter.close();
1384     }
1385     catch (Exception ex) {
1386       Out.prln("Exception on close of error file " + errWriter + ": "
1387                + ex.getMessage());
1388     }
1389   //evaluateAllThree
1390 
1391   protected void evaluateTwoDocs(Document keyDoc, Document respDoc,
1392                                  File errDirthrows
1393       ResourceInstantiationException {
1394 
1395     //first start the table and its header
1396     printTableHeader();
1397 
1398     // store annotation diff in .err file
1399     Writer errWriter = null;
1400     if (isMoreInfoMode && errDir != null) {
1401       StringBuffer docName = new StringBuffer(keyDoc.getName());
1402       docName.replace(
1403           keyDoc.getName().lastIndexOf("."),
1404           docName.length(),
1405           ".err");
1406       File errFile = new File(errDir, docName.toString());
1407       //String encoding = ( (gate.corpora.DocumentImpl) keyDoc).getEncoding();
1408       try {
1409         errWriter = new FileWriter(errFile, false);
1410         /*
1411                  if(encoding == null) {
1412           errWriter = new OutputStreamWriter(
1413               new FileOutputStream(errFile, false));
1414                  } else {
1415           errWriter = new OutputStreamWriter(
1416               new FileOutputStream(errFile, false), encoding);
1417                  }*/
1418       }
1419       catch (Exception ex) {
1420         Out.prln("Exception when creating the error file " + errFile + ": "
1421                  + ex.getMessage());
1422         errWriter = null;
1423       }
1424     }
1425 
1426     for (int jj = 0; jj < annotTypes.size(); jj++) {
1427       String annotType = annotTypes.get(jj);
1428 
1429       AnnotationDiffer annotDiff = measureDocs(keyDoc, respDoc, annotType);
1430       //we don't have this annotation type in this document
1431       if (annotDiff == null)
1432          continue;
1433 
1434       //increase the number of processed documents
1435       docNumber++;
1436       //add precison and recall to the sums
1437       updateStatistics(annotDiff, annotType);
1438 
1439       Out.prln("<TR>");
1440       Out.prln("<TD>" + annotType + "</TD>");
1441 
1442       if (isMoreInfoMode) {
1443         Out.prln("<TD>" + annotDiff.getCorrectMatches() "</TD>");
1444         Out.prln("<TD>" + annotDiff.getPartiallyCorrectMatches() "</TD>");
1445         Out.prln("<TD>" + annotDiff.getMissing() "</TD>");
1446         Out.prln("<TD>" + annotDiff.getSpurious() "</TD>");
1447       }
1448 
1449       Out.prln("<TD>" + annotDiff.getPrecisionAverage() "</TD>");
1450       Out.prln("<TD>" + annotDiff.getRecallAverage() "</TD>");
1451       //check the recall now
1452       if (isVerboseMode) {
1453         Out.prln("<TD>");
1454         if (annotDiff.getRecallAverage() < threshold
1455             || annotDiff.getPrecisionAverage() < threshold) {
1456           printAnnotations(annotDiff, keyDoc, respDoc);
1457         }
1458         else {
1459           Out.prln("&nbsp;");
1460         }
1461         Out.prln("</TD>");
1462       }
1463       Out.prln("</TR>");
1464 
1465       if (isMoreInfoMode && errDir != null)
1466         storeAnnotations(annotType, annotDiff, keyDoc, respDoc, errWriter);
1467     //for loop through annotation types
1468     Out.prln("</TABLE>");
1469 
1470     try {
1471       if (errWriter != null)
1472         errWriter.close();
1473     }
1474     catch (Exception ex) {
1475       Out.prln("Exception on close of error file " + errWriter + ": "
1476                + ex.getMessage());
1477     }
1478   //evaluateTwoDocs
1479 
1480   protected void printTableHeader() {
1481     Out.prln("<TABLE BORDER=1");
1482     Out.pr("<TR> <TD><B>Annotation Type</B></TD> ");
1483 
1484     if (isMoreInfoMode)
1485       Out.pr("<TD><B>Correct</B></TD> <TD><B>Partially Correct</B></TD> "
1486              "<TD><B>Missing</B></TD> <TD><B>Spurious<B></TD>");
1487 
1488     Out.pr("<TD><B>Precision</B></TD> <TD><B>Recall</B></TD>");
1489 
1490     if (isVerboseMode)
1491       Out.pr("<TD><B>Annotations</B></TD>");
1492 
1493     Out.prln("</TR>");
1494   }
1495 
1496   protected void updateStatistics(AnnotationDiffer annotDiffer,
1497                                   String annotType) {
1498     double precisionAverage = ( ( annotDiffer.
1499                                           getPrecisionLenient() +
1500                                           annotDiffer.getPrecisionStrict()) /
1501                                (2.0));
1502     if (Double.isNaN(precisionAverage)) precisionAverage = 0.0;
1503     precisionSum += precisionAverage;
1504 
1505     double recallAverage = ( (annotDiffer.getRecallLenient() +
1506                                        annotDiffer.getRecallStrict()) /
1507                             (2.0));
1508     if (Double.isNaN(recallAverage)) recallAverage = 0.0;
1509     recallSum += recallAverage;
1510 
1511     double fMeasureAverage = ( (annotDiffer.getFMeasureLenient(1.0+
1512                                          annotDiffer.getFMeasureStrict(1.0)) /
1513                               (2.0));
1514     if (Double.isNaN(fMeasureAverage)) fMeasureAverage = 0.0;
1515     fMeasureSum += fMeasureAverage;
1516 
1517     Double oldPrecision = precisionByType.get(annotType);
1518     if (oldPrecision == null)
1519       precisionByType.put(annotType, new Double(precisionAverage));
1520     else
1521       precisionByType.put(annotType,
1522                           new Double(oldPrecision.doubleValue() + precisionAverage));
1523 
1524     Integer precCount = prCountByType.get(annotType);
1525     if (precCount == null)
1526       prCountByType.put(annotType, new Integer(1));
1527     else
1528       prCountByType.put(annotType, new Integer(precCount.intValue() 1));
1529 
1530     Double oldFMeasure = fMeasureByType.get(annotType);
1531     if (oldFMeasure == null)
1532       fMeasureByType.put(annotType, new Double(fMeasureAverage));
1533     else
1534       fMeasureByType.put(annotType,
1535                          new Double(oldFMeasure.doubleValue() + fMeasureAverage));
1536 
1537     Integer fCount = fMeasureCountByType.get(annotType);
1538     if (fCount == null)
1539       fMeasureCountByType.put(annotType, new Integer(1));
1540     else
1541       fMeasureCountByType.put(annotType, new Integer(fCount.intValue() 1));
1542 
1543     Double oldRecall = recallByType.get(annotType);
1544     if (oldRecall == null)
1545       recallByType.put(annotType, new Double(recallAverage));
1546     else
1547       recallByType.put(annotType,
1548                        new Double(oldRecall.doubleValue() + recallAverage));
1549 
1550     Integer recCount = recCountByType.get(annotType);
1551     if (recCount == null)
1552       recCountByType.put(annotType, new Integer(1));
1553     else
1554       recCountByType.put(annotType, new Integer(recCount.intValue() 1));
1555 
1556       //Update the missing, spurious, correct, and partial counts
1557     Long oldMissingNo = missingByType.get(annotType);
1558     if (oldMissingNo == null)
1559       missingByType.put(annotType, new Long(annotDiffer.getMissing()));
1560     else
1561       missingByType.put(annotType,
1562                         new Long(oldMissingNo.longValue() +
1563                                  annotDiffer.getMissing()));
1564 
1565     Long oldCorrectNo = correctByType.get(annotType);
1566     if (oldCorrectNo == null)
1567       correctByType.put(annotType, new Long(annotDiffer.getCorrectMatches()));
1568     else
1569       correctByType.put(annotType,
1570                         new Long(oldCorrectNo.longValue() +
1571                                  annotDiffer.getCorrectMatches()));
1572 
1573     Long oldPartialNo = partialByType.get(annotType);
1574     if (oldPartialNo == null)
1575       partialByType.put(annotType,
1576                         new Long(annotDiffer.getPartiallyCorrectMatches()));
1577     else
1578       partialByType.put(annotType,
1579                         new Long(oldPartialNo.longValue() +
1580                                  annotDiffer.getPartiallyCorrectMatches()));
1581 
1582     Long oldSpuriousNo = spurByType.get(annotType);
1583     if (oldSpuriousNo == null)
1584       spurByType.put(annotType, new Long(annotDiffer.getSpurious()));
1585     else
1586       spurByType.put(annotType,
1587                      new Long(oldSpuriousNo.longValue() +
1588                               annotDiffer.getSpurious()));
1589   }
1590 
1591   /**
1592    * Update statistics for processed documents
1593    * The same procedure as updateStatistics with different hashTables
1594    */
1595   protected void updateStatisticsProc(AnnotationDiffer annotDiffer,
1596                                       String annotType) {
1597     hasProcessed = true;
1598     double precisionAverage = ( (annotDiffer.getPrecisionLenient() +
1599                                           annotDiffer.getPrecisionStrict()) /
1600                                (2.0));
1601     if (Double.isNaN(precisionAverage)) precisionAverage = 0.0;
1602     proc_precisionSum += precisionAverage;
1603 
1604     double recallAverage = ( (annotDiffer.getRecallLenient() +
1605                                        annotDiffer.getRecallStrict()) /
1606                             (2.0));
1607     if (Double.isNaN(recallAverage)) recallAverage = 0.0;
1608     proc_recallSum += recallAverage;
1609 
1610     double fMeasureAverage = ( (annotDiffer.getFMeasureLenient(1.0+
1611                                          annotDiffer.getFMeasureStrict(1.0)) /
1612                               (2.0));
1613     if (Double.isNaN(fMeasureAverage)) fMeasureAverage = 0.0;
1614     proc_fMeasureSum += fMeasureAverage;
1615 
1616     Double oldPrecision = proc_precisionByType.get(annotType);
1617     if (oldPrecision == null)
1618       proc_precisionByType.put(annotType, new Double(precisionAverage));
1619     else
1620       proc_precisionByType.put(annotType,
1621                                new Double(oldPrecision.doubleValue() +
1622                                           precisionAverage));
1623     Integer precCount = proc_prCountByType.get(annotType);
1624     if (precCount == null)
1625       proc_prCountByType.put(annotType, new Integer(1));
1626     else
1627       proc_prCountByType.put(annotType, new Integer(precCount.intValue() 1));
1628 
1629     Double oldFMeasure = proc_fMeasureByType.get(annotType);
1630     if (oldFMeasure == null)
1631       proc_fMeasureByType.put(annotType,
1632                               new Double(fMeasureAverage));
1633     else
1634       proc_fMeasureByType.put(annotType,
1635                               new Double(oldFMeasure.doubleValue() +
1636                                          fMeasureAverage));
1637     Integer fCount = proc_fMeasureCountByType.get(annotType);
1638     if (fCount == null)
1639       proc_fMeasureCountByType.put(annotType, new Integer(1));
1640     else
1641       proc_fMeasureCountByType.put(annotType, new Integer(fCount.intValue() 1));
1642 
1643     Double oldRecall = proc_recallByType.get(annotType);
1644     if (oldRecall == null)
1645       proc_recallByType.put(annotType,
1646                             new Double(recallAverage));
1647     else
1648       proc_recallByType.put(annotType,
1649                             new Double(oldRecall.doubleValue() +
1650                                        recallAverage));
1651     Integer recCount = proc_recCountByType.get(annotType);
1652     if (recCount == null)
1653       proc_recCountByType.put(annotType, new Integer(1));
1654     else
1655       proc_recCountByType.put(annotType, new Integer(recCount.intValue() 1));
1656 
1657       //Update the missing, spurious, correct, and partial counts
1658     Long oldMissingNo = proc_missingByType.get(annotType);
1659     if (oldMissingNo == null)
1660       proc_missingByType.put(annotType, new Long(annotDiffer.getMissing()));
1661     else
1662       proc_missingByType.put(annotType,
1663                              new Long(oldMissingNo.longValue() +
1664                                       annotDiffer.getMissing()));
1665 
1666     Long oldCorrectNo = proc_correctByType.get(annotType);
1667     if (oldCorrectNo == null)
1668       proc_correctByType.put(annotType, new Long(annotDiffer.getCorrectMatches()));
1669     else
1670       proc_correctByType.put(annotType,
1671                              new Long(oldCorrectNo.longValue() +
1672                                       annotDiffer.getCorrectMatches()));
1673 
1674     Long oldPartialNo = proc_partialByType.get(annotType);
1675     if (oldPartialNo == null)
1676       proc_partialByType.put(annotType,
1677                              new Long(annotDiffer.getPartiallyCorrectMatches()));
1678     else
1679       proc_partialByType.put(annotType,
1680                              new Long(oldPartialNo.longValue() +
1681                                       annotDiffer.getPartiallyCorrectMatches()));
1682 
1683     Long oldSpuriousNo = proc_spurByType.get(annotType);
1684     if (oldSpuriousNo == null)
1685       proc_spurByType.put(annotType, new Long(annotDiffer.getSpurious()));
1686     else
1687       proc_spurByType.put(annotType,
1688                           new Long(oldSpuriousNo.longValue() +
1689                                    annotDiffer.getSpurious()));
1690   }
1691 
1692   public void printStatistics() {
1693 
1694     Out.prln("<H2> Statistics </H2>");
1695 
1696     /*
1697         Out.prln("<H3> Precision </H3>");
1698         if (precisionByType != null && !precisionByType.isEmpty()) {
1699           Iterator iter = precisionByType.keySet().iterator();
1700           while (iter.hasNext()) {
1701             String annotType = (String) iter.next();
1702             Out.prln(annotType + ": "
1703               + ((Double)precisionByType.get(annotType)).doubleValue()
1704                   /
1705                   ((Integer)prCountByType.get(annotType)).intValue()
1706               + "<P>");
1707           }//while
1708         }
1709         Out.prln("Overall precision: " + getPrecisionAverage() + "<P>");
1710 
1711         Out.prln("<H3> Recall </H3>");
1712         if (recallByType != null && !recallByType.isEmpty()) {
1713           Iterator iter = recallByType.keySet().iterator();
1714           while (iter.hasNext()) {
1715             String annotType = (String) iter.next();
1716             Out.prln(annotType + ": "
1717               + ((Double)recallByType.get(annotType)).doubleValue()
1718                   /
1719                   ((Integer)recCountByType.get(annotType)).intValue()
1720               + "<P>");
1721           }//while
1722         }
1723 
1724         Out.prln("Overall recall: " + getRecallAverage()
1725                  + "<P>");
1726      */
1727     if (annotTypes == null) {
1728       Out.prln("No types given for evaluation, cannot obtain precision/recall");
1729       return;
1730     }
1731     Out.prln("<table border=1>");
1732     Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Correct</B></TD>" +
1733              "<TD><B>Partially Correct</B></TD> <TD><B>Missing</B></TD>" +
1734              "<TD><B>Spurious</B></TD> <TD><B>Precision</B></TD>" +
1735              "<TD><B>Recall</B></TD> <TD><B>F-Measure</B></TD> </TR>");
1736     String annotType;
1737     for (int i = 0; i < annotTypes.size(); i++) {
1738       annotType = annotTypes.get(i);
1739       printStatsForType(annotType);
1740     //for
1741     Out.prln("</table>");
1742   // updateStatisticsProc
1743 
1744   protected void printStatsForType(String annotType) {
1745     long correct =
1746             (correctByType.get(annotType== null: correctByType.get(
1747                     annotType).longValue();
1748     long partial =
1749             (partialByType.get(annotType== null: partialByType.get(
1750                     annotType).longValue();
1751     long spurious =
1752             (spurByType.get(annotType== null: spurByType.get(annotType)
1753                     .longValue();
1754     long missing =
1755             (missingByType.get(annotType== null: missingByType.get(
1756                     annotType).longValue();
1757     long actual = correct + partial + spurious;
1758     long possible = correct + partial + missing;
1759     //precision strict is correct/actual
1760     //precision is (correct + 0.5 * partially correct)/actual
1761     double precision = 0d;
1762     if (actual!=0)
1763       precision = (correct + 0.5 * partial/ actual;
1764     
1765     //recall strict is correct/possible
1766     double recall = 0d;
1767     if (possible!=0)
1768       recall = (correct + 0.5 * partial/ possible;
1769     
1770     //F-measure = ( (beta*beta + 1)*P*R ) / ((beta*beta*P) + R)
1771     double fmeasure = 0d;
1772     if ((beta * beta * precision+ recall !=0){
1773       fmeasure =
1774         ( (beta * beta + 1* precision * recall)
1775         /
1776         ( (beta * beta * precision+ recall);
1777     }
1778 
1779     long proc_correct = 0;
1780     long proc_partial = 0;
1781     long proc_spurious = 0;
1782     long proc_missing = 0;
1783     long proc_actual = 0;
1784     long proc_possible = 0;
1785     double proc_precision = 0;
1786     double proc_recall = 0;
1787     double proc_fmeasure = 0;
1788 
1789     if (hasProcessed) {
1790       // calculate values for processed
1791       proc_correct = (proc_correctByType.get(annotType== null:
1792                      proc_correctByType.get(annotType).longValue();
1793       proc_partial = (proc_partialByType.get(annotType== null:
1794                      proc_partialByType.get(annotType).longValue();
1795       proc_spurious = (proc_spurByType.get(annotType== null:
1796                       proc_spurByType.get(annotType).longValue();
1797       proc_missing = (proc_missingByType.get(annotType== null:
1798                      proc_missingByType.get(annotType).longValue();
1799       proc_actual = proc_correct + proc_partial + proc_spurious;
1800       proc_possible = proc_correct + proc_partial + proc_missing;
1801       //precision strict is correct/actual
1802       //precision is (correct + 0.5 * partially correct)/actual
1803       proc_precision = (proc_correct + 0.5 * proc_partial/ proc_actual;
1804       //recall strict is correct/possible
1805       proc_recall = (proc_correct + 0.5 * proc_partial/ proc_possible;
1806       //F-measure = ( (beta*beta + 1)*P*R ) / ((beta*beta*P) + R)
1807       proc_fmeasure =
1808           ( (beta * beta + 1* proc_precision * proc_recall)
1809           /
1810           ( (beta * beta * proc_precision+ proc_recall);
1811 
1812     }
1813 
1814     // output data
1815     Out.prln("<TR>");
1816     if (hasProcessed)
1817       Out.prln("<TD>" + annotType + "_new" "</TD>");
1818     else
1819       Out.prln("<TD>" + annotType + "</TD>");
1820 
1821     Out.prln("<TD>" + correct + "</TD>");
1822     Out.prln("<TD>" + partial + "</TD>");
1823     Out.prln("<TD>" + missing + "</TD>");
1824     Out.prln("<TD>" + spurious + "</TD>");
1825 
1826     String strPrec = (isMoreInfoMode?
1827                      avgPrint(precision, 4)
1828                      : Double.toString(precision);
1829     String strRec = (isMoreInfoMode?
1830                     avgPrint(recall, 4)
1831                     : Double.toString(recall);
1832     String strFmes = (isMoreInfoMode?
1833                      avgPrint(fmeasure, 4)
1834                      : Double.toString(fmeasure);
1835 
1836     if (hasProcessed && (precision < proc_precision))
1837       Out.prln("<TD><Font color=red>" + strPrec + "</TD>");
1838     else if (hasProcessed && (precision > proc_precision))
1839       Out.prln("<TD><Font color=blue>" + strPrec + "</TD>");
1840     else
1841       Out.prln("<TD>" + strPrec + "</TD>");
1842     if (hasProcessed && (recall < proc_recall))
1843       Out.prln("<TD><Font color=red>" + strRec + "</TD>");
1844     else if (hasProcessed && (recall > proc_recall))
1845       Out.prln("<TD><Font color=blue>" + strRec + "</TD>");
1846     else
1847       Out.prln("<TD>" + strRec + "</TD>");
1848     Out.prln("<TD>" + strFmes + "</TD>");
1849     Out.prln("</TR>");
1850 
1851     if (hasProcessed) {
1852       // output data
1853       Out.prln("<TR>");
1854       Out.prln("<TD>" + annotType + "_old" "</TD>");
1855 
1856       Out.prln("<TD>" + proc_correct + "</TD>");
1857       Out.prln("<TD>" + proc_partial + "</TD>");
1858       Out.prln("<TD>" + proc_missing + "</TD>");
1859       Out.prln("<TD>" + proc_spurious + "</TD>");
1860 
1861       String strProcPrec = (isMoreInfoMode?
1862                            avgPrint(proc_precision, 4)
1863                            : Double.toString(proc_precision);
1864       String strProcRec = (isMoreInfoMode?
1865                           avgPrint(proc_recall, 4)
1866                           : Double.toString(proc_recall);
1867       String strProcFmes = (isMoreInfoMode?
1868                            avgPrint(proc_fmeasure, 4)
1869                            : Double.toString(proc_fmeasure);
1870 
1871       if (precision < proc_precision)
1872         Out.prln("<TD><Font color=red>" + strProcPrec + "</TD>");
1873       else if (precision > proc_precision)
1874         Out.prln("<TD><Font color=blue>" + strProcPrec + "</TD>");
1875       else
1876         Out.prln("<TD>" + strProcPrec + "</TD>");
1877       if (recall < proc_recall)
1878         Out.prln("<TD><Font color=red>" + strProcRec + "</TD>");
1879       else if (recall > proc_recall)
1880         Out.prln("<TD><Font color=blue>" + strProcRec + "</TD>");
1881       else
1882         Out.prln("<TD>" + strProcRec + "</TD>");
1883       Out.prln("<TD>" + strProcFmes + "</TD>");
1884       Out.prln("</TR>");
1885     }
1886   //printStatsForType
1887 
1888   //** Print @param value with @param count digits after decimal point */
1889   protected String avgPrint(double value, int count) {
1890     double newvalue;
1891     double power = Math.pow(10, count);
1892     newvalue = Math.round(value * power/ power;
1893     return Double.toString(newvalue);
1894   }
1895 
1896   private double precisionSumCalc = 0;
1897   private double recallSumCalc = 0;
1898   private double fMeasureSumCalc = 0;
1899 
1900   public double getPrecisionAverageCalc() {
1901     return precisionSumCalc;
1902   }
1903 
1904   public double getRecallAverageCalc() {
1905     return recallSumCalc;
1906   }
1907 
1908   public double getFmeasureAverageCalc() {
1909     return fMeasureSumCalc;
1910   }
1911 
1912   protected void calculateAvgTotal() {
1913     long correct, partial, spurious, missing;
1914     long correctSum, partialSum, spuriousSum, missingSum;
1915 
1916     if (annotTypes == null) {
1917       return;
1918     }
1919     correctSum = partialSum = spuriousSum = missingSum = 0;
1920 
1921     String annotType;
1922     for(int i = 0; i < annotTypes.size(); i++) {
1923       annotType = annotTypes.get(i);
1924       correct =
1925               (correctByType.get(annotType== null: correctByType.get(
1926                       annotType).longValue();
1927       partial =
1928               (partialByType.get(annotType== null: partialByType.get(
1929                       annotType).longValue();
1930       spurious =
1931               (spurByType.get(annotType== null: spurByType.get(
1932                       annotType).longValue();
1933       missing =
1934               (missingByType.get(annotType== null: missingByType.get(
1935                       annotType).longValue();
1936       correctSum += correct;
1937       partialSum += partial;
1938       spuriousSum += spurious;
1939       missingSum += missing;
1940     // for
1941 
1942     long actual = correctSum + partialSum + spuriousSum;
1943     long possible = correctSum + partialSum + missingSum;
1944 
1945     if (actual == 0) {
1946       precisionSumCalc = 0;
1947     }
1948     else {
1949       precisionSumCalc = (correctSum + 0.5 * partialSum/ actual;
1950     }
1951 
1952     if (possible == 0) {
1953       recallSumCalc = 0;
1954     }
1955     else {
1956       recallSumCalc = (correctSum + 0.5 * partialSum/ actual;
1957     }
1958 
1959     if (precisionSumCalc == && recallSumCalc == 0) {
1960       fMeasureSumCalc = 0;
1961     }
1962     else {
1963       fMeasureSumCalc =
1964           ( (beta * beta + 1* precisionSumCalc * recallSumCalc)
1965           /
1966           ( (beta * beta * precisionSumCalc+ recallSumCalc);
1967 
1968     }
1969   // calculateAvgTotal
1970 
1971   protected AnnotationDiffer measureDocs(
1972       Document keyDoc, Document respDoc, String annotTypethrows
1973       ResourceInstantiationException {
1974 
1975     if (keyDoc == null || respDoc == null)
1976       return null;
1977 
1978     if (annotSetName != null
1979         && keyDoc.getAnnotations(annotSetName).get(annotType== null)
1980       return null;
1981     else if ( (annotSetName == null || annotSetName.equals(""))
1982              && keyDoc.getAnnotations().get(annotType== null)
1983       return null;
1984 
1985     // create an annotation diff
1986     AnnotationDiffer annotDiffer = new AnnotationDiffer();
1987     // set the feature names set for annotation differ
1988     annotDiffer.setSignificantFeaturesSet(diffFeaturesSet);
1989     // we need to find the sets
1990     AnnotationSet keys, responses;
1991     if (annotSetName == null || annotSetName.equals("")) {
1992       keys = keyDoc.getAnnotations().get(annotType);
1993       responses = respDoc.getAnnotations().get(annotType);
1994     }
1995     else {
1996       keys = keyDoc.getAnnotations(annotSetName).get(annotType);
1997       responses = respDoc.getAnnotations(outputSetName).get(annotType);
1998     }
1999 
2000     // we have annotation sets so call the annotationDiffer
2001     annotDiffer.calculateDiff(keys, responses);
2002     
2003     return annotDiffer;
2004   // measureDocs
2005 
2006   protected void storeAnnotations(String type, AnnotationDiffer annotDiffer,
2007                                   Document keyDoc, Document respDoc,
2008                                   Writer errFileWriter) {
2009     if (errFileWriter == null)return// exit on "no file"
2010 
2011     try {
2012       // extract and store annotations
2013       Comparator<Annotation> comp = new OffsetComparator();
2014       Set<Annotation> sortedSet = new TreeSet<Annotation>(comp);
2015       Set<Annotation> missingSet =
2016           annotDiffer.getAnnotationsOfType(AnnotationDiffer.MISSING_TYPE);
2017       sortedSet.clear();
2018       sortedSet.addAll(missingSet);
2019       storeAnnotations(type + ".miss", sortedSet, keyDoc, errFileWriter);
2020       Set<Annotation> spuriousSet =
2021           annotDiffer.getAnnotationsOfType(AnnotationDiffer.SPURIOUS_TYPE);
2022       sortedSet.clear();
2023       sortedSet.addAll(spuriousSet);
2024       storeAnnotations(type + ".spur", sortedSet, respDoc, errFileWriter);
2025       Set<Annotation> partialSet =
2026           annotDiffer.getAnnotationsOfType(AnnotationDiffer.
2027                                            PARTIALLY_CORRECT_TYPE);
2028       sortedSet.clear();
2029       sortedSet.addAll(partialSet);
2030       storeAnnotations(type + ".part", sortedSet, respDoc, errFileWriter);
2031     }
2032     catch (Exception ex) {
2033       Out.prln("Exception on close of error file " + errFileWriter + ": "
2034                + ex.getMessage());
2035     }
2036   // storeAnnotations
2037 
2038   protected void storeAnnotations(String type, Set<Annotation> set, Document doc,
2039                                   Writer filethrows IOException {
2040 
2041     if (set == null || set.isEmpty())
2042       return;
2043 
2044     Iterator<Annotation> iter = set.iterator();
2045     Annotation ann;
2046     while (iter.hasNext()) {
2047       ann = iter.next();
2048       file.write(type);
2049       file.write(".");
2050       file.write(doc.getContent().toString().substring(
2051           ann.getStartNode().getOffset().intValue(),
2052           ann.getEndNode().getOffset().intValue()));
2053       file.write(".");
2054       file.write(ann.getStartNode().getOffset().toString());
2055       file.write(".");
2056       file.write(ann.getEndNode().getOffset().toString());
2057       file.write("\n");
2058     //while
2059   // storeAnnotations
2060 
2061   protected void printAnnotations(AnnotationDiffer annotDiff,
2062                                   Document keyDoc, Document respDoc) {
2063     Out.pr("MISSING ANNOTATIONS in the automatic texts: ");
2064     Set<Annotation> missingSet =
2065         annotDiff.getAnnotationsOfType(AnnotationDiffer.MISSING_TYPE);
2066     printAnnotations(missingSet, keyDoc);
2067     Out.prln("<BR>");
2068 
2069     Out.pr("SPURIOUS ANNOTATIONS in the automatic texts: ");
2070     Set<Annotation> spuriousSet =
2071         annotDiff.getAnnotationsOfType(AnnotationDiffer.SPURIOUS_TYPE);
2072     printAnnotations(spuriousSet, respDoc);
2073     Out.prln("</BR>");
2074 
2075     Out.pr("PARTIALLY CORRECT ANNOTATIONS in the automatic texts: ");
2076     Set<Annotation> partialSet =
2077         annotDiff.getAnnotationsOfType(AnnotationDiffer.PARTIALLY_CORRECT_TYPE);
2078     printAnnotations(partialSet, respDoc);
2079   }
2080 
2081   protected void printAnnotations(Set<Annotation> set, Document doc) {
2082     if (set == null || set.isEmpty())
2083       return;
2084 
2085     Iterator<Annotation> iter = set.iterator();
2086     while (iter.hasNext()) {
2087       Annotation ann = iter.next();
2088       Out.prln(
2089           "<B>" +
2090           doc.getContent().toString().substring(
2091           ann.getStartNode().getOffset().intValue(),
2092           ann.getEndNode().getOffset().intValue()) +
2093           "</B>: <I>[" + ann.getStartNode().getOffset() +
2094           "," + ann.getEndNode().getOffset() "]</I>"
2095 //        + "; features" + ann.getFeatures()
2096           );
2097     //while
2098   //printAnnotations
2099 
2100   /**
2101    * The directory from which we should generate/evaluate the corpus
2102    */
2103   private File startDir;
2104   private File currDir;
2105   private static List<String> annotTypes;
2106 
2107   private Controller application = null;
2108   private File applicationFile = null;
2109 
2110   //collect the sum of all precisions and recalls of all docs
2111   //and the number of docs, so I can calculate the average for
2112   //the corpus at the end
2113   private double precisionSum = 0.0;
2114   private double recallSum = 0.0;
2115   private double fMeasureSum = 0.0;
2116   private Map<String,Double> precisionByType = new HashMap<String,Double>();
2117   private Map<String,Integer> prCountByType = new HashMap<String,Integer>();
2118   private Map<String,Double> recallByType = new HashMap<String,Double>();
2119   private Map<String,Integer> recCountByType = new HashMap<String,Integer>();
2120   private Map<String,Double> fMeasureByType = new HashMap<String,Double>();
2121   private Map<String,Integer> fMeasureCountByType = new HashMap<String,Integer>();
2122 
2123   private Map<String,Long> missingByType = new HashMap<String,Long>();
2124   private Map<String,Long> spurByType = new HashMap<String,Long>();
2125   private Map<String,Long> correctByType = new HashMap<String,Long>();
2126   private Map<String,Long> partialByType = new HashMap<String,Long>();
2127 
2128   // statistic for processed
2129   static boolean hasProcessed = false;
2130   private double proc_precisionSum = 0;
2131   private double proc_recallSum = 0;
2132   private double proc_fMeasureSum = 0;
2133   private Map<String,Double> proc_precisionByType = new HashMap<String,Double>();
2134   private Map<String,Integer> proc_prCountByType = new HashMap<String,Integer>();
2135   private Map<String,Double> proc_recallByType = new HashMap<String,Double>();
2136   private Map<String,Integer> proc_recCountByType = new HashMap<String,Integer>();
2137   private Map<String,Double> proc_fMeasureByType = new HashMap<String,Double>();
2138   private Map<String,Integer> proc_fMeasureCountByType = new HashMap<String,Integer>();
2139 
2140   private Map<String,Long> proc_missingByType = new HashMap<String,Long>();
2141   private Map<String,Long> proc_spurByType = new HashMap<String,Long>();
2142   private Map<String,Long> proc_correctByType = new HashMap<String,Long>();
2143   private Map<String,Long> proc_partialByType = new HashMap<String,Long>();
2144 
2145   double beta = 1;
2146 
2147   private int docNumber = 0;
2148 
2149   /**
2150    * If true, the corpus tool will generate the corpus, otherwise it'll
2151    * run in evaluate mode
2152    */
2153   private boolean isGenerateMode = false;
2154 
2155   /**
2156    * If true - show annotations for docs below threshold
2157    */
2158   private boolean isVerboseMode = false;
2159 
2160   /**
2161    * If true - show more info in document table
2162    */
2163   private boolean isMoreInfoMode = false;
2164 
2165   /**
2166    * The list of features used in the AnnotationDiff separated by comma
2167    * Example: "class;inst"
2168    */
2169   private Set<String> diffFeaturesSet;
2170 
2171   /**
2172    * If true, the corpus tool will evaluate stored against the human-marked
2173    * documents
2174    */
2175   private boolean isMarkedStored = false;
2176   private boolean isMarkedClean = false;
2177 
2178   //whether marked are in a DS, not xml
2179   private boolean isMarkedDS = false;
2180 
2181   private String annotSetName = "Key";
2182   private String outputSetName = null;
2183 
2184   private double threshold = 0.5;
2185   private Properties configs = new Properties();
2186   private static int corpusWordCount = 0;
2187 
2188   private String documentEncoding = "";
2189 
2190   /** String to print when wrong command-line args */
2191   private static String usage =
2192       "usage: CorpusBenchmarkTool [-generate|-marked_stored|-marked_clean] "
2193       "[-verbose] [-moreinfo] directory-name application";
2194 
2195 }