gate.util
Class CorpusBenchmarkTool

java.lang.Object
  extended by gate.util.CorpusBenchmarkTool

public class CorpusBenchmarkTool
extends Object


Field Summary
private  String annotSetName
           
private static List annotTypes
           
private  Controller application
           
private  File applicationFile
           
(package private)  double beta
           
private static String CLEAN_DIR_NAME
           
private  Properties configs
           
private static int corpusWordCount
           
private  HashMap correctByType
           
private  File currDir
           
private static String CVS_DIR_NAME
           
private static boolean DEBUG
           
private  Set diffFeaturesSet
          The list of features used in the AnnotationDiff, separated by comma. Example: "class;inst"
private  int docNumber
           
private  String documentEncoding
           
private static String ERROR_DIR_NAME
           
private  HashMap fMeasureByType
           
private  HashMap fMeasureCountByType
           
private  double fMeasureSum
           
(package private) static boolean hasProcessed
           
private  boolean isGenerateMode
          If true, the corpus tool will generate the corpus; otherwise it will run in evaluate mode.
private  boolean isMarkedClean
           
private  boolean isMarkedDS
           
private  boolean isMarkedStored
          If true, the corpus tool will evaluate stored against the human-marked documents
private  boolean isMoreInfoMode
          If true - show more info in document table
private  boolean isVerboseMode
          If true - show annotations for docs below threshold
private static String MARKED_DIR_NAME
           
private  HashMap missingByType
           
private  String outputSetName
           
private  HashMap partialByType
           
private  HashMap prCountByType
           
private  HashMap precisionByType
           
private  double precisionSum
           
private  HashMap proc_correctByType
           
private  HashMap proc_fMeasureByType
           
private  HashMap proc_fMeasureCountByType
           
private  double proc_fMeasureSum
           
private  HashMap proc_missingByType
           
private  HashMap proc_partialByType
           
private  HashMap proc_prCountByType
           
private  HashMap proc_precisionByType
           
private  double proc_precisionSum
           
private  HashMap proc_recallByType
           
private  double proc_recallSum
           
private  HashMap proc_recCountByType
           
private  HashMap proc_spurByType
           
private static String PROCESSED_DIR_NAME
           
private  HashMap recallByType
           
private  double recallSum
           
private  HashMap recCountByType
           
private  HashMap spurByType
           
private  File startDir
          The directory from which we should generate/evaluate the corpus
private  double threshold
           
private static String usage
          String to print when the command-line arguments are wrong
 
Constructor Summary
CorpusBenchmarkTool()
           
 
Method Summary
protected  int countWords(Document annotDoc)
          Count all Token.kind=word annotations in the document
protected  void evaluateAllThree(Document persDoc, Document cleanDoc, Document markedDoc, File errDir)
           
protected  void evaluateCorpus(File fileDir, File processedDir, File markedDir, File errorDir)
           
protected  void evaluateDocuments(Document persDoc, Document cleanDoc, Document markedDoc, File errDir)
           
protected  void evaluateMarkedClean(File markedDir, File cleanDir, File errDir)
           
protected  void evaluateMarkedStored(File markedDir, File storedDir, File errDir)
           
protected  void evaluateTwoDocs(Document keyDoc, Document respDoc, File errDir)
           
 void execute()
           
 void execute(File dir)
           
protected  void generateCorpus(File fileDir, File outputDir)
           
 Set getDiffFeaturesList()
           
 boolean getGenerateMode()
           
 boolean getMarkedClean()
           
 boolean getMarkedDS()
           
 boolean getMarkedStored()
           
 boolean getMoreInfo()
           
 double getPrecisionAverage()
          Returns the average precision over the entire set of processed documents.
 double getPrecisionAverageProc()
          For processed documents
 double getRecallAverage()
          Returns the average recall over the entire set of processed documents.
 double getRecallAverageProc()
           
 File getStartDirectory()
           
 double getThreshold()
           
 boolean getVerboseMode()
           
 void init()
           
 void initPRs()
           
 boolean isGenerateMode()
           
static void main(String[] args)
           
protected  AnnotationDiff measureDocs(Document keyDoc, Document respDoc, String annotType)
           
protected  void printAnnotations(AnnotationDiff annotDiff, Document keyDoc, Document respDoc)
           
protected  void printAnnotations(Set set, Document doc)
           
 void printStatistics()
           
protected  void printStatsForType(String annotType)
           
protected  void printTableHeader()
           
protected  void processDocument(Document doc)
           
 void setApplicationFile(File newAppFile)
           
 void setDiffFeaturesList(Set features)
           
 void setGenerateMode(boolean mode)
           
 void setMarkedClean(boolean mode)
           
 void setMarkedDS(boolean mode)
           
 void setMarkedStored(boolean mode)
           
 void setMoreInfo(boolean mode)
           
 void setStartDirectory(File dir)
           
 void setThreshold(double newValue)
           
 void setVerboseMode(boolean mode)
           
protected  void storeAnnotations(String type, AnnotationDiff annotDiff, Document keyDoc, Document respDoc, FileWriter errFileWriter)
           
protected  void storeAnnotations(String type, Set set, Document doc, FileWriter file)
           
 void unloadPRs()
           
protected  void updateStatistics(AnnotationDiff annotDiff, String annotType)
           
protected  void updateStatisticsProc(AnnotationDiff annotDiff, String annotType)
          Update statistics for processed documents. The same procedure as updateStatistics, with different hashTables.
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

MARKED_DIR_NAME

private static final String MARKED_DIR_NAME
See Also:
Constant Field Values

CLEAN_DIR_NAME

private static final String CLEAN_DIR_NAME
See Also:
Constant Field Values

CVS_DIR_NAME

private static final String CVS_DIR_NAME
See Also:
Constant Field Values

PROCESSED_DIR_NAME

private static final String PROCESSED_DIR_NAME
See Also:
Constant Field Values

ERROR_DIR_NAME

private static final String ERROR_DIR_NAME
See Also:
Constant Field Values

DEBUG

private static final boolean DEBUG
See Also:
Constant Field Values

startDir

private File startDir
The directory from which we should generate/evaluate the corpus


currDir

private File currDir

annotTypes

private static List annotTypes

application

private Controller application

applicationFile

private File applicationFile

precisionSum

private double precisionSum

recallSum

private double recallSum

fMeasureSum

private double fMeasureSum

precisionByType

private HashMap precisionByType

prCountByType

private HashMap prCountByType

recallByType

private HashMap recallByType

recCountByType

private HashMap recCountByType

fMeasureByType

private HashMap fMeasureByType

fMeasureCountByType

private HashMap fMeasureCountByType

missingByType

private HashMap missingByType

spurByType

private HashMap spurByType

correctByType

private HashMap correctByType

partialByType

private HashMap partialByType

hasProcessed

static boolean hasProcessed

proc_precisionSum

private double proc_precisionSum

proc_recallSum

private double proc_recallSum

proc_fMeasureSum

private double proc_fMeasureSum

proc_precisionByType

private HashMap proc_precisionByType

proc_prCountByType

private HashMap proc_prCountByType

proc_recallByType

private HashMap proc_recallByType

proc_recCountByType

private HashMap proc_recCountByType

proc_fMeasureByType

private HashMap proc_fMeasureByType

proc_fMeasureCountByType

private HashMap proc_fMeasureCountByType

proc_missingByType

private HashMap proc_missingByType

proc_spurByType

private HashMap proc_spurByType

proc_correctByType

private HashMap proc_correctByType

proc_partialByType

private HashMap proc_partialByType

beta

double beta

docNumber

private int docNumber

isGenerateMode

private boolean isGenerateMode
If true, the corpus tool will generate the corpus; otherwise it will run in evaluate mode.


isVerboseMode

private boolean isVerboseMode
If true - show annotations for docs below threshold


isMoreInfoMode

private boolean isMoreInfoMode
If true - show more info in document table


diffFeaturesSet

private Set diffFeaturesSet
The list of features used in the AnnotationDiff, separated by comma. Example: "class;inst"


isMarkedStored

private boolean isMarkedStored
If true, the corpus tool will evaluate stored against the human-marked documents


isMarkedClean

private boolean isMarkedClean

isMarkedDS

private boolean isMarkedDS

annotSetName

private String annotSetName

outputSetName

private String outputSetName

threshold

private double threshold

configs

private Properties configs

corpusWordCount

private static int corpusWordCount

documentEncoding

private String documentEncoding

usage

private static String usage
String to print when the command-line arguments are wrong

Constructor Detail

CorpusBenchmarkTool

public CorpusBenchmarkTool()
Method Detail

initPRs

public void initPRs()

unloadPRs

public void unloadPRs()

execute

public void execute()

init

public void init()

execute

public void execute(File dir)

main

public static void main(String[] args)
                 throws GateException
Throws:
GateException

setGenerateMode

public void setGenerateMode(boolean mode)

getGenerateMode

public boolean getGenerateMode()

getVerboseMode

public boolean getVerboseMode()

setVerboseMode

public void setVerboseMode(boolean mode)

setMoreInfo

public void setMoreInfo(boolean mode)

getMoreInfo

public boolean getMoreInfo()

setDiffFeaturesList

public void setDiffFeaturesList(Set features)

getDiffFeaturesList

public Set getDiffFeaturesList()

setMarkedStored

public void setMarkedStored(boolean mode)

getMarkedStored

public boolean getMarkedStored()

setMarkedClean

public void setMarkedClean(boolean mode)

getMarkedClean

public boolean getMarkedClean()

setMarkedDS

public void setMarkedDS(boolean mode)

getMarkedDS

public boolean getMarkedDS()

setApplicationFile

public void setApplicationFile(File newAppFile)

getPrecisionAverage

public double getPrecisionAverage()
Returns the average precision over the entire set of processed documents.

If the tool has been evaluating the original documents against the previously-stored automatically annotated ones, then the precision will be the average precision on those two sets.

If the tool was run in -marked mode, i.e., was evaluating the stored automatically processed ones against the human-annotated ones, then the precision will be the average precision on those two sets of documents.


getRecallAverage

public double getRecallAverage()
Returns the average recall over the entire set of processed documents.

If the tool has been evaluating the original documents against the previously-stored automatically annotated ones, then the recall will be the average recall on those two sets.

If the tool was run in -marked mode, i.e., was evaluating the stored automatically processed ones against the human-annotated ones, then the recall will be the average recall on those two sets of documents.


getPrecisionAverageProc

public double getPrecisionAverageProc()
For processed documents


getRecallAverageProc

public double getRecallAverageProc()

isGenerateMode

public boolean isGenerateMode()

getThreshold

public double getThreshold()

setThreshold

public void setThreshold(double newValue)

getStartDirectory

public File getStartDirectory()

setStartDirectory

public void setStartDirectory(File dir)

generateCorpus

protected void generateCorpus(File fileDir,
                              File outputDir)

evaluateCorpus

protected void evaluateCorpus(File fileDir,
                              File processedDir,
                              File markedDir,
                              File errorDir)

evaluateMarkedStored

protected void evaluateMarkedStored(File markedDir,
                                    File storedDir,
                                    File errDir)

evaluateMarkedClean

protected void evaluateMarkedClean(File markedDir,
                                   File cleanDir,
                                   File errDir)

processDocument

protected void processDocument(Document doc)

evaluateDocuments

protected void evaluateDocuments(Document persDoc,
                                 Document cleanDoc,
                                 Document markedDoc,
                                 File errDir)
                          throws ResourceInstantiationException
Throws:
ResourceInstantiationException

countWords

protected int countWords(Document annotDoc)
Count all Token.kind=word annotations in the document


evaluateAllThree

protected void evaluateAllThree(Document persDoc,
                                Document cleanDoc,
                                Document markedDoc,
                                File errDir)
                         throws ResourceInstantiationException
Throws:
ResourceInstantiationException

evaluateTwoDocs

protected void evaluateTwoDocs(Document keyDoc,
                               Document respDoc,
                               File errDir)
                        throws ResourceInstantiationException
Throws:
ResourceInstantiationException

printTableHeader

protected void printTableHeader()

updateStatistics

protected void updateStatistics(AnnotationDiff annotDiff,
                                String annotType)

updateStatisticsProc

protected void updateStatisticsProc(AnnotationDiff annotDiff,
                                    String annotType)
Update statistics for processed documents. The same procedure as updateStatistics, with different hashTables.


printStatistics

public void printStatistics()

printStatsForType

protected void printStatsForType(String annotType)

measureDocs

protected AnnotationDiff measureDocs(Document keyDoc,
                                     Document respDoc,
                                     String annotType)
                              throws ResourceInstantiationException
Throws:
ResourceInstantiationException

storeAnnotations

protected void storeAnnotations(String type,
                                AnnotationDiff annotDiff,
                                Document keyDoc,
                                Document respDoc,
                                FileWriter errFileWriter)

storeAnnotations

protected void storeAnnotations(String type,
                                Set set,
                                Document doc,
                                FileWriter file)
                         throws IOException
Throws:
IOException

printAnnotations

protected void printAnnotations(AnnotationDiff annotDiff,
                                Document keyDoc,
                                Document respDoc)

printAnnotations

protected void printAnnotations(Set set,
                                Document doc)