/*
* BatchProcessApp.java
*
*
* Copyright (c) 2006, The University of Sheffield.
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June1991.
*
* A copy of this licence is included in the distribution in the file
* licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
*
* Ian Roberts, March 2006
*
* $Id: BatchProcessApp.java,v 1.5 2006/06/11 19:17:57 ian Exp $
*/
package sheffield.examples;
import gate.Document;
import gate.Corpus;
import gate.CorpusController;
import gate.AnnotationSet;
import gate.Gate;
import gate.Factory;
import gate.util.*;
import gate.util.persistence.PersistenceManager;
import java.util.Set;
import java.util.HashSet;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import java.io.File;
import java.io.FileOutputStream;
import java.io.BufferedOutputStream;
import java.io.OutputStreamWriter;
/**
* This class ilustrates how to do simple batch processing with GATE. It loads
* an application from a .gapp file (created using "Save application state" in
* the GATE GUI), and runs the contained application over one or more files.
* The results are written out to XML files, either in GateXML format (all
* annotation sets preserved, as in "save as XML" in the GUI), or with inline
* XML tags taken from the default annotation set (as in "save preserving
* format"). In this example, the output file names are simply the input file
* names with ".out.xml" appended.
*
* To keep the example simple, we do not do any exception handling - any error
* will cause the process to abort.
*/
public class BatchProcessApp {
/**
* The main entry point. First we parse the command line options (see
* usage() method for details), then we take all remaining command line
* parameters to be file names to process. Each file is loaded, processed
* using the application and the results written to the output file
* (inputFile.out.xml).
*/
public static void main(String[] args) throws Exception {
parseCommandLine(args);
// initialise GATE - this must be done before calling any GATE APIs
Gate.init();
// load the saved application
CorpusController application =
(CorpusController)PersistenceManager.loadObjectFromFile(gappFile);
// Create a Corpus to use. We recycle the same Corpus object for each
// iteration. The string parameter to newCorpus() is simply the
// GATE-internal name to use for the corpus. It has no particular
// significance.
Corpus corpus = Factory.newCorpus("BatchProcessApp Corpus");
application.setCorpus(corpus);
// process the files one by one
for(int i = firstFile; i < args.length; i++) {
// load the document (using the specified encoding if one was given)
File docFile = new File(args[i]);
System.out.print("Processing document " + docFile + "...");
Document doc = Factory.newDocument(docFile.toURL(), encoding);
// put the document in the corpus
corpus.add(doc);
// run the application
application.execute();
// remove the document from the corpus again
corpus.clear();
String docXMLString = null;
// if we want to just write out specific annotation types, we must
// extract the annotations into a Set
if(annotTypesToWrite != null) {
// Create a temporary Set to hold the annotations we wish to write out
Set annotationsToWrite = new HashSet();
// we only extract annotations from the default (unnamed) AnnotationSet
// in this example
AnnotationSet defaultAnnots = doc.getAnnotations();
Iterator annotTypesIt = annotTypesToWrite.iterator();
while(annotTypesIt.hasNext()) {
// extract all the annotations of each requested type and add them to
// the temporary set
AnnotationSet annotsOfThisType =
defaultAnnots.get((String)annotTypesIt.next());
if(annotsOfThisType != null) {
annotationsToWrite.addAll(annotsOfThisType);
}
}
// create the XML string using these annotations
docXMLString = doc.toXml(annotationsToWrite);
}
// otherwise, just write out the whole document as GateXML
else {
docXMLString = doc.toXml();
}
// Release the document, as it is no longer needed
Factory.deleteResource(doc);
// output the XML to <inputFile>.out.xml
String outputFileName = docFile.getName() + ".out.xml";
File outputFile = new File(docFile.getParentFile(), outputFileName);
// Write output files using the same encoding as the original
FileOutputStream fos = new FileOutputStream(outputFile);
BufferedOutputStream bos = new BufferedOutputStream(fos);
OutputStreamWriter out;
if(encoding == null) {
out = new OutputStreamWriter(bos);
}
else {
out = new OutputStreamWriter(bos, encoding);
}
out.write(docXMLString);
out.close();
System.out.println("done");
} // for each file
System.out.println("All done");
} // void main(String[] args)
/**
* Parse command line options.
*/
private static void parseCommandLine(String[] args) throws Exception {
int i;
// iterate over all options (arguments starting with '-')
for(i = 0; i < args.length && args[i].charAt(0) == '-'; i++) {
switch(args[i].charAt(1)) {
// -a type = write out annotations of type a.
case 'a':
if(annotTypesToWrite == null) annotTypesToWrite = new ArrayList();
annotTypesToWrite.add(args[++i]);
break;
// -g gappFile = path to the saved application
case 'g':
gappFile = new File(args[++i]);
break;
// -e encoding = character encoding for documents
case 'e':
encoding = args[++i];
break;
default:
System.err.println("Unrecognised option " + args[i]);
usage();
}
}
// set index of the first non-option argument, which we take as the first
// file to process
firstFile = i;
// sanity check other arguments
if(gappFile == null) {
System.err.println("No .gapp file specified");
usage();
}
}
/**
* Print a usage message and exit.
*/
private static final void usage() {
System.err.println(
"Usage:\n" +
" java sheffield.examples.BatchProcessApp -g <gappFile> [-e encoding]\n" +
" [-a annotType] [-a annotType] file1 file2 ... fileN\n" +
"\n" +
"-g gappFile : (required) the path to the saved application state we are\n" +
" to run over the given documents. This application must be\n" +
" a \"corpus pipeline\" or a \"conditional corpus pipeline\".\n" +
"\n" +
"-e encoding : (optional) the character encoding of the source documents.\n" +
" If not specified, the platform default encoding (currently\n" +
" \"" + System.getProperty("file.encoding") + "\") is assumed.\n" +
"\n" +
"-a type : (optional) write out just the annotations of this type as\n" +
" inline XML tags. Multiple -a options are allowed, and\n" +
" annotations of all the specified types will be output.\n" +
" This is the equivalent of \"save preserving format\" in the\n" +
" GATE GUI. If no -a option is given the whole of each\n" +
" processed document will be output as GateXML (the equivalent\n" +
" of \"save as XML\")."
);
System.exit(1);
}
/** Index of the first non-option argument on the command line. */
private static int firstFile = 0;
/** Path to the saved application file. */
private static File gappFile = null;
/**
* List of annotation types to write out. If null, write everything as
* GateXML.
*/
private static List annotTypesToWrite = null;
/**
* The character encoding to use when loading the docments. If null, the
* platform default encoding is used.
*/
private static String encoding = null;
}