|
ProfilePRs |
|
1 /* 2 * ProfilePRs.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Kalina Bontcheva, 04/10/2001 12 * 13 * $Id: ProfilePRs.java,v 1.4 2002/03/06 17:15:39 kalina Exp $ 14 */ 15 16 package gate.creole; 17 18 import java.util.*; 19 import java.io.*; 20 21 import gate.*; 22 import gate.creole.*; 23 import gate.util.*; 24 import gate.creole.tokeniser.*; 25 import gate.creole.gazetteer.*; 26 import gate.creole.splitter.*; 27 import gate.creole.orthomatcher.*; 28 import gate.util.profile.*; 29 //import java.text.NumberFormat; 30 31 /** 32 * This class provides a main function that: 33 * <UL> 34 * <LI> 35 * initialises the GATE library, and creates all PRs 36 * <LI> 37 * takes a directory name as argument 38 * <LI> 39 * for each .html file in that directory: 40 * <BR> create a GATE document from the file 41 * <BR> run the PRs on the document 42 * <BR> dump some statistics in the end 43 * </UL> 44 */ 45 public class ProfilePRs { 46 47 /** String to print when wrong command-line args */ 48 private static String usage = 49 "usage: ProfilePRs [-dir directory-name | file(s)]"; 50 51 private static double totalDocLength = 0; 52 private static int docs = 0; 53 private static Profiler prof = new Profiler(); 54 private static double maxDocLength = 0; 55 56 /** Main function */ 57 public static void main(String[] args) throws Exception { 58 // say "hi" 59 Out.prln("processing command line arguments"); 60 61 // check we have a directory name or list of files 62 List inputFiles = null; 63 if(args.length < 1) throw new GateException(usage); 64 if(args[0].equals("-dir")) { // list all the files in the dir 65 if(args.length < 2) throw new GateException(usage); 66 File dir = new File(args[1]); 67 File[] filesArray = dir.listFiles(); 68 if(filesArray == null) 69 throw new GateException( 70 dir.getPath() + " is not a directory; " + usage 71 ); 72 inputFiles = Arrays.asList(filesArray); 73 } else { // all args should be file names 74 inputFiles = new ArrayList(); 75 for(int i = 0; i < args.length; i++) 76 inputFiles.add(new File(args[i])); 77 } 78 79 prof.initRun("Measuring performance on directory " + args[1]); 80 // prof.enable(false); 81 // prof.enableGCCalling(false); 82 83 // initialise GATE 84 prof.checkPoint("Before GATE.init()"); 85 Gate.init(); 86 //tell GATE we're in batch mode 87 // gate.Main.batchMode = true; 88 89 90 // create some processing resources 91 prof.checkPoint("Before creating the processing resources"); 92 93 //create a default tokeniser 94 FeatureMap params = Factory.newFeatureMap(); 95 DefaultTokeniser tokeniser = (DefaultTokeniser) Factory.createResource( 96 "gate.creole.tokeniser.DefaultTokeniser", params); 97 prof.checkPoint("Tokeniser initialised"); 98 99 //create a default gazetteer 100 params = Factory.newFeatureMap(); 101 DefaultGazetteer gaz = (DefaultGazetteer) Factory.createResource( 102 "gate.creole.gazetteer.DefaultGazetteer", params); 103 prof.checkPoint("Gazetteer initialised"); 104 105 //create a splitter 106 params = Factory.newFeatureMap(); 107 SentenceSplitter splitter = (SentenceSplitter) Factory.createResource( 108 "gate.creole.splitter.SentenceSplitter", params); 109 prof.checkPoint("Sentence splitter initialised"); 110 111 //create a tagger 112 params = Factory.newFeatureMap(); 113 POSTagger tagger = (POSTagger) Factory.createResource( 114 "gate.creole.POSTagger", params); 115 prof.checkPoint("POSTagger initialised"); 116 117 //create a grammar 118 params = Factory.newFeatureMap(); 119 ANNIETransducer transducer = (ANNIETransducer) Factory.createResource( 120 "gate.creole.ANNIETransducer", params); 121 prof.checkPoint("Grammars initialised"); 122 123 //create an orthomatcher 124 params = Factory.newFeatureMap(); 125 OrthoMatcher orthomatcher = (OrthoMatcher) Factory.createResource( 126 "gate.creole.orthomatcher.OrthoMatcher", params); 127 prof.checkPoint("Orthomatcher initialised"); 128 129 130 // for each document 131 // create a gate doc 132 // set as the document for hte PRs 133 // run the PRs 134 // dump output from the doc 135 // delete the doc 136 Out.prln("\nLooping on input files list"); 137 Iterator filesIter = inputFiles.iterator(); 138 docs = inputFiles.size(); 139 int fileNo=0; 140 while(filesIter.hasNext()) { 141 File inFile = (File) filesIter.next(); // the current file 142 fileNo++; 143 144 // set the source URL parameter to a "file:..." URL string 145 params.clear(); 146 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, inFile.toURL().toExternalForm()); 147 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, ""); 148 149 // create the document 150 Document doc = (Document) Factory.createResource( 151 "gate.corpora.DocumentImpl", params 152 ); 153 totalDocLength += doc.getContent().size().longValue(); 154 155 if (maxDocLength < doc.getContent().size().longValue()) 156 maxDocLength = doc.getContent().size().longValue(); 157 158 // set the document param on the PRs 159 tokeniser.setDocument(doc); 160 prof.checkPoint("Processing file " + inFile.getPath() + 161 ", #" + fileNo + "/" + docs, new String[0], true, false, false); 162 tokeniser.execute(); 163 prof.checkPoint("", new String[] {"Tokenizer", "Processing"}, false, false, false); 164 165 //run gazetteer 166 gaz.setDocument(doc); 167 gaz.execute(); 168 prof.checkPoint("", new String[] {"Gazettier", "Processing"}, false, false, false); 169 170 //run splitter 171 splitter.setDocument(doc); 172 splitter.execute(); 173 prof.checkPoint("", new String[] {"Splitter", "Processing"}, false, false, false); 174 175 //run the tagger 176 tagger.setDocument(doc); 177 tagger.execute(); 178 prof.checkPoint("", new String[] {"Tagger", "Processing"}, false, false, false); 179 180 //run the transducer 181 transducer.setDocument(doc); 182 transducer.execute(); 183 prof.checkPoint("", new String[] {"JAPE grammars", "Processing"}, false, false, false); 184 185 // run the orthomatcher 186 orthomatcher.setDocument(doc); 187 orthomatcher.execute(); 188 prof.checkPoint("", new String[] {"Orthomatcher", "Processing"}, false, false, false); 189 190 // make the doc a candidate for garbage collection 191 Factory.deleteResource(doc); 192 193 } // input files loop 194 195 prof.checkPoint("Done!"); 196 197 totalDocLength = (double) totalDocLength/1024; 198 Out.prln("\nTotal KBytes processed: " + (long)totalDocLength); 199 Out.prln("\nMax document size in bytes: " + (long)maxDocLength + 200 " (" + (long) maxDocLength/1024 + " Kb)"); 201 202 203 prof.printCategAvg("Processing", docs, totalDocLength, "kb"); 204 prof.printCategAvg("Tokenizer", docs, totalDocLength, "kb"); 205 prof.printCategAvg("Gazettier", docs, totalDocLength, "kb"); 206 prof.printCategAvg("Splitter", docs, totalDocLength, "kb"); 207 prof.printCategAvg("Tagger", docs, totalDocLength, "kb"); 208 prof.printCategAvg("JAPE grammars", docs, totalDocLength, "kb"); 209 prof.printCategAvg("Orthomatcher", docs, totalDocLength, "kb"); 210 } // main 211 212 213 } // class ProfilePRs 214
|
ProfilePRs |
|