|
CorpusSaver |
|
1 /* 2 * CorpusSaver.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Kalina Bontcheva, 22/Nov/2001 12 * 13 * $Id: CorpusSaver.java,v 1.8 2003/07/10 13:53:43 kalina Exp $ 14 */ 15 16 package gate.util; 17 18 import java.util.*; 19 import java.io.*; 20 21 import gate.*; 22 import gate.creole.*; 23 import gate.util.*; 24 import gate.persist.*; 25 import java.net.*; 26 import java.text.NumberFormat; 27 28 import gate.creole.tokeniser.*; 29 import gate.creole.gazetteer.*; 30 import gate.creole.splitter.*; 31 import gate.creole.orthomatcher.*; 32 import gate.creole.annotransfer.*; 33 import gate.creole.annotdelete.*; 34 35 public class CorpusSaver { 36 37 private static final boolean DEBUG = true; 38 39 public CorpusSaver() { 40 } 41 42 public void init() { 43 if (saveMode) { 44 File path = new File(dsPath); 45 try { 46 ds = Factory.openDataStore("gate.persist.SerialDataStore", 47 path.toURL().toString()); 48 } catch (Exception ex) { 49 throw new gate.util.GateRuntimeException(ex.getMessage()); 50 } 51 52 try { 53 Corpus corpus = Factory.newCorpus("bnc"); 54 LanguageResource lr = ds.adopt(corpus, null); 55 ds.sync(lr); 56 theCorpus = (Corpus) lr; 57 } catch (Exception ex) { 58 throw new GateRuntimeException(ex.getMessage()); 59 } 60 } 61 62 if (processMode) 63 initPRs(); 64 65 } 66 67 public void initPRs() { 68 try { 69 if (applicationFile == null) 70 Out.prln("Application not set!"); 71 Out.prln("App file is: " + applicationFile.getAbsolutePath()); 72 application = (Controller) gate.util.persistence.PersistenceManager 73 .loadObjectFromFile(applicationFile); 74 } catch (Exception ex) { 75 throw new GateRuntimeException("Corpus Saver: "+ex.getMessage()); 76 } 77 }//initPRs 78 79 public void execute() { 80 execute(startDir); 81 try { 82 if (saveMode) { 83 ds.sync(theCorpus); 84 Factory.deleteResource(theCorpus); 85 if (ds != null) 86 ds.close(); 87 } 88 if (application != null) { 89 Iterator iter = new ArrayList(application.getPRs()).iterator(); 90 while (iter.hasNext()) 91 Factory.deleteResource((Resource) iter.next()); 92 } 93 } catch (Exception ex) { 94 throw new GateRuntimeException(ex.getMessage()); 95 } 96 } 97 98 public void execute(File dir) { 99 if (dir == null || (saveMode && ds == null)) 100 return; 101 //first set the current directory to be the given one 102 currDir = dir; 103 Out.prln("Processing directory: " + currDir); 104 105 ArrayList files = new ArrayList(); 106 ArrayList dirs = new ArrayList(); 107 File[] dirArray = currDir.listFiles(); 108 for (int i = 0; i < dirArray.length; i++) { 109 if (dirArray[i].isDirectory()) 110 dirs.add(dirArray[i]); 111 else if (dirArray[i].isFile()) 112 files.add(dirArray[i]); 113 } 114 115 saveFiles(files); 116 117 //if no more subdirs left, return 118 if (dirs.isEmpty()) 119 return; 120 121 //there are more subdirectories to traverse, so iterate through 122 for (int j = 0; j < dirs.size(); j++) 123 execute((File) dirs.get(j)); 124 125 }//execute(dir) 126 127 128 public static void main(String[] args) throws GateException { 129 Gate.init(); 130 131 CorpusSaver corpusSaver1 = new CorpusSaver(); 132 133 if(args.length < 2) 134 throw new GateException("usage: [-process|-process-only] source_directory datastore_path application"); 135 int i = 0; 136 while (i < args.length && args[i].startsWith("-")) { 137 if(args[i].equals("-process")) { 138 Out.prln("Processing and saving the corpus enabled. <P>"); 139 corpusSaver1.setProcessMode(true); 140 } else if (args[i].equals("-process_only")) { 141 Out.prln("Processing only enabled. <P>"); 142 corpusSaver1.setSaveMode(false); 143 corpusSaver1.setProcessMode(true); 144 } 145 i++; //just ignore the option, which we do not recognise 146 }//while 147 148 String dirName = args[i]; 149 File dir = new File(dirName); 150 if (!dir.isDirectory()) 151 throw new GateRuntimeException("Corpus directory should be " 152 + "provided as a parameter"); 153 154 if(i+1 >= args.length) 155 throw new GateRuntimeException("Datastore path not provided"); 156 157 if (corpusSaver1.getSaveMode()) { 158 String storagePath = args[i + 1]; 159 File storage = new File(storagePath); 160 if (!storage.isDirectory()) 161 throw new GateRuntimeException("Please provide path to an existing " 162 + "GATE serial datastore"); 163 corpusSaver1.setDSPath(storagePath); 164 } 165 166 //get the last argument which is the application 167 if (corpusSaver1.getProcessMode()) { 168 i++; 169 String appName = args[i]; 170 File appFile = new File(appName); 171 if (!appFile.isFile()) 172 throw new GateException("Please provide an existing GATE application"); 173 else 174 corpusSaver1.setApplicationFile(appFile); 175 } 176 177 Out.prln("Initialising GATE please wait..."); 178 corpusSaver1.init(); 179 corpusSaver1.setStartDir(dir); 180 Out.prln("Processing..."); 181 double timeBefore = System.currentTimeMillis(); 182 corpusSaver1.execute(); 183 double timeAfter = System.currentTimeMillis(); 184 Out.prln("Done in " + 185 NumberFormat.getInstance().format((timeAfter-timeBefore)/1000) 186 + " seconds"); 187 188 } 189 190 public void setStartDir(File newDir) { 191 startDir = newDir; 192 } 193 194 public void setProcessMode(boolean mode) { 195 processMode = mode; 196 } 197 198 public boolean getProcessMode() { 199 return processMode; 200 } 201 202 public void setSaveMode(boolean mode) { 203 saveMode = mode; 204 } 205 206 public boolean getSaveMode() { 207 return saveMode; 208 } 209 210 public void setDSPath(String path){ 211 dsPath = path; 212 } 213 214 public void setApplicationFile(File newAppFile) { 215 applicationFile = newAppFile; 216 } 217 218 219 protected void saveFiles(List files) { 220 if (files==null || files.isEmpty() || 221 (saveMode && (theCorpus == null || ds == null))) 222 return; 223 224 for(int i=0; i<files.size(); i++) { 225 try { 226 Document doc = Factory.newDocument(((File)files.get(i)).toURL()); 227 doc.setName(Files.getLastPathComponent(((File)files.get(i)).toURL().toString())); 228 Out.prln("Storing document: " + doc.getName()); 229 //first process it with ANNIE if in process mode 230 if (processMode) 231 processDocument(doc); 232 233 //then store it in the DS and add to corpus 234 if (saveMode) { 235 LanguageResource lr = ds.adopt(doc, null); 236 theCorpus.add(lr); 237 theCorpus.unloadDocument( (Document) lr); 238 239 if (lr != doc) 240 Factory.deleteResource(lr); 241 } 242 Factory.deleteResource(doc); 243 } catch (Exception ex) { 244 throw new GateRuntimeException(ex.getClass() + " " + ex.getMessage()); 245 } 246 }//for 247 }//saveFiles 248 249 protected void processDocument(Document doc) { 250 try { 251 if (application instanceof CorpusController) { 252 Corpus tempCorpus = Factory.newCorpus("temp"); 253 tempCorpus.add(doc); 254 ((CorpusController)application).setCorpus(tempCorpus); 255 application.execute(); 256 Factory.deleteResource(tempCorpus); 257 tempCorpus = null; 258 } else { 259 Iterator iter = application.getPRs().iterator(); 260 while (iter.hasNext()) 261 ((ProcessingResource) iter.next()).setParameterValue("document", doc); 262 application.execute(); 263 } 264 } catch (ResourceInstantiationException ex) { 265 throw new RuntimeException("Error executing application: " 266 + ex.getMessage()); 267 } catch (ExecutionException ex) { 268 throw new RuntimeException("Error executing application: " 269 + ex.getMessage()); 270 } 271 } 272 273 274 /** 275 * The directory from which we should generate/evaluate the corpus 276 */ 277 private File startDir; 278 private File currDir; 279 280 private DataStore ds; 281 private Corpus theCorpus; 282 private String annotSetName = "NE"; 283 private String dsPath = "d:\\bnc"; 284 private Controller application = null; 285 private File applicationFile = null; 286 287 private boolean processMode = false; 288 private boolean saveMode = true; 289 } 290
|
CorpusSaver |
|