Batch.java |
/*
 *  Batch.java - transducer class
 *
 *  Copyright (c) 1998-2004, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Hamish Cunningham, 10/08/98
 *
 *  $Id: Batch.java,v 1.36 2004/07/21 17:10:07 akshay Exp $
 *
 *  DEVELOPER NOTES:
 *
 *  This is one that got away; the relation between constructors,
 *  initTransducer and parseTransducer are totally screwy and get worse
 *  every time I add something (e.g. support for resource loading).
 *  We should probably junk this whole thing and start again....
 */

package gate.jape;

import java.net.URL;
import java.util.Iterator;
import java.util.Vector;

import gate.*;
import gate.creole.ExecutionException;
import gate.event.ProgressListener;
import gate.event.StatusListener;
import gate.util.Err;
import gate.util.Out;

/**
 * Batch processing of JAPE transducers against documents or corpora.
 * <P>
 * Construction parses the grammar found at a URL into a {@link Transducer}.
 * Status events raised while parsing, and status/progress events raised by
 * the resulting transducer, are re-fired to the listeners registered on this
 * object.
 * <P>
 * The file-name, stream, resource and deserialisation based construction
 * paths, and the command-line driver, had all been commented out in this
 * file; that dead code has been dropped (it remains available in version
 * control).
 */
public class Batch implements JapeConstants {
  /** Debug flag */
  private static final boolean DEBUG = false;

  /** The URL that points to a .jape file */
  private URL japeURL;

  /** The encoding used for reading the grammar file(s) */
  private String encoding;

  /** The JAPE transducer. */
  private Transducer transducer;

  /** Whether to print progress messages or not. */
  private boolean verbose = false;

  /** Arbitrary features attached to this batch; see {@link #setFeatures}. */
  private gate.FeatureMap features;

  /**
   * Registered progress listeners. Treated as copy-on-write: the add/remove
   * methods replace the vector instead of mutating it, so the fire methods
   * can iterate over a snapshot.
   */
  private transient Vector progressListeners;

  /** Registered status listeners; copy-on-write like progressListeners. */
  private transient Vector statusListeners;

  /** Last value passed to {@link #setEnableDebugging}. */
  private boolean enableDebugging;

  /**
   * Create a non-initialised instance: no grammar is parsed and the
   * transducer stays <CODE>null</CODE>. Private; nothing in this file
   * calls it any more.
   */
  private Batch() { }

  /**
   * Create a fully initialised instance.
   *
   * @param url the location of the .jape grammar to parse
   * @param encoding the encoding used for reading the grammar file(s)
   * @throws JapeException if the grammar cannot be opened or parsed
   */
  public Batch(URL url, String encoding) throws JapeException {
    init(url, encoding);
  } // full init constructor

  /**
   * Create a fully initialised instance, registering a status listener
   * before parsing starts so that it also receives the messages emitted
   * while the grammar is being parsed.
   *
   * @param url the location of the .jape grammar to parse
   * @param encoding the encoding used for reading the grammar file(s)
   * @param sListener listener to register on this batch; a
   *        <CODE>null</CODE> value is ignored
   * @throws JapeException if the grammar cannot be opened or parsed
   */
  public Batch(URL url, String encoding, StatusListener sListener)
         throws JapeException {
    this.addStatusListener(sListener);
    init(url, encoding);
  } // full init constructor

  /** Shared constructor body: remember the source, parse it, wire events. */
  private void init(URL url, String encoding) throws JapeException {
    this.japeURL = url;
    this.encoding = encoding;
    parseJape();
    forwardTransducerEvents();
  }

  /**
   * Attach permanent listeners to the transducer that re-fire its status
   * and progress events to the listeners registered on this batch.
   * Only called after {@link #parseJape()} has produced a transducer.
   */
  private void forwardTransducerEvents() {
    transducer.addStatusListener(new StatusListener() {
      public void statusChanged(String text) {
        fireStatusChanged(text);
      }
    });

    transducer.addProgressListener(new ProgressListener() {
      public void progressChanged(int value) {
        fireProgressChanged(value);
      }

      public void processFinished() {
        fireProcessFinished();
      }
    });
  }

  /**
   * Forwards an interrupt request to the wrapped transducer, asking it to
   * stop its execution as soon as possible.
   */
  public synchronized void interrupt() {
    transducer.interrupt();
  }

  /** Get the transducer. */
  public Transducer getTransducer() { return transducer; }

  /**
   * Parse a jape file from {@link #japeURL} and store the transducer.
   * A temporary status listener relays messages from the parser, and then
   * from the transducer's <CODE>finish()</CODE> call, to this batch's own
   * listeners; it is detached again even when one of those steps fails.
   *
   * @throws JapeException if the grammar cannot be read or parsed, or if
   *         the parser yields no transducer
   */
  private void parseJape() throws JapeException {
    StatusListener relay = new StatusListener() {
      public void statusChanged(String text) {
        fireStatusChanged(text);
      }
    };

    try {
      gate.jape.parser.ParseCpsl parser =
        new gate.jape.parser.ParseCpsl(japeURL, encoding);

      parser.addStatusListener(relay);
      try {
        transducer = parser.MultiPhaseTransducer();
      } finally {
        parser.removeStatusListener(relay);
      }

      // fail with a JAPE error rather than a NullPointerException below
      if (transducer == null) {
        throw new JapeException(
          "Batch: parser returned no transducer for " + japeURL);
      }

      //the call to finish needs to be handled from here now as it
      //was removed from the .jj file
      transducer.addStatusListener(relay);
      try {
        transducer.finish();
      } finally {
        transducer.removeStatusListener(relay);
      }
    } catch (gate.jape.parser.ParseException e) {
      throw failure("Batch: error parsing transducer: ", e);
    } catch (java.io.IOException e) {
      throw failure("Batch: couldn't open JAPE file: ", e);
    }
  } // parseJape

  /**
   * Build a JapeException whose message is <CODE>prefix</CODE> followed by
   * the cause's message, with the original exception attached as its cause
   * so the underlying stack trace is not lost.
   */
  private static JapeException failure(String prefix, Exception cause) {
    JapeException failure = new JapeException(prefix + cause.getMessage());
    try {
      failure.initCause(cause);
    } catch (IllegalStateException ignored) {
      // initCause refuses if the exception's cause was already fixed when
      // it was constructed; the message still describes the problem
    }
    return failure;
  }

  /**
   * Process the given collection: every document is transduced in
   * iteration order using its default annotation set as input and output.
   */
  public void transduce(Corpus coll) throws JapeException, ExecutionException {
    // for each doc run the transducer
    Iterator iter = coll.iterator();
    while (iter.hasNext()) {
      transduce((Document) iter.next());
    }
  } // transduce(coll)

  /**
   * Process a single document, using <CODE>doc.getAnnotations()</CODE> as
   * both the input and the output annotation set. Delegates to
   * {@link #transduce(Document, AnnotationSet, AnnotationSet)}, so a
   * document without annotations is skipped.
   */
  public void transduce(Document doc) throws JapeException, ExecutionException {
    transduce(doc, doc.getAnnotations(), doc.getAnnotations());
  } // transduce(doc)

  /**
   * Process a single document.
   *
   * @param doc the document to run the transducer over
   * @param inputAS the annotations to match against; when this is
   *        <CODE>null</CODE> or empty the call returns without invoking
   *        the transducer
   * @param outputAS the annotation set handed to the transducer as output
   */
  public void transduce(Document doc, AnnotationSet inputAS,
                        AnnotationSet outputAS) throws JapeException,
                                                       ExecutionException {
    //no need to transduce empty document
    if (inputAS == null || inputAS.isEmpty()) {
      return;
    }
    transducer.transduce(doc, inputAS, outputAS);
  } // transduce(doc)

  /**
   * Legacy command-line entry point. The driver that used to live here had
   * been entirely commented out, so this method does nothing; it is kept
   * so that the class's public interface is unchanged.
   */
  public static void main(String args[]) {
    // intentionally empty
  } // main

  /** Set verbosity. */
  public void setVerbose(boolean turtleSoup) { verbose = turtleSoup; }

  /**
   * Print an error message followed by the usage message to the GATE
   * error stream. Does not exit.
   */
  public void usage(String errorMessage) {
    String usageMessage =
      "usage: java gate.jape.Batch.main [-v] " +
        "-j japefile(.ser|.jape|.jar) " +
        "(-c CollectionName | filenames)";

    Err.println(errorMessage);
    Err.println(usageMessage);
  } // usage

  /** Print a message, prefixed with "Batch: ", when verbose is set. */
  public void message(String mess) {
    if (verbose) {
      Out.println("Batch: " + mess);
    }
  } // message

  public void setFeatures(gate.FeatureMap newFeatures) {
    features = newFeatures;
  }

  public gate.FeatureMap getFeatures() {
    return features;
  }

  /** Unregister a progress listener; unknown listeners are ignored. */
  public synchronized void removeProgressListener(ProgressListener l) {
    if (progressListeners != null && progressListeners.contains(l)) {
      Vector v = (Vector) progressListeners.clone();
      v.removeElement(l);
      progressListeners = v;
    }
  }

  /**
   * Register a progress listener. Duplicates and <CODE>null</CODE> are
   * ignored (a stored null would fail on the next fired event).
   */
  public synchronized void addProgressListener(ProgressListener l) {
    if (l == null) {
      return;
    }
    Vector v = progressListeners == null
               ? new Vector(2) : (Vector) progressListeners.clone();
    if (!v.contains(l)) {
      v.addElement(l);
      progressListeners = v;
    }
  }

  /** Call progressChanged(e) on every registered progress listener. */
  protected void fireProgressChanged(int e) {
    // read the field once so the whole loop works on a single snapshot
    Vector listeners = progressListeners;
    if (listeners == null) {
      return;
    }
    int count = listeners.size();
    for (int i = 0; i < count; i++) {
      ((ProgressListener) listeners.elementAt(i)).progressChanged(e);
    }
  }

  /** Call processFinished() on every registered progress listener. */
  protected void fireProcessFinished() {
    Vector listeners = progressListeners;
    if (listeners == null) {
      return;
    }
    int count = listeners.size();
    for (int i = 0; i < count; i++) {
      ((ProgressListener) listeners.elementAt(i)).processFinished();
    }
  }

  /** Unregister a status listener; unknown listeners are ignored. */
  public synchronized void removeStatusListener(StatusListener l) {
    if (statusListeners != null && statusListeners.contains(l)) {
      Vector v = (Vector) statusListeners.clone();
      v.removeElement(l);
      statusListeners = v;
    }
  }

  /**
   * Register a status listener. Duplicates and <CODE>null</CODE> are
   * ignored (a stored null would fail on the next fired event).
   */
  public synchronized void addStatusListener(StatusListener l) {
    if (l == null) {
      return;
    }
    Vector v = statusListeners == null
               ? new Vector(2) : (Vector) statusListeners.clone();
    if (!v.contains(l)) {
      v.addElement(l);
      statusListeners = v;
    }
  }

  /** Call statusChanged(e) on every registered status listener. */
  protected void fireStatusChanged(String e) {
    Vector listeners = statusListeners;
    if (listeners == null) {
      return;
    }
    int count = listeners.size();
    for (int i = 0; i < count; i++) {
      ((StatusListener) listeners.elementAt(i)).statusChanged(e);
    }
  }

  /**
   * Sets the ontology to be used by the transducers
   * @param ontology the ontology passed on to the wrapped transducer
   */
  public void setOntology(gate.creole.ontology.Ontology ontology) {
    transducer.setOntology(ontology);
  }

  public boolean isEnableDebugging() {
    return enableDebugging;
  }

  /**
   * Record the debugging flag and pass it on to the transducer, if one
   * has been created.
   */
  public void setEnableDebugging(boolean enableDebugging) {
    this.enableDebugging = enableDebugging;
    //propagate
    if (transducer != null) {
      transducer.setEnableDebugging(enableDebugging);
    }
  }

} // class Batch