1   /*
2    *  Batch.java - transducer class
3    *
4    *  Copyright (c) 1998-2004, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Hamish Cunningham, 10/08/98
12   *
13   *  $Id: Batch.java,v 1.36 2004/07/21 17:10:07 akshay Exp $
14   *
15   *  DEVELOPER NOTES:
16   *
 *  This is one that got away; the relations between constructors,
 *  initTransducer and parseTransducer are totally screwy and get worse
19   *  every time I add something (e.g. support for resource loading).
20   *  We should probably junk this whole thing and start again....
21   */
22  
23  package gate.jape;
24  
25  import java.net.URL;
26  import java.util.Iterator;
27  import java.util.Vector;
28  
29  import gate.*;
30  import gate.creole.ExecutionException;
31  import gate.event.ProgressListener;
32  import gate.event.StatusListener;
33  import gate.util.Err;
34  import gate.util.Out;
35  
36  /** Batch processing of JAPE transducers against documents or collections.
37    * Construction will parse or deserialise a transducer as required.
38    */
39  public class Batch implements JapeConstants {
40    /** Debug flag */
41    private static final boolean DEBUG = false;
42  
43    /** The name of the transducer file, a .jape or .ser. */
44  //  private String japeFileName;
45  
46    /** The URL that points to a .jape file */
47    private URL japeURL;
48  
49    /**The encoding used for reading the grammar file(s)*/
50    private String encoding;
51  
52    /** The JAPE transducer. */
53    private Transducer transducer;
54  
55    /** A stream connected to the JAPE file (often null). */
56  //  private InputStream japeStream = null;
57  
58    /** Create non-initialised instance (private, used in main). */
59    private Batch() { }
60  
61    /** Create a fully initialised instance.
62      * <P><CODE>japeFileName</CODE>: the name of a .jape or .ser transducer
63      * file. This may be an absolute path, or may a .jar
64      * that lives somewhere on the classpath.
65      */
66    public Batch(URL url, String encoding) throws JapeException {
67      this.japeURL = url;
68      this.encoding =  encoding;
69      parseJape();
70      if(transducer != null){
71        transducer.addStatusListener(new StatusListener(){
72          public void statusChanged(String text){
73            fireStatusChanged(text);
74          }
75        });
76  
77        transducer.addProgressListener(new ProgressListener(){
78          public void progressChanged(int value){
79            fireProgressChanged(value);
80          }
81  
82          public void processFinished(){
83            fireProcessFinished();
84          }
85        });
86      }
87  
88    } // full init constructor
89  
90    public Batch(URL url, String encoding, StatusListener sListener)
91           throws JapeException {
92  
93      this.addStatusListener(sListener);
94      this.japeURL = url;
95      this.encoding =  encoding;
96      parseJape();
97      if(transducer != null){
98        transducer.addStatusListener(new StatusListener(){
99          public void statusChanged(String text){
100           fireStatusChanged(text);
101         }
102       });
103 
104       transducer.addProgressListener(new ProgressListener(){
105         public void progressChanged(int value){
106           fireProgressChanged(value);
107         }
108 
109         public void processFinished(){
110           fireProcessFinished();
111         }
112       });
113     }
114   } // full init constructor
115 
116   /**
117    * Notifies this PR that it should stop its execution as soon as possible.
118    */
119   public synchronized void interrupt(){
120     transducer.interrupt();
121   }
122   /** Create a fully initialised instance.
123     * <P><CODE>japeFileName</CODE>: the name of a .jape or .ser transducer
124     * file. This may be an absolute path, or may a .jar
125     * that lives somewhere on the classpath.
126     */
127 /*
128   public Batch(String japeFileName) throws JapeException {
129     this.japeFileName = japeFileName;
130     initTransducer();
131   } // full init constructor
132 */
133 /*
134   public Batch(String japeFileName, StatusListener sListener)
135                                                         throws JapeException {
136     this.japeFileName = japeFileName;
137     this.addStatusListener(sListener);
138     initTransducer();
139   } // full init constructor
140 */
141 
142   /** Create a fully initialised instance from an InputStream connected
143     * to the JAPE file.
144     */
145 /*
146   public Batch(InputStream japeStream) throws JapeException {
147     if(japeStream == null)
148       throw new JapeException(
149         "attempt to create a batch parser with null input stream"
150       );
151     this.japeFileName = "stream";
152     this.japeStream = japeStream;
153     initTransducer();
154   } // full init constructor
155 */
156   /** Create a fully initialised instance from a resource path and resource
157     * name.
158     */
159 /*
160   public Batch(String resPath, String resName) throws JapeException {
161     fromResource = true;
162     this.japeFileName = resName;
163     this.resPath = resPath;
164     initTransducer();
165   } // full init constructor
166 */
167 
168   /** Get the transducer. */
169   public Transducer getTransducer() { return transducer; }
170 
171   /** Instantiate transducer member as necessary. */
172 /*
173   private void initTransducer()
174   throws JapeException {
175     if(fromResource) {
176       parseJape(resPath, japeFileName);
177     } else if(japeFileName.endsWith(".ser") || japeFileName.endsWith(".SER"))
178       deserialiseJape(new File(japeFileName));
179     else if(japeFileName.endsWith(".jape") || japeFileName.endsWith(".JAPE"))
180       parseJape();
181     else if(japeFileName.endsWith(".jar") || japeFileName.endsWith(".JAR"))
182       deserialiseJape();
183     else if(japeFileName.equals("stream"))
184       parseJape(japeStream);
185     else
186       throw new JapeException(
187         "unknown file type (not .jape, .ser or .jar):" + japeFileName
188       );
189     if(transducer != null) transducer.addStatusListener(new StatusListener() {
190       public void statusChanged(String text){
191         fireStatusChangedEvent(text);
192       }
193     });
194   }
195 */
196   /** Parse a jape file from {@link #japeURL} and store the transducer. */
197   private void parseJape() throws JapeException {
198     try {
199       gate.jape.parser.ParseCpsl parser =
200         new gate.jape.parser.ParseCpsl(japeURL, encoding);
201 
202       StatusListener listener = null;
203       listener = new StatusListener(){
204         public void statusChanged(String text){
205           fireStatusChanged(text);
206         }
207       };
208       parser.addStatusListener(listener);
209       transducer = parser.MultiPhaseTransducer();
210       parser.removeStatusListener(listener);
211       //the call to finish needs to be handled from here now as it
212       //was removed from the .jj file
213       transducer.addStatusListener(listener);
214       transducer.finish();
215       transducer.removeStatusListener(listener);
216 
217     } catch (gate.jape.parser.ParseException e) {
218       throw new
219         JapeException("Batch: error parsing transducer: " + e.getMessage());
220     } catch (java.io.IOException e) {
221       throw new
222         JapeException("Batch: couldn't open JAPE file: " + e.getMessage());
223     }
224   } // parseJape
225 
226   /** Parse a jape file from an InputStream and store the transducer. */
227 /*
228   private void parseJape(InputStream japeStream) throws JapeException {
229     try {
230       gate.jape.parser.ParseCpsl parser =
231         new gate.jape.parser.ParseCpsl(japeFileName, japeStream);
232       transducer = parser.MultiPhaseTransducer();
233     } catch (gate.jape.parser.ParseException e) {
234       throw new
235         JapeException("Batch: error parsing transducer: " + e.getMessage());
236     } catch (java.io.IOException e) {
237       throw new
238         JapeException("Batch: couldn't read JAPE stream: " + e.getMessage());
239     }
240   } // parseJape(InputStream)
241 */
242   /** Parse a jape file from a resource and store the transducer. */
243 /*
244   private void parseJape(String resPath, String resName) throws JapeException {
245     try {
246       gate.jape.parser.ParseCpsl parser =
247         new gate.jape.parser.ParseCpsl(resPath, resName);
248       transducer = parser.MultiPhaseTransducer();
249     } catch (gate.jape.parser.ParseException e) {
250       throw new
251         JapeException("Batch: error parsing transducer: " + e.getMessage());
252     } catch (java.io.IOException e) {
253       throw new
254         JapeException("Batch: couldn't read JAPE resource: " + e.getMessage());
255     }
256   } // parseJape(resPath, resName)
257 */
258 
259   /** Deserialise from a .ser file. */
260 /*
261   private void deserialiseJape(File japeFile) throws JapeException {
262 
263     // set up a file input stream
264     FileInputStream japeInputStream = null;
265     try {
266       japeInputStream = new FileInputStream(japeFile.getPath());
267     } catch (IOException e) {
268       throw new JapeException(
269         "Can't read from " + japeFile.getPath() + ": " + e.getMessage()
270       );
271     }
272 
273     // call the input stream deserialise method
274     deserialiseJape(japeInputStream);
275   } // deserialiseJape(File)
276 */
277   /** Deserialise from a JAR file. */
278 /*
279   private void deserialiseJape() throws JapeException {
280     // find the jar from CLASSPATH
281     //SearchPath classPath =
282     //  new SearchPath(System.getProperty("java.class.path"), ".");
283     File jarFile = new File(japeFileName); //classPath.getFile(japeFileName);
284     if(jarFile == null)
285       throw new JapeException("Batch: can't find " + japeFileName);
286 
287     // get a byte array input stream with the .ser in out of the jar file
288     JarFile jar = null;
289     BufferedInputStream japeInputStream = null;
290     try {
291       jar = new JarFile(jarFile.getPath());
292       japeInputStream = new BufferedInputStream(
293         jar.getInputStream(jar.getJarEntry(jarNameToSerName(japeFileName)))
294       );
295     } catch(IOException e) {
296       throw new JapeException("couldn't read jar file " + japeFileName);
297     }
298 
299 
300     // call the input stream deserialise method
301     deserialiseJape(japeInputStream);
302   } // deserialiseJape()
303 */
304   /** Create a transducer from an object input stream (deserialisation). */
305 /*
306   private void deserialiseJape(InputStream japeInputStream)
307   throws JapeException {
308     try {
309       ObjectInputStream ois = new ObjectInputStream(japeInputStream);
310       transducer = (Transducer) ois.readObject();
311       ois.close();
312       japeInputStream.close(); // redundant?
313     } catch (IOException e) {
314       throw new JapeException(
315         "Batch: can't deserialise InputStream (1): " + e.getMessage()
316       );
317     } catch (ClassNotFoundException e) {
318       throw new JapeException(
319         "Batch: can't deserialise InputStream (2): " + e.getMessage()
320       );
321     }
322   } // deserialise(OIS)
323 */
324   /** Create a .ser name from a .jar name. */
325 /*
326   private String jarNameToSerName(String jarName) {
327     return jarName.substring(0, jarName.length() - 4) + ".ser";
328   } // jarNameToSerName
329 */
330 
331   /** Process the given collection. */
332   public void transduce(Corpus coll) throws JapeException, ExecutionException {
333     // for each doc run the transducer
334     Iterator iter = coll.iterator();
335     while(iter.hasNext()) {
336       Document doc = (Document) iter.next();
337       // transducer.transduce(doc);
338       transduce(doc, doc.getAnnotations(), doc.getAnnotations());
339     }
340   } // transduce(coll)
341 
342   /** Process a single document. */
343   public void transduce(Document doc) throws JapeException, ExecutionException {
344     transducer.transduce(doc, doc.getAnnotations(), doc.getAnnotations());
345   } // transduce(doc)
346 
347   /** Process a single document. */
348   public void transduce(Document doc, AnnotationSet inputAS,
349                         AnnotationSet outputAS) throws JapeException,
350                                                        ExecutionException {
351     //no need to transduce empty document
352     if (inputAS == null || inputAS.isEmpty())
353       return;
354     transducer.transduce(doc, inputAS, outputAS);
355 
356   } // transduce(doc)
357 
358   /** Process a single text. */
359 /*
360   public Document transduce(String text) throws JapeException {
361     Document doc = null;
362     try {
363       doc = Factory.newDocument(text);
364     } catch (ResourceInstantiationException e) {
365       throw new JapeException(e.toString());
366     }
367     transducer.transduce(doc, doc.getAnnotations());
368     return doc;
369   } // transduce(text)
370 */
371   /** Process a single file. */
372 /*
373   public Document transduce(File textFile) throws JapeException {
374     String text = null;
375     try {
376       text = gate.util.Files.getString(textFile);
377     } catch(IOException e) { throw new JapeException(e.toString()); }
378     return transduce(text);
379   } // transduce(textFile)
380 */
381   /** Process a set of files. */
382 /*
383   public Corpus transduce(String[] textFileNames) throws JapeException {
384     Corpus coll = null;
385     try {
386       coll = Factory.newCorpus("JAPE batch corpus");
387       Document doc = null;
388       for(int i = 0; i < textFileNames.length; i++) {
389           doc = Factory.newDocument(textFileNames[i]);
390           doc.setFeatures(Factory.newFeatureMap());
391           /*coll.createDocument(
392             textFileNames[i],
393             null, // the text - should get read from disk
394             new AnnotationSetImpl(doc),
395             Factory.newFeatureMap(),
396             Document.COPIED
397           );*/
398 /*
399         transducer.transduce(doc, doc.getAnnotations());
400       }
401     } catch(ResourceInstantiationException e) {
402       throw new JapeException(e.toString());
403     }
404     return coll;
405   } // transduce(textFileNames)
406 */
407   /** This is where it all happens. This is <I>the</I> place to be. Take
408     * your summer holidays here. Visit on Saturday nights. Buy a season
409     * ticket from <CODE>www.programmer.gone.insane.com</CODE>.
410     * <P>
411     * Takes a .jape/.jar/.ser
412     *  file name (-j option) which is assumed to hold a pattern
413     * grammar for a multi-phase transducer, and a collection
414     * name (-c option) or a list of files. As needed it then parses and
415     * compiles the transducer, then transduces all the documents in the
416     * collection and saves it to disk.
417     */
418   public static void main(String args[]) {
419 /*
420     // oh great bug in the sky give us this day our daily fuckup
421     //gate.util.Debug.setDebug(true);
422     //gate.util.Debug.setDebug(Rule.class, true);
423     //gate.util.Debug.setDebug(LeftHandSide.class, true);
424     //gate.util.Debug.setDebug(BasicPatternElement.class, true);
425     //gate.util.Debug.setDebug(AnnotationSet.class, true);
426 
427     // The persistent name of the collection.
428     String persCollName = null;;
429 
430     // The collection to process.
431     Corpus collection = null;
432 
433     // create one of us
434     Batch batch = new Batch();
435 
436     // process the options
437     int i = 0;
438     for( ; i<args.length; i++) {
439       if(args[i].equals("-c") && ++i < args.length) // -c = coll name
440         persCollName = args[i];
441       else if(args[i].equals("-j") && ++i < args.length)// -j = transducer name
442         batch.japeFileName = args[i];
443       else if(args[i].equals("-v")) // -v = verbose
444         batch.setVerbose(true);
445       else if(args[i].startsWith("-"))
446         batch.usage("unknown option " + args[i]);
447       else
448         break;
449     } // for each arg
450 
451     // file name list
452     String[] fileNames = null;
453     if(args.length > i) {
454       fileNames = new String[args.length - i];
455       for(int j = 0; i<args.length; j++, i++)
456         fileNames[j] = args[i];
457     }
458 
459     // did they give valid options?
460     if(batch.japeFileName == null)
461       batch.usage("you must supply a transducer name");
462     if(fileNames != null && persCollName != null)
463       batch.usage("can't read a collection AND process a file list");
464 
465     // parse the transducer or bomb
466     batch.message("parsing the transducer");
467     try { batch.initTransducer(); }
468     catch(JapeException e) {
469       batch.usage("oops: " + e.toString());
470     }
471 
472     Corpus coll = null;
473     if(persCollName != null) { // we got a collection name, not a list of files
474 
475       // open the collection or bomb
476       coll = null;
477       batch.message("opening the collection");
478       try {
479         coll = Factory.newCorpus(persCollName);
480       } catch(ResourceInstantiationException e) {
481         batch.usage("oops (x): " + e);
482       }
483 
484       // transduce
485       batch.message("calling transducer");
486       try { batch.transduce(coll); }
487       catch(JapeException e) {
488         batch.usage("oops (1): " + e.toString());
489       }
490 
491       // save to disk
492       batch.message("saving the collection");
493       batch.usage("couldn't sync coll ");
494 
495     // we got a list of files, not a collection
496     } else {
497       batch.message("transducing transient collection");
498       try {
499         coll = batch.transduce(fileNames);
500       } catch(JapeException e) {
501         batch.usage("oops (2): " + e.toString());
502       }
503     }
504 
505     // we won! we won! we can smash up all the computers now!
506     batch.message("done");
507     //System.exit(0);
508 */
509   } // main
510 
511 
512   /** Whether to print progress messages or not. */
513   private boolean verbose = false;
514 
515   /** Set verbosity. */
516   public void setVerbose(boolean turtleSoup) { verbose = turtleSoup; }
517 
518   /** You got something wrong, dumbo. */
519   public void usage(String errorMessage) {
520     String usageMessage =
521       "usage: java gate.jape.Batch.main [-v] " +
522         "-j japefile(.ser|.jape|.jar) " +
523         "(-c CollectionName | filenames)";
524 
525     Err.println(errorMessage);
526     Err.println(usageMessage);
527     // System.exit(1);
528 
529   } // usage
530 
531   /** Hello? Anybody there?? */
532   public void message(String mess) {
533     if(verbose) Out.println("Batch: " + mess);
534   } // message
535 
536   public void setFeatures(gate.FeatureMap newFeatures) {
537     features = newFeatures;
538   }
539   public gate.FeatureMap getFeatures() {
540     return features;
541   }
542   public synchronized void removeProgressListener(ProgressListener l) {
543     if (progressListeners != null && progressListeners.contains(l)) {
544       Vector v = (Vector) progressListeners.clone();
545       v.removeElement(l);
546       progressListeners = v;
547     }
548   }
549   public synchronized void addProgressListener(ProgressListener l) {
550     Vector v = progressListeners == null ? new Vector(2) : (Vector) progressListeners.clone();
551     if (!v.contains(l)) {
552       v.addElement(l);
553       progressListeners = v;
554     }
555   }
556 
557   //ProcessProgressReporter implementation ends here
558 
559   /** Are we initialising from a resource? */
560 //  private boolean fromResource = false;
561 
562   /** Path to the resources tree */
563 //  private String resPath = null;
564 
565 
566   private gate.FeatureMap features;
567   private transient Vector progressListeners;
568   private transient Vector statusListeners;
569   private boolean enableDebugging;
570 
571   protected void fireProgressChanged(int e) {
572     if (progressListeners != null) {
573       Vector listeners = progressListeners;
574       int count = listeners.size();
575       for (int i = 0; i < count; i++) {
576         ((ProgressListener) listeners.elementAt(i)).progressChanged(e);
577       }
578     }
579   }
580   protected void fireProcessFinished() {
581     if (progressListeners != null) {
582       Vector listeners = progressListeners;
583       int count = listeners.size();
584       for (int i = 0; i < count; i++) {
585         ((ProgressListener) listeners.elementAt(i)).processFinished();
586       }
587     }
588   }
589   public synchronized void removeStatusListener(StatusListener l) {
590     if (statusListeners != null && statusListeners.contains(l)) {
591       Vector v = (Vector) statusListeners.clone();
592       v.removeElement(l);
593       statusListeners = v;
594     }
595   }
596   public synchronized void addStatusListener(StatusListener l) {
597     Vector v = statusListeners == null ? new Vector(2) : (Vector) statusListeners.clone();
598     if (!v.contains(l)) {
599       v.addElement(l);
600       statusListeners = v;
601     }
602   }
603   protected void fireStatusChanged(String e) {
604     if (statusListeners != null) {
605       Vector listeners = statusListeners;
606       int count = listeners.size();
607       for (int i = 0; i < count; i++) {
608         ((StatusListener) listeners.elementAt(i)).statusChanged(e);
609       }
610     }
611   }
612 
613   /**
614    * Sets the ontology to be used by the transducers
615    * @param ontology
616    */
617   public void setOntology(gate.creole.ontology.Ontology ontology) {
618     transducer.setOntology(ontology);
619   }
620   public boolean isEnableDebugging() {
621     return enableDebugging;
622   }
623   public void setEnableDebugging(boolean enableDebugging) {
624     this.enableDebugging = enableDebugging;
625     //propagate
626     if(transducer != null) transducer.setEnableDebugging(enableDebugging);
627   }
628 
629 
630   /*
631   private void writeObject(ObjectOutputStream oos) throws IOException {
632     Out.prln("writing batch");
633     oos.defaultWriteObject();
634     Out.prln("finished writing batch");
635   } // writeObject
636   */
637 
638 } // class Batch
639 
640