1   /*
2    *  DumpingPR.java
3    *
4    *  Copyright (c) 1998-2004, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 19/10/2001
12   *
13   *  $Id: DumpingPR.java,v 1.16 2004/07/21 17:10:04 akshay Exp $
14   */
15  
16  package gate.creole.dumpingPR;
17  
18  import java.io.*;
19  import java.net.URL;
20  import java.util.*;
21  
22  import gate.*;
23  import gate.corpora.DocumentImpl;
24  import gate.creole.*;
25  import gate.util.*;
26  
27  /**
28   * This class implements a DumpingPR which exports a given set of annotation
29   * types + the original markup, back into the document's native format.
30   * The export might also include the GATE features of those annotations or
31   * not (the default). One can also control whether the export files have a
32   * new suffix (useSuffixForDumpFiles) and what this suffix is
33   * (suffixForDumpFiles). By default, a suffix is used and it is .gate.
34   */
35  public class DumpingPR extends AbstractLanguageAnalyser
36    implements ProcessingResource {
37  
38    public static final String
39      DPR_DOCUMENT_PARAMETER_NAME = "document";
40  
41    public static final String
42      DPR_ANN_SET_PARAMETER_NAME = "annotationSetName";
43  
44    public static final String
45      DPR_ANN_TYPES_PARAMETER_NAME = "annotationTypes";
46  
47    public static final String
48      DPR_DUMP_TYPES_PARAMETER_NAME = "dumpTypes";
49  
50    public static final String
51      DPR_OUTPUR_URL_PARAMETER_NAME = "outputFileUrl";
52  
53    public static final String
54      DPR_INCLUDE_FEAT_PARAMETER_NAME = "includeFeatures";
55  
56    public static final String
57      DPR_USE_SUFFIX_PARAMETER_NAME = "useSuffixForDumpFiles";
58  
59    public static final String
60      DPR_FILE_SUFFIX_PARAMETER_NAME = "suffixForDumpFiles";
61  
62    private static final boolean DEBUG = true;
63  
64    /**
65     * A list of annotation types, which are to be dumped into the output file
66     */
67    protected List annotationTypes;
68  
69    /**
70     * A list of strings specifying new names to be used instead of the original
71     * annotation types given in the annotationTypes parameter. For example, if
72     * annotationTypes was set to [Location, Date], then if dumpTypes is set to
73     * [Place, Date-expr], then the labels <Place> and <Date-expr> will be inserted
74     * instead of <Location> and <Date>.
75     */
76    protected List dumpTypes;
77  
78    /**the name of the annotation set
79     * from which to take the annotations for dumping
80     */
81    protected String annotationSetName;
82  
83    /**
84     * Whether or not to include the annotation features during export
85     */
86    protected boolean includeFeatures = false;
87  
88    /**
89     * Whether or not to include the annotation features during export
90     */
91    protected boolean useStandOffXML = false;
92  
93    /**
94     * What suffix to use for the dump files. .gate by default, but can be
95     * changed via the set method.
96     */
97    protected String suffixForDumpFiles = ".gate";
98  
99    /**
100    * Whether or not to use the special suffix fo the dump files. True by
101    * default.
102    */
103   protected boolean useSuffixForDumpFiles = true;
104 
105   protected java.net.URL outputFileUrl;
106 
107   private static final String DUMPING_PR_SET = "DumpingPRTempSet";
108 
109   /** Initialise this resource, and return it. */
110   public Resource init() throws ResourceInstantiationException
111   {
112     return super.init();
113   } // init()
114 
115   /**
116   * Reinitialises the processing resource. After calling this method the
117   * resource should be in the state it is after calling init.
118   * If the resource depends on external resources (such as rules files) then
119   * the resource will re-read those resources. If the data used to create
120   * the resource has changed since the resource has been created then the
121   * resource will change too after calling reInit().
122   */
123   public void reInit() throws ResourceInstantiationException
124   {
125     init();
126   } // reInit()
127 
128   /** Run the resource. */
129   public void execute() throws ExecutionException {
130 
131     if(document == null)
132       throw new GateRuntimeException("No document to process!");
133 
134     //if we're saving into standOffXML, then the rest of the settings do
135     //not matter because that toXML method saves everything
136     if (this.useStandOffXML) {
137       write2File();
138       return;
139     }
140 
141     AnnotationSet allAnnots;
142     // get the annotations from document
143     if ((annotationSetName == null)|| (annotationSetName.equals("")))
144       allAnnots = document.getAnnotations();
145     else
146       allAnnots = document.getAnnotations(annotationSetName);
147 
148     //if none found, print warning and exit
149     if ((allAnnots == null) || allAnnots.isEmpty()) {
150       Out.prln("DumpingPR Warning: No annotations found for export. "
151                + "Including only those from the Original markups set.");
152       write2File(null);
153       return;
154     }
155 
156     //if we're saving into standOffXML, then the rest of the settings do
157     //not matter because that toXML method saves everything
158     if (this.useStandOffXML) {
159       write2File();
160       return;
161     }
162 
163     //first transfer the annotation types from a list to a set
164     //don't I just hate this!
165     Set types2Export = new HashSet(annotationTypes);
166 
167     //then get the annotations for export
168     AnnotationSet annots2Export = allAnnots.get(types2Export);
169 
170     //check whether we want the annotations to be renamed before
171     //export (that's what dumpTypes is for)
172     if (dumpTypes != null && !dumpTypes.isEmpty()) {
173       HashMap renameMap = new HashMap();
174       for(int i=0; i<dumpTypes.size() && i<annotationTypes.size(); i++) {
175         //check if we have a corresponding annotationType and if yes,
176         //then add to the hash map for renaming
177         renameMap.put(annotationTypes.get(i), dumpTypes.get(i));
178       }//for
179       //if we have to rename annotations, then do so
180       if(!renameMap.isEmpty() && annots2Export != null)
181         annots2Export = renameAnnotations(annots2Export, renameMap);
182     }//if
183 
184     write2File(annots2Export);
185     document.removeAnnotationSet(DumpingPR.DUMPING_PR_SET);
186 
187   } // execute()
188 
189   protected void write2File(AnnotationSet exportSet) {
190     File outputFile;
191 
192 //      String source = (String) document.getParameterValue("sourceURL");
193 //      URL sourceURL = new URL(source);
194       URL sourceURL = document.getSourceUrl();
195       StringBuffer tempBuff = new StringBuffer(sourceURL.getFile());
196       //now append the special suffix if we want to use it
197       if (useSuffixForDumpFiles)
198         tempBuff.append(this.suffixForDumpFiles);
199       String outputPath = tempBuff.toString();
200       if (DEBUG)
201         Out.prln(outputPath);
202       outputFile = new File(outputPath);
203 
204     try {
205       // Prepare to write into the xmlFile using the doc's encoding if there
206       OutputStreamWriter writer;
207       if (document instanceof DocumentImpl) {
208         String encoding = ((DocumentImpl) document).getEncoding();
209         if (encoding == null || "".equals(encoding))
210           writer = new OutputStreamWriter(new FileOutputStream(outputFile));
211         else
212           writer = new OutputStreamWriter(
213                             new FileOutputStream(outputFile), encoding);
214       } else
215           writer = new OutputStreamWriter(
216                             new FileOutputStream(outputFile));
217 
218       // Write (test the toXml() method)
219       // This Action is added only when a gate.Document is created.
220       // So, is for sure that the resource is a gate.Document
221       writer.write(document.toXml(exportSet, includeFeatures));
222       writer.flush();
223       writer.close();
224     } catch (IOException ex) {
225       throw new GateRuntimeException("Dumping PR: Error writing document "
226                                      + document.getName() + ": "
227                                      + ex.getMessage());
228     }
229 
230 
231   }//write2File
232 
233   protected void write2File() {
234     File outputFile;
235 
236       URL sourceURL = document.getSourceUrl();
237       StringBuffer tempBuff = new StringBuffer(sourceURL.getFile());
238       //now append the special suffix if we want to use it
239       if (useSuffixForDumpFiles)
240         tempBuff.append(this.suffixForDumpFiles);
241       String outputPath = tempBuff.toString();
242       if (DEBUG)
243         Out.prln(outputPath);
244       outputFile = new File(outputPath);
245 
246     try {
247       // Prepare to write into the xmlFile using the doc's encoding if there
248       OutputStreamWriter writer;
249       if (document instanceof DocumentImpl) {
250         String encoding = ((DocumentImpl) document).getEncoding();
251         if (encoding == null || "".equals(encoding))
252           writer = new OutputStreamWriter(new FileOutputStream(outputFile));
253         else
254           writer = new OutputStreamWriter(
255                             new FileOutputStream(outputFile), encoding);
256       } else
257           writer = new OutputStreamWriter(
258                             new FileOutputStream(outputFile));
259 
260       // Write (test the toXml() method)
261       // This Action is added only when a gate.Document is created.
262       // So, is for sure that the resource is a gate.Document
263       writer.write(document.toXml());
264       writer.flush();
265       writer.close();
266     } catch (IOException ex) {
267       throw new GateRuntimeException("Dumping PR: Error writing document "
268                                      + document.getName() + ": "
269                                      + ex.getMessage());
270     }
271 
272 
273   }//write2File
274 
275 
276   protected AnnotationSet renameAnnotations(AnnotationSet annots2Export,
277                                    HashMap renameMap){
278     Iterator iter = annots2Export.iterator();
279     AnnotationSet as = document.getAnnotations(DUMPING_PR_SET);
280     if (!as.isEmpty())
281       as.clear();
282     while(iter.hasNext()) {
283       Annotation annot = (Annotation) iter.next();
284       //first check whether this type needs to be renamed
285       //if not, continue
286       if (!renameMap.containsKey(annot.getType()))
287         renameMap.put(annot.getType(), annot.getType());
288       try{
289         as.add(annot.getId(),
290             annot.getStartNode().getOffset(),
291             annot.getEndNode().getOffset(),
292             (String) renameMap.get(annot.getType()),
293             annot.getFeatures());
294       } catch (InvalidOffsetException ex) {
295         throw new GateRuntimeException("DumpingPR: " + ex.getMessage());
296       }
297     }//while
298     return as;
299   }//renameAnnotations
300 
301 
302   /**get the name of the annotation set*/
303   public String getAnnotationSetName() {
304     return annotationSetName;
305   }//getAnnotationSetName
306 
307   /** set the annotation set name*/
308   public void setAnnotationSetName(String newAnnotationSetName) {
309     annotationSetName = newAnnotationSetName;
310   }//setAnnotationSetName
311 
312   public List getAnnotationTypes() {
313     return this.annotationTypes;
314   }
315 
316   public void setAnnotationTypes(List newTypes) {
317     annotationTypes = newTypes;
318   }
319 
320   public List getDumpTypes() {
321     return this.dumpTypes;
322   }
323 
324   public void setDumpTypes(List newTypes) {
325     dumpTypes = newTypes;
326   }
327 
328   public URL getOutputFileUrl() {
329     return this.outputFileUrl;
330   }
331 
332   public void setOutputFileUrl(URL file) {
333     outputFileUrl = file;
334   }
335 
336   public void setIncludeFeatures(Boolean inclFeatures) {
337     if (inclFeatures != null)
338       includeFeatures = inclFeatures.booleanValue();
339   }
340 
341   public Boolean getIncludeFeatures() {
342     return new Boolean(includeFeatures);
343   }
344 
345   public void setUseStandOffXML(Boolean newValue) {
346     if (newValue != null)
347       useStandOffXML = newValue.booleanValue();
348   }
349 
350   public Boolean getUseStandOffXML() {
351     return new Boolean(useStandOffXML);
352   }
353 
354   public String getSuffixForDumpFiles() {
355     return suffixForDumpFiles;
356   }
357 
358   public void setSuffixForDumpFiles(String newSuffix) {
359     this.suffixForDumpFiles = newSuffix;
360   }
361 
362   public Boolean getUseSuffixForDumpFiles() {
363     return new Boolean(this.useSuffixForDumpFiles);
364   }
365 
366   public void setUseSuffixForDumpFiles(Boolean useOrNot) {
367     if (useOrNot != null)
368       this.useSuffixForDumpFiles = useOrNot.booleanValue();
369   }
370 
371 } // class AnnotationSetTransfer
372