1   /*
2    *  DumpingPR.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 19/10/2001
12   *
13   *  $Id: DumpingPR.java,v 1.12 2003/07/26 16:42:40 kalina Exp $
14   */
15  
16  package gate.creole.dumpingPR;
17  
18  import java.util.*;
19  import gate.*;
20  import gate.creole.*;
21  import gate.corpora.DocumentImpl;
22  import gate.util.*;
23  import java.net.URL;
24  import java.io.*;
25  
26  /**
27   * This class implements a DumpingPR which exports a given set of annotation
28   * types + the original markup, back into the document's native format.
29   * The export might also include the GATE features of those annotations or
30   * not (the default). One can also control whether the export files have a
31   * new suffix (useSuffixForDumpFiles) and what this suffix is
32   * (suffixForDumpFiles). By default, a suffix is used and it is .gate.
33   */
34  public class DumpingPR extends AbstractLanguageAnalyser
35    implements ProcessingResource {
36  
37    public static final String
38      DPR_DOCUMENT_PARAMETER_NAME = "document";
39  
40    public static final String
41      DPR_ANN_SET_PARAMETER_NAME = "annotationSetName";
42  
43    public static final String
44      DPR_ANN_TYPES_PARAMETER_NAME = "annotationTypes";
45  
46    public static final String
47      DPR_DUMP_TYPES_PARAMETER_NAME = "dumpTypes";
48  
49    public static final String
50      DPR_OUTPUR_URL_PARAMETER_NAME = "outputFileUrl";
51  
52    public static final String
53      DPR_INCLUDE_FEAT_PARAMETER_NAME = "includeFeatures";
54  
55    public static final String
56      DPR_USE_SUFFIX_PARAMETER_NAME = "useSuffixForDumpFiles";
57  
58    public static final String
59      DPR_FILE_SUFFIX_PARAMETER_NAME = "suffixForDumpFiles";
60  
61    private static final boolean DEBUG = true;
62  
63    /**
64     * A list of annotation types, which are to be dumped into the output file
65     */
66    protected List annotationTypes;
67  
68    /**
69     * A list of strings specifying new names to be used instead of the original
70     * annotation types given in the annotationTypes parameter. For example, if
71     * annotationTypes was set to [Location, Date], then if dumpTypes is set to
72     * [Place, Date-expr], then the labels <Place> and <Date-expr> will be inserted
73     * instead of <Location> and <Date>.
74     */
75    protected List dumpTypes;
76  
77    /**the name of the annotation set
78     * from which to take the annotations for dumping
79     */
80    protected String annotationSetName;
81  
82    /**
83     * Whether or not to include the annotation features during export
84     */
85    protected boolean includeFeatures = false;
86  
87    /**
88     * Whether or not to include the annotation features during export
89     */
90    protected boolean useStandOffXML = false;
91  
92    /**
93     * What suffix to use for the dump files. .gate by default, but can be
94     * changed via the set method.
95     */
96    protected String suffixForDumpFiles = ".gate";
97  
98    /**
99     * Whether or not to use the special suffix fo the dump files. True by
100    * default.
101    */
102   protected boolean useSuffixForDumpFiles = true;
103 
104   protected java.net.URL outputFileUrl;
105 
106   private static final String DUMPING_PR_SET = "DumpingPRTempSet";
107 
108   /** Initialise this resource, and return it. */
109   public Resource init() throws ResourceInstantiationException
110   {
111     return super.init();
112   } // init()
113 
114   /**
115   * Reinitialises the processing resource. After calling this method the
116   * resource should be in the state it is after calling init.
117   * If the resource depends on external resources (such as rules files) then
118   * the resource will re-read those resources. If the data used to create
119   * the resource has changed since the resource has been created then the
120   * resource will change too after calling reInit().
121   */
122   public void reInit() throws ResourceInstantiationException
123   {
124     init();
125   } // reInit()
126 
127   /** Run the resource. */
128   public void execute() throws ExecutionException {
129 
130     if(document == null)
131       throw new GateRuntimeException("No document to process!");
132 
133     //if we're saving into standOffXML, then the rest of the settings do
134     //not matter because that toXML method saves everything
135     if (this.useStandOffXML) {
136       write2File();
137       return;
138     }
139 
140     AnnotationSet allAnnots;
141     // get the annotations from document
142     if ((annotationSetName == null)|| (annotationSetName.equals("")))
143       allAnnots = document.getAnnotations();
144     else
145       allAnnots = document.getAnnotations(annotationSetName);
146 
147     //if none found, print warning and exit
148     if ((allAnnots == null) || allAnnots.isEmpty()) {
149       Out.prln("DumpingPR Warning: No annotations found for export. "
150                + "Including only those from the Original markups set.");
151       write2File(null);
152       return;
153     }
154 
155     //first transfer the annotation types from a list to a set
156     //don't I just hate this!
157     Set types2Export = new HashSet(annotationTypes);
158 
159     //then get the annotations for export
160     AnnotationSet annots2Export = allAnnots.get(types2Export);
161 
162     //check whether we want the annotations to be renamed before
163     //export (that's what dumpTypes is for)
164     if (dumpTypes != null && !dumpTypes.isEmpty()) {
165       HashMap renameMap = new HashMap();
166       for(int i=0; i<dumpTypes.size() && i<annotationTypes.size(); i++) {
167         //check if we have a corresponding annotationType and if yes,
168         //then add to the hash map for renaming
169         renameMap.put(annotationTypes.get(i), dumpTypes.get(i));
170       }//for
171       //if we have to rename annotations, then do so
172       if(!renameMap.isEmpty() && annots2Export != null)
173         annots2Export = renameAnnotations(annots2Export, renameMap);
174     }//if
175 
176     write2File(annots2Export);
177     document.removeAnnotationSet(this.DUMPING_PR_SET);
178 
179   } // execute()
180 
181   protected void write2File(AnnotationSet exportSet) {
182     File outputFile;
183 
184 //      String source = (String) document.getParameterValue("sourceURL");
185 //      URL sourceURL = new URL(source);
186       URL sourceURL = document.getSourceUrl();
187       StringBuffer tempBuff = new StringBuffer(sourceURL.getFile());
188       //now append the special suffix if we want to use it
189       if (useSuffixForDumpFiles)
190         tempBuff.append(this.suffixForDumpFiles);
191       String outputPath = tempBuff.toString();
192       if (DEBUG)
193         Out.prln(outputPath);
194       outputFile = new File(outputPath);
195 
196     try {
197       // Prepare to write into the xmlFile using the doc's encoding if there
198       OutputStreamWriter writer;
199       if (document instanceof DocumentImpl) {
200         String encoding = ((DocumentImpl) document).getEncoding();
201         if (encoding == null || "".equals(encoding))
202           writer = new OutputStreamWriter(new FileOutputStream(outputFile));
203         else
204           writer = new OutputStreamWriter(
205                             new FileOutputStream(outputFile), encoding);
206       } else
207           writer = new OutputStreamWriter(
208                             new FileOutputStream(outputFile));
209 
210       // Write (test the toXml() method)
211       // This Action is added only when a gate.Document is created.
212       // So, is for sure that the resource is a gate.Document
213       writer.write(document.toXml(exportSet, includeFeatures));
214       writer.flush();
215       writer.close();
216     } catch (IOException ex) {
217       throw new GateRuntimeException("Dumping PR: Error writing document "
218                                      + document.getName() + ": "
219                                      + ex.getMessage());
220     }
221 
222 
223   }//write2File
224 
225   protected void write2File() {
226     File outputFile;
227 
228       URL sourceURL = document.getSourceUrl();
229       StringBuffer tempBuff = new StringBuffer(sourceURL.getFile());
230       //now append the special suffix if we want to use it
231       if (useSuffixForDumpFiles)
232         tempBuff.append(this.suffixForDumpFiles);
233       String outputPath = tempBuff.toString();
234       if (DEBUG)
235         Out.prln(outputPath);
236       outputFile = new File(outputPath);
237 
238     try {
239       // Prepare to write into the xmlFile using the doc's encoding if there
240       OutputStreamWriter writer;
241       if (document instanceof DocumentImpl) {
242         String encoding = ((DocumentImpl) document).getEncoding();
243         if (encoding == null || "".equals(encoding))
244           writer = new OutputStreamWriter(new FileOutputStream(outputFile));
245         else
246           writer = new OutputStreamWriter(
247                             new FileOutputStream(outputFile), encoding);
248       } else
249           writer = new OutputStreamWriter(
250                             new FileOutputStream(outputFile));
251 
252       // Write (test the toXml() method)
253       // This Action is added only when a gate.Document is created.
254       // So, is for sure that the resource is a gate.Document
255       writer.write(document.toXml());
256       writer.flush();
257       writer.close();
258     } catch (IOException ex) {
259       throw new GateRuntimeException("Dumping PR: Error writing document "
260                                      + document.getName() + ": "
261                                      + ex.getMessage());
262     }
263 
264 
265   }//write2File
266 
267 
268   protected AnnotationSet renameAnnotations(AnnotationSet annots2Export,
269                                    HashMap renameMap){
270     Iterator iter = annots2Export.iterator();
271     AnnotationSet as = document.getAnnotations(DUMPING_PR_SET);
272     if (!as.isEmpty())
273       as.clear();
274     while(iter.hasNext()) {
275       Annotation annot = (Annotation) iter.next();
276       //first check whether this type needs to be renamed
277       //if not, continue
278       if (!renameMap.containsKey(annot.getType()))
279         renameMap.put(annot.getType(), annot.getType());
280       try{
281         as.add(annot.getId(),
282             annot.getStartNode().getOffset(),
283             annot.getEndNode().getOffset(),
284             (String) renameMap.get(annot.getType()),
285             annot.getFeatures());
286       } catch (InvalidOffsetException ex) {
287         throw new GateRuntimeException("DumpingPR: " + ex.getMessage());
288       }
289     }//while
290     return as;
291   }//renameAnnotations
292 
293 
294   /**get the name of the annotation set*/
295   public String getAnnotationSetName() {
296     return annotationSetName;
297   }//getAnnotationSetName
298 
299   /** set the annotation set name*/
300   public void setAnnotationSetName(String newAnnotationSetName) {
301     annotationSetName = newAnnotationSetName;
302   }//setAnnotationSetName
303 
304   public List getAnnotationTypes() {
305     return this.annotationTypes;
306   }
307 
308   public void setAnnotationTypes(List newTypes) {
309     annotationTypes = newTypes;
310   }
311 
312   public List getDumpTypes() {
313     return this.dumpTypes;
314   }
315 
316   public void setDumpTypes(List newTypes) {
317     dumpTypes = newTypes;
318   }
319 
320   public URL getOutputFileUrl() {
321     return this.outputFileUrl;
322   }
323 
324   public void setOutputFileUrl(URL file) {
325     outputFileUrl = file;
326   }
327 
328   public void setIncludeFeatures(Boolean inclFeatures) {
329     if (inclFeatures != null)
330       includeFeatures = inclFeatures.booleanValue();
331   }
332 
333   public Boolean getIncludeFeatures() {
334     return new Boolean(includeFeatures);
335   }
336 
337   public void setUseStandOffXML(Boolean newValue) {
338     if (newValue != null)
339       useStandOffXML = newValue.booleanValue();
340   }
341 
342   public Boolean getUseStandOffXML() {
343     return new Boolean(useStandOffXML);
344   }
345 
346   public String getSuffixForDumpFiles() {
347     return suffixForDumpFiles;
348   }
349 
350   public void setSuffixForDumpFiles(String newSuffix) {
351     this.suffixForDumpFiles = newSuffix;
352   }
353 
354   public Boolean getUseSuffixForDumpFiles() {
355     return new Boolean(this.useSuffixForDumpFiles);
356   }
357 
358   public void setUseSuffixForDumpFiles(Boolean useOrNot) {
359     if (useOrNot != null)
360       this.useSuffixForDumpFiles = useOrNot.booleanValue();
361   }
362 
363 } // class AnnotationSetTransfer
364