|
DumpingPR |
|
1 /* 2 * DumpingPR.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Kalina Bontcheva, 19/10/2001 12 * 13 * $Id: DumpingPR.java,v 1.12 2003/07/26 16:42:40 kalina Exp $ 14 */ 15 16 package gate.creole.dumpingPR; 17 18 import java.util.*; 19 import gate.*; 20 import gate.creole.*; 21 import gate.corpora.DocumentImpl; 22 import gate.util.*; 23 import java.net.URL; 24 import java.io.*; 25 26 /** 27 * This class implements a DumpingPR which exports a given set of annotation 28 * types + the original markup, back into the document's native format. 29 * The export might also include the GATE features of those annotations or 30 * not (the default). One can also control whether the export files have a 31 * new suffix (useSuffixForDumpFiles) and what this suffix is 32 * (suffixForDumpFiles). By default, a suffix is used and it is .gate. 33 */ 34 public class DumpingPR extends AbstractLanguageAnalyser 35 implements ProcessingResource { 36 37 public static final String 38 DPR_DOCUMENT_PARAMETER_NAME = "document"; 39 40 public static final String 41 DPR_ANN_SET_PARAMETER_NAME = "annotationSetName"; 42 43 public static final String 44 DPR_ANN_TYPES_PARAMETER_NAME = "annotationTypes"; 45 46 public static final String 47 DPR_DUMP_TYPES_PARAMETER_NAME = "dumpTypes"; 48 49 public static final String 50 DPR_OUTPUR_URL_PARAMETER_NAME = "outputFileUrl"; 51 52 public static final String 53 DPR_INCLUDE_FEAT_PARAMETER_NAME = "includeFeatures"; 54 55 public static final String 56 DPR_USE_SUFFIX_PARAMETER_NAME = "useSuffixForDumpFiles"; 57 58 public static final String 59 DPR_FILE_SUFFIX_PARAMETER_NAME = "suffixForDumpFiles"; 60 61 private static final boolean DEBUG = true; 62 63 /** 64 * A list of annotation types, which are to be dumped into the output file 65 */ 66 protected List annotationTypes; 67 68 /** 69 * A list of strings specifying new names to be used instead of the original 70 * annotation types given in the annotationTypes parameter. For example, if 71 * annotationTypes was set to [Location, Date], then if dumpTypes is set to 72 * [Place, Date-expr], then the labels <Place> and <Date-expr> will be inserted 73 * instead of <Location> and <Date>. 74 */ 75 protected List dumpTypes; 76 77 /**the name of the annotation set 78 * from which to take the annotations for dumping 79 */ 80 protected String annotationSetName; 81 82 /** 83 * Whether or not to include the annotation features during export 84 */ 85 protected boolean includeFeatures = false; 86 87 /** 88 * Whether or not to include the annotation features during export 89 */ 90 protected boolean useStandOffXML = false; 91 92 /** 93 * What suffix to use for the dump files. .gate by default, but can be 94 * changed via the set method. 95 */ 96 protected String suffixForDumpFiles = ".gate"; 97 98 /** 99 * Whether or not to use the special suffix fo the dump files. True by 100 * default. 101 */ 102 protected boolean useSuffixForDumpFiles = true; 103 104 protected java.net.URL outputFileUrl; 105 106 private static final String DUMPING_PR_SET = "DumpingPRTempSet"; 107 108 /** Initialise this resource, and return it. */ 109 public Resource init() throws ResourceInstantiationException 110 { 111 return super.init(); 112 } // init() 113 114 /** 115 * Reinitialises the processing resource. After calling this method the 116 * resource should be in the state it is after calling init. 117 * If the resource depends on external resources (such as rules files) then 118 * the resource will re-read those resources. If the data used to create 119 * the resource has changed since the resource has been created then the 120 * resource will change too after calling reInit(). 121 */ 122 public void reInit() throws ResourceInstantiationException 123 { 124 init(); 125 } // reInit() 126 127 /** Run the resource. */ 128 public void execute() throws ExecutionException { 129 130 if(document == null) 131 throw new GateRuntimeException("No document to process!"); 132 133 //if we're saving into standOffXML, then the rest of the settings do 134 //not matter because that toXML method saves everything 135 if (this.useStandOffXML) { 136 write2File(); 137 return; 138 } 139 140 AnnotationSet allAnnots; 141 // get the annotations from document 142 if ((annotationSetName == null)|| (annotationSetName.equals(""))) 143 allAnnots = document.getAnnotations(); 144 else 145 allAnnots = document.getAnnotations(annotationSetName); 146 147 //if none found, print warning and exit 148 if ((allAnnots == null) || allAnnots.isEmpty()) { 149 Out.prln("DumpingPR Warning: No annotations found for export. " 150 + "Including only those from the Original markups set."); 151 write2File(null); 152 return; 153 } 154 155 //first transfer the annotation types from a list to a set 156 //don't I just hate this! 157 Set types2Export = new HashSet(annotationTypes); 158 159 //then get the annotations for export 160 AnnotationSet annots2Export = allAnnots.get(types2Export); 161 162 //check whether we want the annotations to be renamed before 163 //export (that's what dumpTypes is for) 164 if (dumpTypes != null && !dumpTypes.isEmpty()) { 165 HashMap renameMap = new HashMap(); 166 for(int i=0; i<dumpTypes.size() && i<annotationTypes.size(); i++) { 167 //check if we have a corresponding annotationType and if yes, 168 //then add to the hash map for renaming 169 renameMap.put(annotationTypes.get(i), dumpTypes.get(i)); 170 }//for 171 //if we have to rename annotations, then do so 172 if(!renameMap.isEmpty() && annots2Export != null) 173 annots2Export = renameAnnotations(annots2Export, renameMap); 174 }//if 175 176 write2File(annots2Export); 177 document.removeAnnotationSet(this.DUMPING_PR_SET); 178 179 } // execute() 180 181 protected void write2File(AnnotationSet exportSet) { 182 File outputFile; 183 184 // String source = (String) document.getParameterValue("sourceURL"); 185 // URL sourceURL = new URL(source); 186 URL sourceURL = document.getSourceUrl(); 187 StringBuffer tempBuff = new StringBuffer(sourceURL.getFile()); 188 //now append the special suffix if we want to use it 189 if (useSuffixForDumpFiles) 190 tempBuff.append(this.suffixForDumpFiles); 191 String outputPath = tempBuff.toString(); 192 if (DEBUG) 193 Out.prln(outputPath); 194 outputFile = new File(outputPath); 195 196 try { 197 // Prepare to write into the xmlFile using the doc's encoding if there 198 OutputStreamWriter writer; 199 if (document instanceof DocumentImpl) { 200 String encoding = ((DocumentImpl) document).getEncoding(); 201 if (encoding == null || "".equals(encoding)) 202 writer = new OutputStreamWriter(new FileOutputStream(outputFile)); 203 else 204 writer = new OutputStreamWriter( 205 new FileOutputStream(outputFile), encoding); 206 } else 207 writer = new OutputStreamWriter( 208 new FileOutputStream(outputFile)); 209 210 // Write (test the toXml() method) 211 // This Action is added only when a gate.Document is created. 212 // So, is for sure that the resource is a gate.Document 213 writer.write(document.toXml(exportSet, includeFeatures)); 214 writer.flush(); 215 writer.close(); 216 } catch (IOException ex) { 217 throw new GateRuntimeException("Dumping PR: Error writing document " 218 + document.getName() + ": " 219 + ex.getMessage()); 220 } 221 222 223 }//write2File 224 225 protected void write2File() { 226 File outputFile; 227 228 URL sourceURL = document.getSourceUrl(); 229 StringBuffer tempBuff = new StringBuffer(sourceURL.getFile()); 230 //now append the special suffix if we want to use it 231 if (useSuffixForDumpFiles) 232 tempBuff.append(this.suffixForDumpFiles); 233 String outputPath = tempBuff.toString(); 234 if (DEBUG) 235 Out.prln(outputPath); 236 outputFile = new File(outputPath); 237 238 try { 239 // Prepare to write into the xmlFile using the doc's encoding if there 240 OutputStreamWriter writer; 241 if (document instanceof DocumentImpl) { 242 String encoding = ((DocumentImpl) document).getEncoding(); 243 if (encoding == null || "".equals(encoding)) 244 writer = new OutputStreamWriter(new FileOutputStream(outputFile)); 245 else 246 writer = new OutputStreamWriter( 247 new FileOutputStream(outputFile), encoding); 248 } else 249 writer = new OutputStreamWriter( 250 new FileOutputStream(outputFile)); 251 252 // Write (test the toXml() method) 253 // This Action is added only when a gate.Document is created. 254 // So, is for sure that the resource is a gate.Document 255 writer.write(document.toXml()); 256 writer.flush(); 257 writer.close(); 258 } catch (IOException ex) { 259 throw new GateRuntimeException("Dumping PR: Error writing document " 260 + document.getName() + ": " 261 + ex.getMessage()); 262 } 263 264 265 }//write2File 266 267 268 protected AnnotationSet renameAnnotations(AnnotationSet annots2Export, 269 HashMap renameMap){ 270 Iterator iter = annots2Export.iterator(); 271 AnnotationSet as = document.getAnnotations(DUMPING_PR_SET); 272 if (!as.isEmpty()) 273 as.clear(); 274 while(iter.hasNext()) { 275 Annotation annot = (Annotation) iter.next(); 276 //first check whether this type needs to be renamed 277 //if not, continue 278 if (!renameMap.containsKey(annot.getType())) 279 renameMap.put(annot.getType(), annot.getType()); 280 try{ 281 as.add(annot.getId(), 282 annot.getStartNode().getOffset(), 283 annot.getEndNode().getOffset(), 284 (String) renameMap.get(annot.getType()), 285 annot.getFeatures()); 286 } catch (InvalidOffsetException ex) { 287 throw new GateRuntimeException("DumpingPR: " + ex.getMessage()); 288 } 289 }//while 290 return as; 291 }//renameAnnotations 292 293 294 /**get the name of the annotation set*/ 295 public String getAnnotationSetName() { 296 return annotationSetName; 297 }//getAnnotationSetName 298 299 /** set the annotation set name*/ 300 public void setAnnotationSetName(String newAnnotationSetName) { 301 annotationSetName = newAnnotationSetName; 302 }//setAnnotationSetName 303 304 public List getAnnotationTypes() { 305 return this.annotationTypes; 306 } 307 308 public void setAnnotationTypes(List newTypes) { 309 annotationTypes = newTypes; 310 } 311 312 public List getDumpTypes() { 313 return this.dumpTypes; 314 } 315 316 public void setDumpTypes(List newTypes) { 317 dumpTypes = newTypes; 318 } 319 320 public URL getOutputFileUrl() { 321 return this.outputFileUrl; 322 } 323 324 public void setOutputFileUrl(URL file) { 325 outputFileUrl = file; 326 } 327 328 public void setIncludeFeatures(Boolean inclFeatures) { 329 if (inclFeatures != null) 330 includeFeatures = inclFeatures.booleanValue(); 331 } 332 333 public Boolean getIncludeFeatures() { 334 return new Boolean(includeFeatures); 335 } 336 337 public void setUseStandOffXML(Boolean newValue) { 338 if (newValue != null) 339 useStandOffXML = newValue.booleanValue(); 340 } 341 342 public Boolean getUseStandOffXML() { 343 return new Boolean(useStandOffXML); 344 } 345 346 public String getSuffixForDumpFiles() { 347 return suffixForDumpFiles; 348 } 349 350 public void setSuffixForDumpFiles(String newSuffix) { 351 this.suffixForDumpFiles = newSuffix; 352 } 353 354 public Boolean getUseSuffixForDumpFiles() { 355 return new Boolean(this.useSuffixForDumpFiles); 356 } 357 358 public void setUseSuffixForDumpFiles(Boolean useOrNot) { 359 if (useOrNot != null) 360 this.useSuffixForDumpFiles = useOrNot.booleanValue(); 361 } 362 363 } // class AnnotationSetTransfer 364
|
DumpingPR |
|