|
APFormatExporter |
|
/*
 *  APFormatExporter.java
 *
 *  Copyright (c) 1998-2001, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Cristian URSU, 26/Oct/2001
 *
 *  $Id: APFormatExporter.java,v 1.24 2002/09/05 13:01:25 diana Exp $
 */

package gate.creole;

import gate.*;
import gate.creole.orthomatcher.*;
import gate.creole.ANNIEConstants;
import gate.util.*;

import java.util.*;
import java.net.*;
import java.io.*;

/** This class implements an APF xml exporter. It works on documents or corpora
  * to export them in the APF format.
  * <p>
  * For the current document it builds the whole APF xml in memory and then
  * writes it to a file named after the document's source URL with the
  * suffix <code>.apf.xml</code>.
  * <p>
  * Note: the entity and mention ID counters are instance fields that are never
  * reset, so the numbers keep growing over successive calls of
  * {@link #execute()} on the same exporter instance.
  */
public class APFormatExporter extends AbstractLanguageAnalyser
                              implements ANNIEConstants{
  public static final String
    APF_EXP_DOCUMENT_PARAMETER_NAME = "document";

  public static final String
    APF_EXP_SOURCE_PARAMETER_NAME = "source";

  public static final String
    APF_EXP_DTD_PARAMETER_NAME = "dtdFileName";

  public static final String
    APF_EXP_PATH_PARAMETER_NAME = "exportFilePath";

  public static final String
    APF_EXP_TYPES_PARAMETER_NAME = "exportedTypes";

  public static final String
    APF_EXP_WRITE_SOURCE_PARAMETER_NAME = "isSourceWritten";

  /** Debug flag */
  private static final boolean DEBUG = false;

  /** Constructor does nothing. This PR is initialised bean-style, through its
    * setters.
    */
  public APFormatExporter() {}

  /** Runs the resource: serialises the current document to APF and writes the
    * result to the export file.
    * <p>
    * The export file is <code>&lt;source file&gt;.apf.xml</code> when no export
    * path was set, otherwise
    * <code>&lt;exportFilePath&gt;/&lt;source file name&gt;.apf.xml</code>.
    *
    * @throws ExecutionException if there is no document, no exported types,
    * no source URL, the document ID cannot be derived from the file name, or
    * anything fails while serialising or writing the file.
    */
  public void execute() throws ExecutionException{
    // Check if the thing can be run
    if (document == null) {
      throw new ExecutionException("No document found to export in APF format!");
    }
    if (exportedTypes == null) {
      throw new ExecutionException("No export types found.");
    }
    // Both the document ID and the export file name are derived from the
    // source URL, so fail cleanly instead of with a NullPointerException.
    if (document.getSourceUrl() == null) {
      throw new ExecutionException(
        "The document has no source URL; cannot export it in APF format!");
    }
    xmlDoc = new StringBuffer(10*(document.getContent().size().intValue()));
    initDocId();
    if (docId == null) {
      throw new ExecutionException("Couldn't detect the document's ID");
    }
    if (DEBUG) {
      Out.prln("Document id = "+ docId);
    }

    String sourceFile = document.getSourceUrl().getFile();
    String exportFilePathStr = null;
    if (exportFilePath == null) {
      exportFilePathStr = sourceFile + ".apf.xml";
    } else {
      exportFilePathStr = exportFilePath.getPath()+ "/"
        + gate.util.Files.getLastPathComponent(sourceFile) + ".apf.xml";
    }

    if (DEBUG) {
      Out.prln("Export file path = "+ exportFilePathStr);
    }

    // Prepare to write into the xmlFile
    OutputStreamWriter writer = null;
    try{
      // The xml declaration written below carries no encoding, which an XML
      // parser reads as UTF-8; write the bytes accordingly instead of relying
      // on the platform default encoding.
      writer = new OutputStreamWriter(
        new FileOutputStream(new File(exportFilePathStr)), "UTF-8");
      serializeDocumentToAPF();
      writer.write(xmlDoc.toString());
      writer.flush();
    }catch (Exception e){
      throw new ExecutionException(e);
    }finally{
      // Always release the file handle, also when serialisation or writing
      // failed half way.
      if (writer != null) {
        try{
          writer.close();
        }catch (IOException ioe){
          Err.prln("APFormatExporter:Warning: Couldn't close the export file " +
            exportFilePathStr);
        }// End try
      }// End if
    }// End try
  } // execute()


  /** Initialise this resource, and returns it. */
  public Resource init() throws ResourceInstantiationException {
    return this;
  } // init()

  /** Java bean style mutator for exportedTypes */
  public void setExportedTypes(List anExportedTypesList){
    exportedTypes = anExportedTypesList;
  }// setExportedTypes();

  /** Java bean style accessor for exportedTypes */
  public List getExportedTypes(){
    return exportedTypes;
  }// getExportedTypes()

  /** Java bean style mutator for dtdFileName */
  public void setDtdFileName(String aDtdFileName){
    dtdFileName = aDtdFileName;
  }// setDtdFileName();

  /** Java bean style accessor for dtdFileName */
  public String getDtdFileName(){
    return dtdFileName;
  }// getDtdFileName()

  /** Java bean style mutator for exportFilePath */
  public void setExportFilePath(URL anExportFilePath){
    exportFilePath = anExportFilePath;
  }// setExportFilePath();

  /** Java bean style accessor for exportFilePath */
  public URL getExportFilePath(){
    return exportFilePath;
  }// getExportFilePath()

  /** Java bean style mutator for source */
  public void setSource(String aSource){
    source = aSource;
  }// setSource();

  /** Java bean style accessor for source */
  public String getSource(){
    return source;
  }// getSource()

  /** Java bean style accessor for isSourceWritten */
  public Boolean getIsSourceWritten() {
    return Boolean.valueOf(isSourceWritten);
  }// getIsSourceWritten()

  /** Java bean style mutator for isSourceWritten */
  public void setIsSourceWritten(Boolean aIsSourceWritten){
    isSourceWritten = aIsSourceWritten.booleanValue();
  }// setIsSourceWritten();


  /** Initialises docId with the document's file name, without the path and
    * without the last dot separated token (the extension).
    * <p>
    * docId is left <code>null</code> when the file name has no extension to
    * strip; {@link #execute()} reports that as an error.
    */
  private void initDocId(){
    // Never keep the ID of a previously exported document
    docId = null;
    String fileName = gate.util.Files.getLastPathComponent(
      document.getSourceUrl().getFile());
    if (DEBUG) {
      Out.prln("From initDocId, fileName ="+ fileName);
    }
    StringTokenizer fileNameTokenizer = new StringTokenizer(fileName,".");
    StringBuffer tmpDocId = new StringBuffer("");
    while(fileNameTokenizer.hasMoreTokens()){
      String token = fileNameTokenizer.nextToken();
      // We don't want to append the last token
      if (fileNameTokenizer.hasMoreTokens()) {
        tmpDocId.append(token + ".");
      }
    }// End while
    // Only if something was collected. (Comparing the buffer itself with ""
    // is always false, so test its length.)
    if (tmpDocId.length() > 0){
      // Remove the last dot
      tmpDocId.setLength(tmpDocId.length() - 1);
      docId = tmpDocId.toString();
    }// End if
  }// initDocId()

  /** Appends to xmlDoc the xml document conforming to the APF dtd: the xml
    * declaration, the DOCTYPE, the source_file and document elements and all
    * the exported entities.
    */
  protected void serializeDocumentToAPF(){
    xmlDoc.append("<?xml version=\"1.0\" ?>\n");
    xmlDoc.append("<!DOCTYPE source_file SYSTEM ");
    if (dtdFileName == null) {
      xmlDoc.append("\"ace-rdc.v2.0.1.dtd\"");
    } else {
      xmlDoc.append("\""+dtdFileName+"\"");
    }
    xmlDoc.append(">\n");
    xmlDoc.append("<source_file TYPE=\"text\"");
    if (isSourceWritten) {
      // Prefer the SOURCE feature of a DOCTYPE annotation from the original
      // markups; fall back to the "source" parameter.
      Object sourceValue = source;
      AnnotationSet docTypeAnns = document.getAnnotations(
        GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get("DOCTYPE");
      if (docTypeAnns != null && !docTypeAnns.isEmpty()) {
        Annotation docTypeAnn = (Annotation) docTypeAnns.iterator().next();
        Object docTypeSource = docTypeAnn.getFeatures().get("SOURCE");
        if (docTypeSource != null) {
          sourceValue = docTypeSource;
        }
      }//if there are doc type annotations
      xmlDoc.append(" SOURCE=\""+ escapeAttribute(sourceValue) + "\"");
    }
    // The leading space keeps the attributes apart also when no SOURCE
    // attribute was written.
    xmlDoc.append(" VERSION=\"2.0\" URI=\"");
    xmlDoc.append(escapeAttribute(docId));
    xmlDoc.append("-lf\">\n");
    xmlDoc.append(" <document DOCID=\"");
    xmlDoc.append(escapeAttribute(docId) + "\">\n");
    serializeEntities();
    xmlDoc.append(" </document>\n");
    xmlDoc.append("</source_file>");
  }// serializeDocumentToAPF()

  /** Transforms all the entities from exportedTypes found in the GATE document
    * into their xml representation.
    * <p>
    * The coreference information is read from the document feature
    * DOCUMENT_COREF_FEATURE_NAME: a map from annotation set names (the
    * <code>null</code> key designates the default annotation set) to a list
    * of lists, each inner list holding the IDs of the annotations that refer
    * to the same entity. Annotations of an exported type that appear in no
    * such list are serialized as entities with a single mention.
    */
  protected void serializeEntities(){
    // If no types were found then simply return
    if (exportedTypes == null || exportedTypes.isEmpty()) return;

    Map entitiesMap = null;
    if (document.getFeatures() != null) {
      entitiesMap = (Map) document.getFeatures().
                                  get(DOCUMENT_COREF_FEATURE_NAME);
    }
    if (entitiesMap == null) {
      entitiesMap = new HashMap();
    }

    Map namedAnnotSetMap = null;
    if (document.getNamedAnnotationSets() == null) {
      namedAnnotSetMap = new HashMap();
    } else {
      namedAnnotSetMap = new HashMap(document.getNamedAnnotationSets());
    }
    // Add the default annotation set
    namedAnnotSetMap.put(null,document.getAnnotations());

    // All the entities that are in the exportedTypes need to be serialized.
    Iterator exportedTypesIter = exportedTypes.iterator();
    while(exportedTypesIter.hasNext()){
      String entityType = (String)exportedTypesIter.next();
      Iterator annotationSetNamesIter = namedAnnotSetMap.keySet().iterator();
      while (annotationSetNamesIter.hasNext()){
        Object annotSetName = annotationSetNamesIter.next();
        AnnotationSet annotSet =
          (AnnotationSet) namedAnnotSetMap.get(annotSetName);
        if (annotSet == null) continue;
        // The entities (lists of annotation IDs) found in this annotation set
        List entitiesList = (List) entitiesMap.get(annotSetName);
        if (entitiesList == null) entitiesList = new ArrayList();
        serializeEntitiesOfType(entityType, annotSet, entitiesList);
      }// End while(annotationSetNamesIter.hasNext())
    }// End while(exportedTypesIter.hasNext())
  }// serializeEntities()

  /** Serializes all the annotations of one type from one annotation set.
    * First the coreference chains whose first annotation has the given type
    * are written, one entity per chain; then every remaining annotation of
    * that type is written as an entity of its own.
    * @param entityType the annotation type being exported
    * @param annotSet the annotation set the annotations are taken from
    * @param entitiesList the coreference chains (lists of annotation IDs)
    * recorded for this annotation set; never null
    */
  private void serializeEntitiesOfType(String entityType,
                                       AnnotationSet annotSet,
                                       List entitiesList){
    AnnotationSet annotsOfType = annotSet.get(entityType);
    if (annotsOfType == null) return;
    // The annotations of entityType that still have to be serialized
    Set remainingAnnots = new HashSet(annotsOfType);

    Iterator entitiesListIter = entitiesList.iterator();
    while (entitiesListIter.hasNext()){
      List entity = (List)entitiesListIter.next();
      if (entity == null || entity.isEmpty()) continue;
      // The type of a chain is the type of its first annotation
      Annotation first = (Annotation) annotSet.get((Integer) entity.get(0));
      if (first == null) continue;
      String firstType = first.getType();
      if (firstType == null || !firstType.equals(entityType)) continue;

      List ent = new ArrayList();
      Iterator entityIter = entity.iterator();
      while(entityIter.hasNext()){
        Integer id = (Integer)entityIter.next();
        Annotation member = (Annotation) annotSet.get(id);
        // IDs that are not in this annotation set produce no mention
        if (member != null) ent.add(member);
      }// End while
      serializeAnEntity(ent);
      // These annotations must not be serialized again on their own
      remainingAnnots.removeAll(ent);
    }// End while(entitiesListIter.hasNext())

    // Serialize the remaining annotations, one entity each
    Iterator remainingAnnotsIter = remainingAnnots.iterator();
    while(remainingAnnotsIter.hasNext()){
      List ent = new ArrayList();
      ent.add(remainingAnnotsIter.next());
      serializeAnEntity(ent);
    }// End while(remainingAnnotsIter.hasNext())
  }// serializeEntitiesOfType()

  /** Writes an entity in the xmlDoc conforming to APF standards.
    * @param anEntity represents a list with annotations that refer the same
    * entity. Those annotations were detected and constructed by the
    * orthomatcher. A null or empty list writes nothing.
    */
  private void serializeAnEntity(List anEntity){
    if (anEntity == null || anEntity.isEmpty()) return;
    // Write the entity tag
    xmlDoc.append(" <entity ID=\"" + escapeAttribute(docId) + "-" +
                  getNextEntityId() + "\">\n");
    // We know for sure that the list is not empty (see above); the first
    // annotation gives the type of the entity
    Annotation a = (Annotation) anEntity.get(0);
    xmlDoc.append(" <entity_type GENERIC=\"FALSE\">" +
                  a.getType().toUpperCase() + "</entity_type>\n");
    // Write the entity mentions
    Iterator anEntityIter = anEntity.iterator();
    while(anEntityIter.hasNext()){
      serializeAnEntityMention((Annotation)anEntityIter.next());
    }// End while(anEntityIter.hasNext())
    // Write the entity attributes
    xmlDoc.append(" <entity_attributes>\n");
    anEntityIter = anEntity.iterator();
    while(anEntityIter.hasNext()){
      serializeAnEntityAttributes((Annotation)anEntityIter.next());
    }// End while(anEntityIter.hasNext())
    xmlDoc.append(" </entity_attributes>\n");
    xmlDoc.append(" </entity>\n");
  }// End serializeAnEntity();

  /** This method serializes an entity mention from an Annotation.
    * The mention TYPE defaults to "NAME" unless the annotation has an
    * ENTITY_MENTION_TYPE feature; ROLE, REFERENCE and GENERIC are written
    * only when the annotation has those features. The extent and the head
    * both cover the whole annotation.
    */
  private void serializeAnEntityMention(Annotation ann){
    if (ann == null) return;
    String entityMentionType = "NAME";
    String entityMentionRole = null;
    String entityMentionReference = null;
    String entityMentionGeneric = null;

    FeatureMap fm = ann.getFeatures();
    if (fm != null){
      if (null != fm.get("ENTITY_MENTION_TYPE")) {
        entityMentionType = (String) fm.get("ENTITY_MENTION_TYPE");
      }
      entityMentionRole = (String) fm.get("ROLE");
      entityMentionReference = (String) fm.get("REFERENCE");
      entityMentionGeneric = (String) fm.get("GENERIC");
    }// End if

    /* modified by Di - the new scorer needs a unique ID for each mention as well */
    // Every attribute is written with its own leading space, so that
    // consecutive attributes are always separated.
    xmlDoc.append(" <entity_mention TYPE=\"" +
                  escapeAttribute(entityMentionType) + "\"");
    appendOptionalAttribute("ROLE", entityMentionRole);
    appendOptionalAttribute("REFERENCE", entityMentionReference);
    appendOptionalAttribute("GENERIC", entityMentionGeneric);
    xmlDoc.append(" ID=\"" + "M" + getNextMentionId() +"\">\n");

    // extent
    xmlDoc.append(" <extent>\n");
    serializeCharseq(ann);
    xmlDoc.append(" </extent>\n");
    // head
    xmlDoc.append(" <head>\n");
    serializeCharseq(ann);
    xmlDoc.append(" </head>\n");
    xmlDoc.append(" </entity_mention>\n");
  }//serializeAnEntityMention();

  /** This method serializes an entity attribute (a name element) from an
    * Annotation. Only annotations whose ENTITY_MENTION_TYPE feature is "NAME"
    * or missing produce a name.
    */
  private void serializeAnEntityAttributes(Annotation ann){
    if (ann == null) return;
    // An annotation without features has no ENTITY_MENTION_TYPE either
    FeatureMap fm = ann.getFeatures();
    Object mentionType = (fm == null) ? null : fm.get("ENTITY_MENTION_TYPE");
    if (mentionType != null && !"NAME".equals(mentionType)) return;

    // name
    xmlDoc.append(" <name>\n");
    serializeCharseq(ann);
    xmlDoc.append(" </name>\n");
  }//serializeAnEntityAttributes();

  /** Writes the charseq element covering an annotation: a comment with the
    * covered text, followed by the start offset and the inclusive end offset
    * (the annotation's end offset minus one).
    * When the text cannot be read a warning is printed and only the comment
    * is left out.
    */
  private void serializeCharseq(Annotation ann){
    Long startOffset = ann.getStartNode().getOffset();
    Long endOffset = ann.getEndNode().getOffset();
    xmlDoc.append(" <charseq>\n");
    try{
      xmlDoc.append(" <!-- string = \"" +
        escapeComment(String.valueOf(
          document.getContent().getContent(startOffset, endOffset))) +
        "\" -->\n");
    }catch (InvalidOffsetException ioe){
      Err.prln("APFormatExporter:Warning: Couldn't access text between"+
        " offsets:" + startOffset + " and "+ endOffset);
    }// End try
    xmlDoc.append(" <start>"+ startOffset +
      "</start><end>"+ (endOffset.longValue() - 1) +"</end>\n");
    xmlDoc.append(" </charseq>\n");
  }// serializeCharseq()

  /** Appends <code> name="value"</code> (with a leading space) to xmlDoc,
    * or nothing when the value is null.
    */
  private void appendOptionalAttribute(String name, String value){
    if (value == null) return;
    xmlDoc.append(" " + name + "=\"" + escapeAttribute(value) + "\"");
  }// appendOptionalAttribute()

  /** Returns the string form of a value with the characters that are not
    * allowed inside a double quoted xml attribute value replaced by entity
    * references. A null value gives "null", as string concatenation would.
    */
  private static String escapeAttribute(Object value){
    String text = String.valueOf(value);
    StringBuffer escaped = new StringBuffer(text.length() + 16);
    for (int i = 0; i < text.length(); i++){
      char c = text.charAt(i);
      switch (c){
        case '&': escaped.append("&amp;"); break;
        case '<': escaped.append("&lt;"); break;
        case '>': escaped.append("&gt;"); break;
        case '"': escaped.append("&quot;"); break;
        default: escaped.append(c);
      }// End switch
    }// End for
    return escaped.toString();
  }// escapeAttribute()

  /** Returns the text with a space inserted between consecutive hyphens,
    * because "--" is not allowed inside an xml comment.
    */
  private static String escapeComment(String text){
    StringBuffer safe = new StringBuffer(text.length() + 8);
    for (int i = 0; i < text.length(); i++){
      char c = text.charAt(i);
      if (c == '-' && safe.length() > 0
          && safe.charAt(safe.length() - 1) == '-') {
        safe.append(' ');
      }
      safe.append(c);
    }// End for
    return safe.toString();
  }// escapeComment()

  /** Returns the next safe ID for an entity*/
  private int getNextEntityId(){
    return entityId ++;
  }// getNextEntityId()

  /** added by Di - returns the next safe ID for an entity mention */
  private int getNextMentionId(){
    return mentionId ++;
  }// getNextMentionId()


  /** This list of strings represents the entity types that will be exported*/
  private List exportedTypes = null;

  /** This is the name of the dtd file written in the DOCTYPE declaration.
    * When it is not set, "ace-rdc.v2.0.1.dtd" is written instead.
    */
  private String dtdFileName = null;

  /** This field represents the document id and it is used in generating the
    * entities IDs. It is the file name of the document, without the extension
    */
  private String docId = null;

  /** This field represents a unique entity ID generator*/
  private int entityId = 1;

  /** added by Di - this field represents a unique mention ID generator */
  private int mentionId = 1;

  /** This is the xmlDoc that will be created*/
  private StringBuffer xmlDoc = null;

  /** The location under which the export file is created. When it is not set
    * the file is created next to the document's source file.
    */
  private URL exportFilePath = null;

  /** The value of the SOURCE attribute, used when the document itself does
    * not provide one
    */
  private String source = null;

  /** Whether the SOURCE attribute is written at all*/
  private boolean isSourceWritten = true;


}// APFormatExporter
|
APFormatExporter |
|