1   /*
2    *  APFormatExporter.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU, 26/Oct/2001
12   *
13   *  $Id: APFormatExporter.java,v 1.24 2002/09/05 13:01:25 diana Exp $
14   */
15  
16  package gate.creole;
17  
18  import gate.*;
19  import gate.creole.orthomatcher.*;
20  import gate.creole.ANNIEConstants;
21  import gate.util.*;
22  
23  import java.util.*;
24  import java.net.*;
25  import java.io.*;
26  
27  /** This class implements a APF xml exporter. It works on documents or corpora
28    * to export them in the APF format.
29    */
30  public class APFormatExporter extends AbstractLanguageAnalyser
31                                implements ANNIEConstants{
32    public static final String
33      APF_EXP_DOCUMENT_PARAMETER_NAME = "document";
34  
35    public static final String
36      APF_EXP_SOURCE_PARAMETER_NAME = "source";
37  
38    public static final String
39      APF_EXP_DTD_PARAMETER_NAME = "dtdFileName";
40  
41    public static final String
42      APF_EXP_PATH_PARAMETER_NAME = "exportFilePath";
43  
44    public static final String
45      APF_EXP_TYPES_PARAMETER_NAME = "exportedTypes";
46  
47    public static final String
48      APF_EXP_WRITE_SOURCE_PARAMETER_NAME = "isSourceWritten";
49  
50    /** Debug flag */
51    private static final boolean DEBUG = false;
52    /** Constructor does nothing. This PR is bean like initialized*/
53    public APFormatExporter() {}
54  
55    /** Run the resource and does the entire export process*/
56    public void execute() throws ExecutionException{
57      // Check if the thing can be run
58      if(document == null)
59        throw new ExecutionException("No document found to export in APF format!");
60      if (exportedTypes == null)
61        throw new ExecutionException("No export types found.");
62      xmlDoc = new StringBuffer(10*(document.getContent().size().intValue()));
63      initDocId();
64      if (docId == null)
65        throw new ExecutionException("Couldn't detect the document's ID");
66      if (DEBUG)
67        Out.prln("Document id = "+ docId);
68  
69      String exportFilePathStr = null;
70      if (exportFilePath == null)
71        exportFilePathStr = new String(document.getSourceUrl().getFile() +
72                                                                    ".apf.xml");
73      else
74        exportFilePathStr = exportFilePath.getPath()+ "/"
75            + gate.util.Files.getLastPathComponent(
76                document.getSourceUrl().getFile()) + ".apf.xml";
77  
78      if (DEBUG)
79        Out.prln("Export file path = "+ exportFilePathStr);
80  //*
81      // Prepare to write into the xmlFile
82      OutputStreamWriter writer = null;
83      try{
84        writer = new OutputStreamWriter(
85                new FileOutputStream(new File(exportFilePathStr)));
86  
87        // Write (test the toXml() method)
88        // This Action is added only when a gate.Document is created.
89        // So, is Bor sure that the resource is a gate.Document
90        serializeDocumentToAPF();
91        writer.write(xmlDoc.toString());
92        writer.flush();
93        writer.close();
94      }catch (Exception e){
95        throw new ExecutionException(e);
96      }// End try
97  //*/
98    } // execute()
99  
100 
101   /** Initialise this resource, and returns it. */
102   public Resource init() throws ResourceInstantiationException {
103     return this;
104   } // init()
105 
106   /** Java bean style mutator for exportedTypes */
107   public void setExportedTypes(List anExportedTypesList){
108     exportedTypes = anExportedTypesList;
109   }// setExportedTypes();
110 
111   /** Java bean style accesor for exportedTypes */
112   public List getExportedTypes(){
113     return exportedTypes;
114   }// getExportedTypes()
115 
116   /** Java bean style mutator for dtdFileName */
117   public void setDtdFileName(String aDtdFileName){
118     dtdFileName = aDtdFileName;
119   }// setDtdFileName();
120 
121   /** Java bean style accesor for DtdFileName */
122   public String getDtdFileName(){
123     return dtdFileName;
124   }// getDtdFileName()
125 
126   /** Java bean style mutator for exportFilePath */
127   public void setExportFilePath(URL anExportFilePath){
128     exportFilePath = anExportFilePath;
129   }// setExportFilePath();
130 
131   /** Java bean style accesor for exportFilePath */
132   public URL getExportFilePath(){
133     return exportFilePath;
134   }// getDtdFileName()
135 
136   /** Java bean style mutator for source */
137   public void setSource(String aSource){
138     source = aSource;
139   }// setSource();
140 
141   /** Java bean style accesor for source */
142   public String getSource(){
143     return source;
144   }// getSource()
145 
146   /** Java bean style accesor for isSourceWritten */
147   public Boolean getIsSourceWritten() {
148     return new Boolean(isSourceWritten);
149   }
150 
151   /** Java bean style mutator for isSourceWritten */
152   public void setIsSourceWritten(Boolean aIsSourceWritten){
153     isSourceWritten = aIsSourceWritten.booleanValue();
154   }// setIsSourceWritten();
155 
156 
157 
158   /** Initialises the docId with documents' file name without the complete path*/
159   private void initDocId(){
160     String fileName = "";
161     fileName = gate.util.Files.getLastPathComponent(
162                                             document.getSourceUrl().getFile());
163     // File name contains now the last token
164     if (DEBUG)
165       Out.prln("From initDocId, fileName ="+ fileName);
166     StringTokenizer fileNameTokenizer = new StringTokenizer(fileName,".");
167     StringBuffer tmpDocId = new StringBuffer("");
168     while(fileNameTokenizer.hasMoreTokens()){
169       String token = (String)fileNameTokenizer.nextToken();
170       // We don't want to append the last token
171       if (fileNameTokenizer.hasMoreTokens())
172         tmpDocId.append(token + ".");
173     }// End while
174     // if tokenization had place
175     if (!"".equals(tmpDocId)){
176       // Remove the last dot
177       tmpDocId.replace(tmpDocId.length()-1,tmpDocId.length(),"");
178       docId = tmpDocId.toString();
179     }// End if
180   }// initDocId()
181 
182   /** Returns the xml document conforming to APF dtd.*/
183   protected void serializeDocumentToAPF(){
184     xmlDoc.append("<?xml version=\"1.0\" ?>\n");
185     xmlDoc.append("<!DOCTYPE source_file SYSTEM ");
186        if (dtdFileName == null)
187       xmlDoc.append("\"ace-rdc.v2.0.1.dtd\"");
188          else
189            xmlDoc.append("\""+dtdFileName+"\"");
190     xmlDoc.append(">\n");
191     xmlDoc.append("<source_file TYPE=\"text\"");
192     if (isSourceWritten) {
193       AnnotationSet docTypeAnns = document.getAnnotations(
194         GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get("DOCTYPE");
195       if (docTypeAnns == null || docTypeAnns.isEmpty())
196         xmlDoc.append(" SOURCE=\""+ source+ "\" ");
197       else {
198         Annotation docTypeAnn = (Annotation) docTypeAnns.iterator().next();
199         if (docTypeAnn.getFeatures().get("SOURCE") == null)
200           xmlDoc.append(" SOURCE=\""+ source+ "\" ");
201         else
202           xmlDoc.append(" SOURCE=\""+ docTypeAnn.getFeatures().get("SOURCE")+ "\" ");
203       }//if no doc type annotations
204     }
205     xmlDoc.append("VERSION=\"2.0\" URI=\"");
206     xmlDoc.append(docId);
207     xmlDoc.append("-lf\">\n");
208     xmlDoc.append("  <document DOCID=\"");
209     xmlDoc.append(docId + "\">\n");
210     serializeEntities();
211     xmlDoc.append("  </document>\n");
212     xmlDoc.append("</source_file>");
213   }// serializeDocumentToAPF()
214 
215   /** Transforms all the entities from exportedTypes found in the GATE document
216     * into their xml representation
217     */
218   protected void serializeEntities(){
219     // If no types founded then simply return
220     if (exportedTypes == null || exportedTypes.isEmpty()) return;
221 
222     Map entitiesMap = null;
223     if ( document.getFeatures() == null ||
224          document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME)== null)
225       entitiesMap = new HashMap();
226     else
227       entitiesMap = (Map)document.getFeatures().
228                                         get(DOCUMENT_COREF_FEATURE_NAME);
229     Map namedAnnotSetMap = null;
230     if (document.getNamedAnnotationSets() == null)
231       namedAnnotSetMap = new HashMap();
232     else
233       namedAnnotSetMap = new HashMap(document.getNamedAnnotationSets());
234     // Add the default annoattion set
235     namedAnnotSetMap.put(null,document.getAnnotations());
236     // The entities map is a map from annotation sets names to list of lists
237     // Each list element is composed from annotations refering the same entity
238     // All the entities that are in the exportedTypes need to be serialized.
239     Iterator exportedTypesIter = exportedTypes.iterator();
240     while(exportedTypesIter.hasNext()){
241       String entityType = (String)exportedTypesIter.next();
242       // Serialize all entities of type
243       // The keys in the entitesMap are annotation sets names. The null key
244       // designates the default annotation.
245       Set annotationSetNames = namedAnnotSetMap.keySet();
246       Iterator annotationSetNamesIter = annotationSetNames.iterator();
247       while (annotationSetNamesIter.hasNext()){
248         Object annotSetName = annotationSetNamesIter.next();
249         // This list contains entities found in the annotSetName
250         List entitiesList = (List) entitiesMap.get(annotSetName);
251         if (entitiesList == null) entitiesList = new ArrayList();
252         // This annotation set will contain all annotations of "entityType"
253         AnnotationSet annotSet = null;
254         Set serializationAnnotSet = null;
255         annotSet = (AnnotationSet)namedAnnotSetMap.get(annotSetName);
256         if (annotSet == null || annotSet.get(entityType) == null) continue;
257         serializationAnnotSet = new HashSet(annotSet.get(entityType));
258         // All annotations from annotSet will be serialized as entities unless
259         // some of them are present in the entities map
260         // Now we are searching for the entityType in the entitiesMap and
261         // serialize it from there. After that, remove all annotations
262         // entityType present in entitiesMap from annotSet and serialize the
263         // remaining entities.
264         //Iterate through the entitiesList in searching for entityType
265         Iterator entitiesListIter = entitiesList.iterator();
266         while (entitiesListIter.hasNext()){
267           List entity = (List)entitiesListIter.next();
268           // We want now to accesate an annotation from the entity list to get
269           // its type and compare it with entityType
270           String theEntityType = new String("");
271           if (entity != null && !entity.isEmpty()){
272             Integer annotId = (Integer)entity.get(0);
273             Annotation a = (Annotation)annotSet.get(annotId);
274             if (a != null) theEntityType = a.getType();
275           }// End if
276           // The the types are equal then serialize the entities
277           if (theEntityType.equals(entityType)){
278             List ent = new ArrayList();
279             Iterator entityIter = entity.iterator();
280             while(entityIter.hasNext()){
281               Integer id = (Integer)entityIter.next();
282               ent.add(annotSet.get(id));
283             }// End while
284             serializeAnEntity(ent);
285             // Remove all annotation from entity that apear in annotSet
286             serializationAnnotSet.removeAll(ent);
287           }// End if
288         }// End while(entitiesListIter.hasNext())
289         // Serialize the remaining entities in annotSet
290         Iterator serializationAnnotSetIter = serializationAnnotSet.iterator();
291         while(serializationAnnotSetIter.hasNext()){
292           Annotation annotEntity = (Annotation) serializationAnnotSetIter.next();
293           List ent = new ArrayList();
294           ent.add(annotEntity);
295           serializeAnEntity(ent);
296         }// End while(annotSetIter.hasNext())
297       }// End while(entitiesKeysIter.hasNext())
298     }// End while(exportedTypesIter.hasNext())
299   }// serializeEntities()
300 
301   /** Writes an entity in the xmlDoc conforming to APF standards.
302     * @param anEntity represents a list with annotations that refer the same
303     * entity. Those annotations were detected and constructed by the
304     * orthomatcher.
305     */
306   private void serializeAnEntity(List anEntity){
307     if (anEntity == null || anEntity.isEmpty()) return;
308     // Write the entities tags
309     xmlDoc.append("  <entity ID=\"" + docId + "-" + getNextEntityId() + "\">\n");
310     // We know for sure that the list is not empty (see above)
311     Annotation a = (Annotation) anEntity.get(0);
312     xmlDoc.append("    <entity_type GENERIC=\"FALSE\">" + a.getType().toUpperCase() +
313      "</entity_type>\n");
314     // Write the entities mentions
315     Iterator anEntityIter = anEntity.iterator();
316     while(anEntityIter.hasNext()){
317       Annotation ann = (Annotation)anEntityIter.next();
318       serializeAnEntityMention(ann);
319     }// End while(anEntityIter.hasNext())
320     // Write the entities attributes
321     xmlDoc.append("      <entity_attributes>\n");
322     anEntityIter = anEntity.iterator();
323     while(anEntityIter.hasNext()){
324       Annotation ann = (Annotation)anEntityIter.next();
325       serializeAnEntityAttributes(ann);
326     }// End while(anEntityIter.hasNext())
327     xmlDoc.append("      </entity_attributes>\n");
328     xmlDoc.append("  </entity>\n");
329   }// End serializeAnEntity();
330 
331   /** This method serializes an entity mention from an Annotation*/
332   private void serializeAnEntityMention(Annotation ann){
333     if (ann == null) return;
334     String entityMentionType = "NAME";
335     String entityMentionRole = null;
336     String entityMentionReference = null;
337     String entityMentionGeneric = null;
338 
339     FeatureMap fm = ann.getFeatures();
340     if (fm != null){
341       if( null != fm.get("ENTITY_MENTION_TYPE"))
342         entityMentionType = (String) fm.get("ENTITY_MENTION_TYPE");
343 
344       entityMentionRole = (String) fm.get("ROLE");
345       entityMentionReference = (String) fm.get("REFERENCE");
346       entityMentionGeneric = (String) fm.get("GENERIC");
347     }// End if
348     String str1 = (entityMentionRole == null)? "" :
349                              ("ROLE=\"" + entityMentionRole + "\"");
350     String str2 = (entityMentionReference == null)? "" :
351                              ("REFERENCE=\"" + entityMentionReference + "\"");
352     String str3 = (entityMentionGeneric == null)? "" :
353                              ("GENERIC=\"" + entityMentionGeneric + "\"");
354 
355 /* modified by Di - the new scorer needs a unique ID for each mention as well */
356 
357     xmlDoc.append("      <entity_mention TYPE=\"" + entityMentionType+"\"" +
358      str1 + " " + str2 + " " + str3 + "ID=\""  + "M" + getNextMentionId() +"\">\n"
359     );
360 
361     // extent
362     xmlDoc.append("        <extent>\n");
363     xmlDoc.append("          <charseq>\n");
364     try{
365       xmlDoc.append("          <!-- string = \"" +
366             document.getContent().getContent(ann.getStartNode().getOffset(),
367                                       ann.getEndNode().getOffset())+"\" -->\n");
368     }catch (InvalidOffsetException ioe){
369       Err.prln("APFormatExporter:Warning: Couldn't access text between"+
370       " offsets:" + ann.getStartNode().getOffset() + " and "+
371       ann.getEndNode().getOffset());
372     }// End try
373     xmlDoc.append("          <start>"+ann.getStartNode().getOffset()+
374         "</start><end>"+(ann.getEndNode().getOffset().longValue() - 1)+"</end>\n");
375     xmlDoc.append("          </charseq>\n");
376     xmlDoc.append("        </extent>\n");
377     // head
378     xmlDoc.append("        <head>\n");
379     xmlDoc.append("          <charseq>\n");
380     try{
381       xmlDoc.append("          <!-- string = \"" +
382             document.getContent().getContent(ann.getStartNode().getOffset(),
383                                       ann.getEndNode().getOffset())+"\" -->\n");
384     }catch (InvalidOffsetException ioe){
385       Err.prln("APFormatExporter:Warning: Couldn't access text between"+
386       " offsets:" + ann.getStartNode().getOffset() + " and "+
387       ann.getEndNode().getOffset());
388     }// End try
389     xmlDoc.append("          <start>"+ann.getStartNode().getOffset()+
390         "</start><end>"+(ann.getEndNode().getOffset().longValue() - 1)+"</end>\n");
391     xmlDoc.append("          </charseq>\n");
392     xmlDoc.append("        </head>\n");
393     xmlDoc.append("      </entity_mention>\n");
394   }//serializeAnEntityMention();
395 
396   /** This method serializes an entity attribute from an Annotation*/
397   private void serializeAnEntityAttributes(Annotation ann){
398     if (ann == null) return;
399     boolean isAttribute = false;
400     if ("NAME".equals(ann.getFeatures().get("ENTITY_MENTION_TYPE"))
401         ||
402        null == ann.getFeatures().get("ENTITY_MENTION_TYPE"))
403       isAttribute = true;
404     if (! isAttribute)
405       return;
406 
407     // name
408     xmlDoc.append("        <name>\n");
409     xmlDoc.append("          <charseq>\n");
410     try{
411       xmlDoc.append("          <!-- string = \"" +
412             document.getContent().getContent(ann.getStartNode().getOffset(),
413                                       ann.getEndNode().getOffset())+"\" -->\n");
414     }catch (InvalidOffsetException ioe){
415       Err.prln("APFormatExporter:Warning: Couldn't access text between"+
416       " offsets:" + ann.getStartNode().getOffset() + " and "+
417       ann.getEndNode().getOffset());
418     }// End try
419     xmlDoc.append("          <start>"+ann.getStartNode().getOffset()+
420         "</start><end>"+(ann.getEndNode().getOffset().longValue() - 1)+"</end>\n");
421     xmlDoc.append("          </charseq>\n");
422     xmlDoc.append("        </name>\n");
423   }//serializeAnEntityMention();
424 
425   /** Returns the next safe ID for an entity*/
426   private int getNextEntityId(){
427     return entityId ++;
428   }// getNextEntityId()
429 
430   /** added by  Di - returns the next safe ID for an entity mention */
431  private int getNextMentionId(){
432     return mentionId ++;
433   }
434 
435 
436   /** This list of strings represents the entities type that will be exported*/
437   private List exportedTypes = null;
438   /** This is the name of the dtd file. If it's not present no dtd would be
439     * written in the APF file.
440     */
441   private String dtdFileName = null;
442   /** This field represent the document id and it is used in generating the
443     * entities IDs. It is the file name of the document, without the extension
444     */
445   private String docId = null;
446 
447   /** This field represent an unique entity ID generator*/
448   private int entityId = 1;
449 
450     /** added by Di - this field represents a unique entity ID generator */
451     private int mentionId = 1;
452 
453   /** This is the xmlDoc that will be created*/
454   private StringBuffer xmlDoc = null;
455 
456   private URL exportFilePath = null;
457 
458   /** The source attribute for source*/
459   private String source = null;
460 
461   /** The source attribute for source*/
462   private boolean isSourceWritten = true;
463 
464 
465 }// APFormatExporter
466