1   /*
2    *  APFormatExporter.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU, 26/Oct/2001
12   *
13   *  $Id: APFormatExporter.java,v 1.15 2002/03/06 17:15:39 kalina Exp $
14   */
15  
16  package gate.creole;
17  
18  import gate.*;
19  import gate.creole.orthomatcher.*;
20  import gate.creole.ANNIEConstants;
21  import gate.util.*;
22  
23  import java.util.*;
24  import java.net.*;
25  import java.io.*;
26  
27  /** This class implements a APF xml exporter. It works on documents or corpora
28    * to export them in the APF format.
29    */
30  public class APFormatExporter extends AbstractLanguageAnalyser
31                                implements ANNIEConstants{
32    public static final String
33      APF_EXP_DOCUMENT_PARAMETER_NAME = "document";
34  
35    public static final String
36      APF_EXP_SOURCE_PARAMETER_NAME = "source";
37  
38    public static final String
39      APF_EXP_DTD_PARAMETER_NAME = "dtdFileName";
40  
41    public static final String
42      APF_EXP_PATH_PARAMETER_NAME = "exportFilePath";
43  
44    public static final String
45      APF_EXP_TYPES_PARAMETER_NAME = "exportedTypes";
46  
47    public static final String
48      APF_EXP_WRITE_SOURCE_PARAMETER_NAME = "isSourceWritten";
49  
50    /** Debug flag */
51    private static final boolean DEBUG = false;
52    /** Constructor does nothing. This PR is bean like initialized*/
53    public APFormatExporter() {}
54  
55    /** Run the resource and does the entire export process*/
56    public void execute() throws ExecutionException{
57      // Check if the thing can be run
58      if(document == null)
59        throw new ExecutionException("No document found to export in APF format!");
60      if (exportedTypes == null)
61        throw new ExecutionException("No export types found.");
62      xmlDoc = new StringBuffer(10*(document.getContent().size().intValue()));
63      initDocId();
64      if (docId == null)
65        throw new ExecutionException("Couldn't detect the document's ID");
66      if (DEBUG)
67        Out.prln("Document id = "+ docId);
68  
69      String exportFilePathStr = null;
70      if (exportFilePath == null)
71        exportFilePathStr = new String(document.getSourceUrl().getFile() +
72                                                                    ".apf.xml");
73      else
74        exportFilePathStr = exportFilePath.getPath()+ "/"+docId + ".apf.xml";
75  
76      if (DEBUG)
77        Out.prln("Export file path = "+ exportFilePathStr);
78  //*
79      // Prepare to write into the xmlFile
80      OutputStreamWriter writer = null;
81      try{
82        writer = new OutputStreamWriter(
83                new FileOutputStream(new File(exportFilePathStr)));
84  
85        // Write (test the toXml() method)
86        // This Action is added only when a gate.Document is created.
87        // So, is Bor sure that the resource is a gate.Document
88        serializeDocumentToAPF();
89        writer.write(xmlDoc.toString());
90        writer.flush();
91        writer.close();
92      }catch (Exception e){
93        throw new ExecutionException(e);
94      }// End try
95  //*/
96    } // execute()
97  
98  
99    /** Initialise this resource, and returns it. */
100   public Resource init() throws ResourceInstantiationException {
101     return this;
102   } // init()
103 
104   /** Java bean style mutator for exportedTypes */
105   public void setExportedTypes(List anExportedTypesList){
106     exportedTypes = anExportedTypesList;
107   }// setExportedTypes();
108 
109   /** Java bean style accesor for exportedTypes */
110   public List getExportedTypes(){
111     return exportedTypes;
112   }// getExportedTypes()
113 
114   /** Java bean style mutator for dtdFileName */
115   public void setDtdFileName(String aDtdFileName){
116     dtdFileName = aDtdFileName;
117   }// setDtdFileName();
118 
119   /** Java bean style accesor for DtdFileName */
120   public String getDtdFileName(){
121     return dtdFileName;
122   }// getDtdFileName()
123 
124   /** Java bean style mutator for exportFilePath */
125   public void setExportFilePath(URL anExportFilePath){
126     exportFilePath = anExportFilePath;
127   }// setExportFilePath();
128 
129   /** Java bean style accesor for exportFilePath */
130   public URL getExportFilePath(){
131     return exportFilePath;
132   }// getDtdFileName()
133 
134   /** Java bean style mutator for source */
135   public void setSource(String aSource){
136     source = aSource;
137   }// setSource();
138 
139   /** Java bean style accesor for source */
140   public String getSource(){
141     return source;
142   }// getSource()
143 
144   /** Java bean style accesor for isSourceWritten */
145   public Boolean getIsSourceWritten() {
146     return new Boolean(isSourceWritten);
147   }
148 
149   /** Java bean style mutator for isSourceWritten */
150   public void setIsSourceWritten(Boolean aIsSourceWritten){
151     isSourceWritten = aIsSourceWritten.booleanValue();
152   }// setIsSourceWritten();
153 
154 
155 
156   /** Initialises the docId with documents' file name without the complete path*/
157   private void initDocId(){
158     String fileName = "";
159     fileName = gate.util.Files.getLastPathComponent(
160                                             document.getSourceUrl().getFile());
161     // File name contains now the last token
162     if (DEBUG)
163       Out.prln("From initDocId, fileName ="+ fileName);
164     StringTokenizer fileNameTokenizer = new StringTokenizer(fileName,".");
165     StringBuffer tmpDocId = new StringBuffer("");
166     while(fileNameTokenizer.hasMoreTokens()){
167       String token = (String)fileNameTokenizer.nextToken();
168       // We don't want to append the last token
169       if (fileNameTokenizer.hasMoreTokens())
170         tmpDocId.append(token + ".");
171     }// End while
172     // if tokenization had place
173     if (!"".equals(tmpDocId)){
174       // Remove the last dot
175       tmpDocId.replace(tmpDocId.length()-1,tmpDocId.length(),"");
176       docId = tmpDocId.toString();
177     }// End if
178   }// initDocId()
179 
180   /** Returns the xml document conforming to APF dtd.*/
181   protected void serializeDocumentToAPF(){
182     xmlDoc.append("<?xml version=\"1.0\" ?>\n");
183     xmlDoc.append("<!DOCTYPE source_file SYSTEM ");
184     if (dtdFileName == null)
185       xmlDoc.append("\"ace-pilot-ref.dtd\"");
186     else
187       xmlDoc.append("\""+dtdFileName+"\"");
188     xmlDoc.append(">\n");
189     xmlDoc.append("<source_file TYPE=\"text\"");
190     if (isSourceWritten)
191       xmlDoc.append(" SOURCE=\""+ source+ "\"");
192     xmlDoc.append("VERSION=\"1.2\" URI=\"");
193     xmlDoc.append(docId);
194     xmlDoc.append("-lf\">\n");
195     xmlDoc.append("  <document DOCID=\"");
196     xmlDoc.append(docId + "\">\n");
197     serializeEntities();
198     xmlDoc.append("  </document>\n");
199     xmlDoc.append("</source_file>");
200   }// serializeDocumentToAPF()
201 
202   /** Transforms all the entities from exportedTypes found in the GATE document
203     * into their xml representation
204     */
205   protected void serializeEntities(){
206     // If no types founded then simply return
207     if (exportedTypes == null || exportedTypes.isEmpty()) return;
208 
209     Map entitiesMap = null;
210     if ( document.getFeatures() == null ||
211          document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME)== null)
212       entitiesMap = new HashMap();
213     else
214       entitiesMap = (Map)document.getFeatures().
215                                         get(DOCUMENT_COREF_FEATURE_NAME);
216     Map namedAnnotSetMap = null;
217     if (document.getNamedAnnotationSets() == null)
218       namedAnnotSetMap = new HashMap();
219     else
220       namedAnnotSetMap = new HashMap(document.getNamedAnnotationSets());
221     // Add the default annoattion set
222     namedAnnotSetMap.put(null,document.getAnnotations());
223     // The entities map is a map from annotation sets names to list of lists
224     // Each list element is composed from annotations refering the same entity
225     // All the entities that are in the exportedTypes need to be serialized.
226     Iterator exportedTypesIter = exportedTypes.iterator();
227     while(exportedTypesIter.hasNext()){
228       String entityType = (String)exportedTypesIter.next();
229       // Serialize all entities of type
230       // The keys in the entitesMap are annotation sets names. The null key
231       // designates the default annotation.
232       Set annotationSetNames = namedAnnotSetMap.keySet();
233       Iterator annotationSetNamesIter = annotationSetNames.iterator();
234       while (annotationSetNamesIter.hasNext()){
235         Object annotSetName = annotationSetNamesIter.next();
236         // This list contains entities found in the annotSetName
237         List entitiesList = (List) entitiesMap.get(annotSetName);
238         if (entitiesList == null) entitiesList = new ArrayList();
239         // This annotation set will contain all annotations of "entityType"
240         AnnotationSet annotSet = null;
241         Set serializationAnnotSet = null;
242         annotSet = (AnnotationSet)namedAnnotSetMap.get(annotSetName);
243         if (annotSet == null || annotSet.get(entityType) == null) continue;
244         serializationAnnotSet = new HashSet(annotSet.get(entityType));
245         // All annotations from annotSet will be serialized as entities unless
246         // some of them are present in the entities map
247         // Now we are searching for the entityType in the entitiesMap and
248         // serialize it from there. After that, remove all annotations
249         // entityType present in entitiesMap from annotSet and serialize the
250         // remaining entities.
251         //Iterate through the entitiesList in searching for entityType
252         Iterator entitiesListIter = entitiesList.iterator();
253         while (entitiesListIter.hasNext()){
254           List entity = (List)entitiesListIter.next();
255           // We want now to accesate an annotation from the entity list to get
256           // its type and compare it with entityType
257           String theEntityType = new String("");
258           if (entity != null && !entity.isEmpty()){
259             Integer annotId = (Integer)entity.get(0);
260             Annotation a = (Annotation)annotSet.get(annotId);
261             if (a != null) theEntityType = a.getType();
262           }// End if
263           // The the types are equal then serialize the entities
264           if (theEntityType.equals(entityType)){
265             List ent = new ArrayList();
266             Iterator entityIter = entity.iterator();
267             while(entityIter.hasNext()){
268               Integer id = (Integer)entityIter.next();
269               ent.add(annotSet.get(id));
270             }// End while
271             serializeAnEntity(ent);
272             // Remove all annotation from entity that apear in annotSet
273             serializationAnnotSet.removeAll(ent);
274           }// End if
275         }// End while(entitiesListIter.hasNext())
276         // Serialize the remaining entities in annotSet
277         Iterator serializationAnnotSetIter = serializationAnnotSet.iterator();
278         while(serializationAnnotSetIter.hasNext()){
279           Annotation annotEntity = (Annotation) serializationAnnotSetIter.next();
280           List ent = new ArrayList();
281           ent.add(annotEntity);
282           serializeAnEntity(ent);
283         }// End while(annotSetIter.hasNext())
284       }// End while(entitiesKeysIter.hasNext())
285     }// End while(exportedTypesIter.hasNext())
286   }// serializeEntities()
287 
288   /** Writes an entity in the xmlDoc conforming to APF standards.
289     * @param anEntity represents a list with annotations that refer the same
290     * entity. Those annotations were detected and constructed by the
291     * orthomatcher.
292     */
293   private void serializeAnEntity(List anEntity){
294     if (anEntity == null || anEntity.isEmpty()) return;
295     // Write the entities tags
296     xmlDoc.append("  <entity ID=\"" + docId + "-" + getNextEntityId() + "\">\n");
297     // We know for sure that the list is not empty (see above)
298     Annotation a = (Annotation) anEntity.get(0);
299     xmlDoc.append("    <entity_type>" + a.getType().toUpperCase() +
300      "</entity_type>\n");
301     // Write the entities mentions
302     Iterator anEntityIter = anEntity.iterator();
303     while(anEntityIter.hasNext()){
304       Annotation ann = (Annotation)anEntityIter.next();
305       serializeAnEntityMention(ann);
306     }// End while(anEntityIter.hasNext())
307     // Write the entities attributes
308     xmlDoc.append("      <entity_attributes>\n");
309     anEntityIter = anEntity.iterator();
310     while(anEntityIter.hasNext()){
311       Annotation ann = (Annotation)anEntityIter.next();
312       serializeAnEntityAttributes(ann);
313     }// End while(anEntityIter.hasNext())
314     xmlDoc.append("      </entity_attributes>\n");
315     xmlDoc.append("  </entity>\n");
316   }// End serializeAnEntity();
317 
318   /** This method serializes an entity mention from an Annotation*/
319   private void serializeAnEntityMention(Annotation ann){
320     if (ann == null) return;
321     String entityMentionType = "NAME";
322     String entityMentionRole = null;
323     String entityMentionReference = null;
324     String entityMentionGeneric = null;
325 
326     FeatureMap fm = ann.getFeatures();
327     if (fm != null){
328       if( null != fm.get("ENTITY_MENTION_TYPE"))
329         entityMentionType = (String) fm.get("ENTITY_MENTION_TYPE");
330 
331       entityMentionRole = (String) fm.get("ROLE");
332       entityMentionReference = (String) fm.get("REFERENCE");
333       entityMentionGeneric = (String) fm.get("GENERIC");
334     }// End if
335     String str1 = (entityMentionRole == null)? "" :
336                              ("ROLE=\"" + entityMentionRole + "\"");
337     String str2 = (entityMentionReference == null)? "" :
338                              ("REFERENCE=\"" + entityMentionReference + "\"");
339     String str3 = (entityMentionGeneric == null)? "" :
340                              ("GENERIC=\"" + entityMentionGeneric + "\"");
341 
342 
343     xmlDoc.append("      <entity_mention TYPE=\"" + entityMentionType+"\"" +
344      str1 + " " + str2 + " " + str3 + ">\n"
345     );
346     // extent
347     xmlDoc.append("        <extent>\n");
348     xmlDoc.append("          <charseq>\n");
349     try{
350       xmlDoc.append("          <!-- string = \"" +
351             document.getContent().getContent(ann.getStartNode().getOffset(),
352                                       ann.getEndNode().getOffset())+"\" -->\n");
353     }catch (InvalidOffsetException ioe){
354       Err.prln("APFormatExporter:Warning: Couldn't access text between"+
355       " offsets:" + ann.getStartNode().getOffset() + " and "+
356       ann.getEndNode().getOffset());
357     }// End try
358     xmlDoc.append("          <start>"+ann.getStartNode().getOffset()+
359         "</start><end>"+(ann.getEndNode().getOffset().longValue() - 1)+"</end>\n");
360     xmlDoc.append("          </charseq>\n");
361     xmlDoc.append("        </extent>\n");
362     // head
363     xmlDoc.append("        <head>\n");
364     xmlDoc.append("          <charseq>\n");
365     try{
366       xmlDoc.append("          <!-- string = \"" +
367             document.getContent().getContent(ann.getStartNode().getOffset(),
368                                       ann.getEndNode().getOffset())+"\" -->\n");
369     }catch (InvalidOffsetException ioe){
370       Err.prln("APFormatExporter:Warning: Couldn't access text between"+
371       " offsets:" + ann.getStartNode().getOffset() + " and "+
372       ann.getEndNode().getOffset());
373     }// End try
374     xmlDoc.append("          <start>"+ann.getStartNode().getOffset()+
375         "</start><end>"+(ann.getEndNode().getOffset().longValue() - 1)+"</end>\n");
376     xmlDoc.append("          </charseq>\n");
377     xmlDoc.append("        </head>\n");
378     xmlDoc.append("      </entity_mention>\n");
379   }//serializeAnEntityMention();
380 
381   /** This method serializes an entity attribute from an Annotation*/
382   private void serializeAnEntityAttributes(Annotation ann){
383     if (ann == null) return;
384     boolean isAttribute = false;
385     if ("NAME".equals(ann.getFeatures().get("ENTITY_MENTION_TYPE"))
386         ||
387        null == ann.getFeatures().get("ENTITY_MENTION_TYPE"))
388       isAttribute = true;
389     if (! isAttribute)
390       return;
391 
392     // name
393     xmlDoc.append("        <name>\n");
394     xmlDoc.append("          <charseq>\n");
395     try{
396       xmlDoc.append("          <!-- string = \"" +
397             document.getContent().getContent(ann.getStartNode().getOffset(),
398                                       ann.getEndNode().getOffset())+"\" -->\n");
399     }catch (InvalidOffsetException ioe){
400       Err.prln("APFormatExporter:Warning: Couldn't access text between"+
401       " offsets:" + ann.getStartNode().getOffset() + " and "+
402       ann.getEndNode().getOffset());
403     }// End try
404     xmlDoc.append("          <start>"+ann.getStartNode().getOffset()+
405         "</start><end>"+(ann.getEndNode().getOffset().longValue() - 1)+"</end>\n");
406     xmlDoc.append("          </charseq>\n");
407     xmlDoc.append("        </name>\n");
408   }//serializeAnEntityMention();
409 
410   /** Returns the next safe ID for an entity*/
411   private int getNextEntityId(){
412     return entityId ++;
413   }// getNextEntityId()
414 
415   /** This list of strings represents the entities type that will be exported*/
416   private List exportedTypes = null;
417   /** This is the name of the dtd file. If it's not present no dtd would be
418     * written in the APF file.
419     */
420   private String dtdFileName = null;
421   /** This field represent the document id and it is used in generating the
422     * entities IDs. It is the file name of the document, without the extension
423     */
424   private String docId = null;
425 
426   /** This field represent an unique entity ID generator*/
427   private int entityId = 1;
428   /** This is the xmlDoc that will be created*/
429   private StringBuffer xmlDoc = null;
430 
431   private URL exportFilePath = null;
432 
433   /** The source attribute for source*/
434   private String source = null;
435 
436   /** The source attribute for source*/
437   private boolean isSourceWritten = true;
438 
439 
440 }// APFormatExporter