1   /*
2    *  DocumentFormat.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Hamish Cunningham, 25/May/2000
12   *
13   *  $Id: DocumentFormat.java,v 1.48 2003/01/10 08:18:31 nasso Exp $
14   */
15  
16  package gate;
17  
18  import java.util.*;
19  import java.net.*;
20  import java.io.*;
21  
22  import gate.util.*;
23  import gate.event.*;
24  import gate.creole.*;
25  import gate.corpora.RepositioningInfo;
26  
27  import org.w3c.www.mime.*;
28  
29  
30  /** The format of Documents. Subclasses of DocumentFormat know about
31    * particular MIME types and how to unpack the information in any
32    * markup or formatting they contain into GATE annotations. Each MIME
33    * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
34    * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
35    * with a static index residing here when they are constructed. Static
36    * getDocumentFormat methods can then be used to get the appropriate
37    * format class for a particular document.
38    */
39  public abstract class DocumentFormat
40  extends AbstractLanguageResource implements LanguageResource{
41    /** Debug flag */
42    private static final boolean DEBUG = false;
43  
44    /** This fields indicates whether the document being processed is in a
45      * Gate XML custom format.
46      * Detection is done in runMagicNumbers().
47      */
48    protected static boolean isGateXmlDocument = false;
49  
50    /** The MIME type of this format. */
51    private MimeType mimeType = null;
52  
53    /** Map of MimeTypeString to ClassHandler class. This is used to find the
54      * language resource that deals with the specific Document format
55      */
56    protected static Map mimeString2ClassHandlerMap = new HashMap();
57    /** Map of MimeType to DocumentFormat Class. This is used to find the
58      * DocumentFormat subclass that deals with a particular MIME type.
59      */
60    protected static Map mimeString2mimeTypeMap = new HashMap();
61  
62    /** Map of Set of file suffixes to MimeType. This is used to figure
63      * out what MIME type a document is from its file name.
64      */
65    protected static Map suffixes2mimeTypeMap = new HashMap();
66  
67    /** Map of Set of magic numbers to MimeType. This is used to guess the
68      * MIME type of a document, when we don't have any other clues.
69      */
70    protected static Map magic2mimeTypeMap = new HashMap();
71  
72    /** Map of markup elements to annotation types. If it is null, the
73      * unpackMarkup() method will convert all markup, using the element names
74      * for annotation types. If it is non-null, only those elements specified
75      * here will be converted.
76      */
77    protected Map markupElementsMap = null;
78  
79    /** This map is used inside uppackMarkup() method...
80      * When an element from the map is encounted, The corresponding string
81      * element is added to the document content
82      */
83    protected Map element2StringMap = null;
84  
85    /** The features of this resource */
86    private FeatureMap features = null;
87  
88    /** Default construction */
89    public DocumentFormat() {}
90  
91    /** listeners for status report */
92    private transient Vector statusListeners;
93  
94    /** Flag for enable/disable collecting of repositioning information */
95    private Boolean shouldCollectRepositioning = new Boolean(false);
96  
97    /** If the document format could collect repositioning information
98     *  during the unpack phase this method will return <B>true</B>.
99     *  <BR>
100    *  You should override this method in the child class of the defined
101    *  document format if it could collect the repositioning information.
102    */
103   public Boolean supportsRepositioning() {
104     return new Boolean(false);
105   } // supportsRepositioning
106 
107   public void setShouldCollectRepositioning(Boolean b) {
108     if(supportsRepositioning().booleanValue() && b.booleanValue()) {
109       shouldCollectRepositioning = b;
110     }
111     else {
112       shouldCollectRepositioning = new Boolean(false);
113     } // if
114   } // setShouldCollectRepositioning
115 
116   public Boolean getShouldCollectRepositioning() {
117     return shouldCollectRepositioning;
118   } //
119 
120   /** Unpack the markup in the document. This converts markup from the
121     * native format (e.g. XML, RTF) into annotations in GATE format.
122     * Uses the markupElementsMap to determine which elements to convert, and
123     * what annotation type names to use.
124     */
125   abstract public void unpackMarkup(Document doc)
126                                       throws DocumentFormatException;
127 
128   abstract public void unpackMarkup(Document doc, RepositioningInfo repInfo,
129                                         RepositioningInfo ampCodingInfo)
130                                       throws DocumentFormatException;
131   /** Unpack the markup in the document. This method calls unpackMarkup on the
132     * GATE document, but after it saves its content as a feature atached to
133     * the document. This method is usefull if one wants to save the content
134     * of the document being unpacked. After the markups have been unpacked,
135     * the content of the document will be replaced with a new one containing
136     * the text between markups.
137     *
138     * @param doc the document that will be upacked
139     * @param originalContentFeatureType the name of the feature that will hold
140     * the document's content.
141     */
142   public void unpackMarkup( Document doc,
143                             String  originalContentFeatureType )
144                                               throws DocumentFormatException{
145      FeatureMap fm = doc.getFeatures();
146      if (fm == null) fm = Factory.newFeatureMap();
147      fm.put(originalContentFeatureType, doc.getContent().toString());
148      doc.setFeatures(fm);
149      unpackMarkup(doc);
150   }// unpackMarkup();
151 
152   /**
153     * Returns a MimeType having as input a fileSufix.
154     * If the file sufix is <b>null</b> or not recognised then,
155     * <b>null</b> will be returned.
156     * @param fileSufix The file sufix associated with a recognisabe mime type.
157     * @return The MimeType associated with this file suffix.
158     */
159   static private MimeType  getMimeType(String fileSufix){
160     // Get a mimeType string associated with this fileSuffix
161     // Eg: for html returns  MimeType("text/html"), for xml returns
162     // MimeType("text/xml")
163     if(fileSufix == null) return null;
164     return  (MimeType) suffixes2mimeTypeMap.get(fileSufix.toLowerCase());
165   }//getMimeType
166 
167   /**
168     * Returns a MymeType having as input a URL object. If the MimeType wasn't
169     * recognized it returns <b>null</b>.
170     * @param url The URL object from which the MimeType will be extracted
171     * @return A MimeType object for that URL, or <b>null</b> if the Mime Type is
172     * unknown.
173     */
174   static private MimeType  getMimeType(URL url) {
175     String mimeTypeString = null;
176     String charsetFromWebServer = null;
177     String contentType = null;
178     InputStream is = null;
179     MimeType mimeTypeFromWebServer = null;
180     MimeType mimeTypeFromFileSuffix = null;
181     MimeType mimeTypeFromMagicNumbers = null;
182     String fileSufix = null;
183 
184     if (url == null)
185       return null;
186     // Ask the web server for the content type
187     // We expect to get contentType something like this:
188     // "text/html; charset=iso-8859-1"
189     // Charset is optional
190     try{
191       is = url.openConnection().getInputStream();
192       contentType = url.openConnection().getContentType();
193     } catch (IOException e){
194       // Failed to get the content type with te Web server.
195       // Let's try some other methods like FileSuffix or magic numbers.
196     }
197     // If a content Type was returned by the server, try to get the mime Type
198     // string
199     // If contentType is something like this:"text/html; charset=iso-8859-1"
200     // try to get content Type string (text/html)
201     if (contentType != null){
202       StringTokenizer st = new StringTokenizer(contentType, ";");
203       // We assume that the first token is the mime type string...
204       // If this doesn't happen then BAD LUCK :(( ...
205       if (st.hasMoreTokens())
206         mimeTypeString     = st.nextToken().toLowerCase();
207       // The next token it should be the CharSet
208       if (st.hasMoreTokens())
209         charsetFromWebServer = st.nextToken().toLowerCase();
210       if (charsetFromWebServer != null){
211         //We have something like : "charset=iso-8859-1" and let's extract the
212         // encoding.
213         st = new StringTokenizer(charsetFromWebServer, "=");
214         // Don't need this anymore
215         charsetFromWebServer = null;
216         // Discarding the first token which is : "charset"
217         if (st.hasMoreTokens())
218           st.nextToken().toUpperCase();
219         // Get the encoding : "ISO-8859-1"
220         if (st.hasMoreTokens())
221           charsetFromWebServer = st.nextToken().toUpperCase();
222       } // End if
223     }// end if
224     // Return the corresponding MimeType with WebServer from the associated MAP
225     mimeTypeFromWebServer = (MimeType)
226                                 mimeString2mimeTypeMap.get(mimeTypeString);
227     // Let's try a file suffix detection
228     // Get the file sufix from the URL.See method definition for more details
229     fileSufix = getFileSufix(url);
230     // Get the mime type based on the on file sufix
231     mimeTypeFromFileSuffix = getMimeType(fileSufix);
232 
233     // Let's perform a magic numbers guess..
234     mimeTypeFromMagicNumbers = guessTypeUsingMagicNumbers(is,
235                                                     charsetFromWebServer);
236     //All those types enter into a deciding system
237     return decideBetweenThreeMimeTypes( mimeTypeFromWebServer,
238                                         mimeTypeFromFileSuffix,
239                                         mimeTypeFromMagicNumbers);
240   }//getMimeType
241 
242   /**
243     * This method decides what mimeType is in majority
244     * @param aMimeTypeFromWebServer a MimeType
245     * @param aMimeTypeFromFileSuffix a MimeType
246     * @param aMimeTypeFromMagicNumbers a MimeType
247     * @return the MimeType which occurs most. If all are null, then returns
248     * <b>null</b>
249     */
250   protected static MimeType decideBetweenThreeMimeTypes(
251                                     MimeType aMimeTypeFromWebServer,
252                                     MimeType aMimeTypeFromFileSuffix,
253                                     MimeType aMimeTypeFromMagicNumbers){
254 
255     // First a voting system
256     if (areEqual(aMimeTypeFromWebServer,aMimeTypeFromFileSuffix))
257       return aMimeTypeFromFileSuffix;
258     if (areEqual(aMimeTypeFromFileSuffix,aMimeTypeFromMagicNumbers))
259       return aMimeTypeFromFileSuffix;
260     if (areEqual(aMimeTypeFromWebServer,aMimeTypeFromMagicNumbers))
261       return aMimeTypeFromWebServer;
262 
263     // 1 is the highest priority
264     if (aMimeTypeFromFileSuffix != null)
265       aMimeTypeFromFileSuffix.addParameter("Priority","1");
266     // 2 is the second priority
267     if (aMimeTypeFromWebServer != null)
268       aMimeTypeFromWebServer.addParameter("Priority","2");
269     // 3 is the third priority
270     if (aMimeTypeFromMagicNumbers != null)
271       aMimeTypeFromMagicNumbers.addParameter("Priority","3");
272 
273     return decideBetweenTwoMimeTypes(
274                              decideBetweenTwoMimeTypes(aMimeTypeFromWebServer,
275                                                        aMimeTypeFromFileSuffix),
276                              aMimeTypeFromMagicNumbers);
277 
278   }// decideBetweenThreeMimeTypes
279 
280   /** Decide between two mimeTypes. The decistion is made on "Priority"
281     * parameter set into decideBetweenThreeMimeTypes method. If both mimeTypes
282     * doesn't have "Priority" paramether set, it will return one on them.
283     * @param aMimeType a MimeType object with "Prority" parameter set
284     * @param anotherMimeType a MimeType object with "Prority" parameter set
285     * @return One of the two mime types.
286     */
287   protected static MimeType decideBetweenTwoMimeTypes( MimeType aMimeType,
288                                                 MimeType anotherMimeType){
289     if (aMimeType == null) return anotherMimeType;
290     if (anotherMimeType == null) return aMimeType;
291 
292     int priority1 = 0;
293     int priority2 = 0;
294     // Both of them are not null
295     if (aMimeType.hasParameter("Priority"))
296       try{
297         priority1 =
298               new Integer(aMimeType.getParameterValue("Priority")).intValue();
299       }catch (NumberFormatException e){
300         return anotherMimeType;
301       }
302     if (anotherMimeType.hasParameter("Priority"))
303       try{
304         priority2 =
305           new Integer(anotherMimeType.getParameterValue("Priority")).intValue();
306       }catch (NumberFormatException e){
307         return aMimeType;
308       }
309 
310     // The lower the number, the highest the priority
311     if (priority1 <= priority2)
312       return aMimeType;
313     else
314       return anotherMimeType;
315   }// decideBetweenTwoMimeTypes
316 
317   /**
318     * Tests if two MimeType objects are equal.
319     * @return true only if boths MimeType objects are different than <b>null</b>
320     * and their Types and Subtypes are equals. The method is case sensitive.
321     */
322   protected static boolean areEqual( MimeType aMimeType,
323                                      MimeType anotherMimeType){
324     if (aMimeType == null || anotherMimeType == null)
325       return false;
326 
327     if ( aMimeType.getType().equals(anotherMimeType.getType()) &&
328          aMimeType.getSubtype().equals(anotherMimeType.getSubtype())
329        ) return true;
330     else
331       return false;
332   }// are Equal
333 
334   /**
335     * This method tries to guess the mime Type using some magic numbers.
336     * @param aInputStream a InputStream which has to be transformed into a
337     *        InputStreamReader
338     * @param anEncoding the encoding. If is null or unknown then a
339     * InputStreamReader with default encodings will be created.
340     * @return the mime type associated with magic numbers
341     */
342   protected static MimeType guessTypeUsingMagicNumbers(InputStream aInputStream,
343                                                             String anEncoding){
344 
345     if (aInputStream == null) return null;
346     InputStreamReader reader = null;
347     if (anEncoding != null)
348       try{
349         reader = new InputStreamReader(aInputStream, anEncoding);
350       } catch (UnsupportedEncodingException e){
351         reader = null;
352       }
353     if (reader == null)
354       // Create a reader with the default encoding system
355       reader = new InputStreamReader(aInputStream);
356 
357     // We have a input stream reader
358     return runMagicNumbers(reader);
359   }//guessTypeUsingMagicNumbers
360 
361   /** Performs magic over Gate Document */
362   protected static MimeType runMagicNumbers(InputStreamReader aReader){
363     // No reader, nothing to detect
364     if( aReader == null) return null;
365 
366     // Prepare to run the magic stuff
367     String strBuffer = null;
368     int bufferSize = 2048;
369     int charReads = 0;
370     char[] cbuf = new char[bufferSize];
371 
372     try {
373       charReads = aReader.read(cbuf,0,bufferSize);
374     } catch (IOException e){
375       return null;
376     }// End try
377 
378     if (charReads == -1)
379       // the document is empty
380       return null;
381 
382     // Create a string form the buffer and perform some search on it.
383     strBuffer = new String(cbuf,0,charReads);
384 
385     // If this fails then surrender
386     return getTypeFromContent(strBuffer);
387   }// runMagicNumbers
388 
389   private static MimeType getTypeFromContent(String aContent){
390     MimeType detectedMimeType = null;
391     // Detect whether or not is a GateXmlDocument
392     if (  aContent.indexOf("<GateDocument") != -1  ||
393           aContent.indexOf(" GateDocument") != -1)
394       isGateXmlDocument = true;
395     else
396       isGateXmlDocument = false;
397 
398     // Run the magic numbers test
399     Set magicSet = magic2mimeTypeMap.keySet();
400     Iterator iterator=magicSet.iterator();
401     String magic;
402     // change case to cover more variants
403     aContent = aContent.toLowerCase();
404     while (iterator.hasNext()){
405       magic = ((String) iterator.next()).toLowerCase();
406       if (aContent.indexOf(magic) != -1)
407         detectedMimeType = (MimeType) magic2mimeTypeMap.get(magic);
408     }// End while
409 
410     // If this fails then surrender
411     return detectedMimeType;
412   }// getTypeFromContent
413   
414   /**
415     * Return the fileSuffix or null if the url doesn't have a file suffix
416     * If the url is null then the file suffix will be null also
417     */
418   private static String getFileSufix(URL url){
419     String fileName = null;
420     String fileSuffix = null;
421 
422     // GIGO test  (garbage in garbage out)
423     if (url != null){
424       // get the file name from the URL
425       fileName = url.getFile();
426 
427       // tokenize this file name with "." as separator...
428       // the last token will be the file suffix
429       StringTokenizer st = new StringTokenizer(fileName,".");
430 
431       // fileSuffix is the last token
432       while (st.hasMoreTokens())
433         fileSuffix = st.nextToken();
434       // here fileSuffix is the last token
435     } // End if
436     return fileSuffix;
437   }//getFileSufix
438 
439   /**
440     * Find a DocumentFormat implementation that deals with a particular
441     * MIME type, given that type.
442     * @param  aGateDocument this document will receive as a feature
443     *                      the associated Mime Type. The name of the feature is
444     *                      MimeType and its value is in the format type/subtype
445     * @param  mimeType the mime type that is given as input
446     */
447   static public DocumentFormat getDocumentFormat(gate.Document aGateDocument,
448                                                             MimeType mimeType){
449     FeatureMap      aFeatureMap    = null;
450     if(mimeType == null) {
451       String content = aGateDocument.getContent().toString();
452       // reduce size for better performance
453       if(content.length() > 2048) content = content.substring(0, 2048);
454       mimeType = getTypeFromContent( content );
455     }
456     
457     if (mimeType != null){
458       // If the Gate Document doesn't have a feature map atached then
459       // We will create and set one.
460       if(aGateDocument.getFeatures() == null){
461             aFeatureMap = Factory.newFeatureMap();
462             aGateDocument.setFeatures(aFeatureMap);
463       }// end if
464       aGateDocument.getFeatures().put("MimeType",mimeType.getType() + "/" +
465                                           mimeType.getSubtype());
466 
467       return (DocumentFormat) mimeString2ClassHandlerMap.get(mimeType.getType()
468                                                + "/" + mimeType.getSubtype());
469     }// end If
470     return null;
471   } // getDocumentFormat(aGateDocument, MimeType)
472 
473   /**
474     * Find a DocumentFormat implementation that deals with a particular
475     * MIME type, given the file suffix (e.g. ".txt") that the document came
476     * from.
477     * @param  aGateDocument this document will receive as a feature
478     *                     the associated Mime Type. The name of the feature is
479     *                     MimeType and its value is in the format type/subtype
480     * @param  fileSuffix the file suffix that is given as input
481     */
482   static public DocumentFormat getDocumentFormat(gate.Document aGateDocument,
483                                                             String fileSuffix) {
484     return getDocumentFormat(aGateDocument, getMimeType(fileSuffix));
485   } // getDocumentFormat(String)
486 
487   /**
488     * Find a DocumentFormat implementation that deals with a particular
489     * MIME type, given the URL of the Document. If it is an HTTP URL, we
490     * can ask the web server. If it has a recognised file extension, we
491     * can use that. Otherwise we need to use a map of magic numbers
492     * to MIME types to guess the type, and then look up the format using the
493     * type.
494     * @param  aGateDocument this document will receive as a feature
495     *                      the associated Mime Type. The name of the feature is
496     *                      MimeType and its value is in the format type/subtype
497     * @param  url  the URL that is given as input
498     */
499   static public DocumentFormat getDocumentFormat(gate.Document aGateDocument,
500                                                                       URL url) {
501     return getDocumentFormat(aGateDocument, getMimeType(url));
502   } // getDocumentFormat(URL)
503 
504   /** Get the feature set */
505   public FeatureMap getFeatures() { return features; }
506 
507    /** Get the markup elements map */
508   public Map getMarkupElementsMap() { return markupElementsMap; }
509 
510    /** Get the element 2 string map */
511   public Map getElement2StringMap() { return element2StringMap; }
512 
513   /** Set the markup elements map */
514   public void setMarkupElementsMap(Map markupElementsMap) {
515    this.markupElementsMap = markupElementsMap;
516   }
517 
518   /** Set the element 2 string map */
519   public void setElement2StringMap(Map anElement2StringMap) {
520    element2StringMap = anElement2StringMap;
521   }
522 
523   /** Set the features map*/
524   public void setFeatures(FeatureMap features){this.features = features;}
525 
526   /** Set the mime type*/
527 
528   public void setMimeType(MimeType aMimeType){mimeType = aMimeType;}
529   /** Gets the mime Type*/
530   public MimeType getMimeType(){return mimeType;}
531 
532   //StatusReporter Implementation
533 
534 
535   public synchronized void removeStatusListener(StatusListener l) {
536     if (statusListeners != null && statusListeners.contains(l)) {
537       Vector v = (Vector) statusListeners.clone();
538       v.removeElement(l);
539       statusListeners = v;
540     }
541   }
542   public synchronized void addStatusListener(StatusListener l) {
543     Vector v = statusListeners == null ? new Vector(2) : (Vector) statusListeners.clone();
544     if (!v.contains(l)) {
545       v.addElement(l);
546       statusListeners = v;
547     }
548   }
549   protected void fireStatusChanged(String e) {
550     if (statusListeners != null) {
551       Vector listeners = statusListeners;
552       int count = listeners.size();
553       for (int i = 0; i < count; i++) {
554         ((StatusListener) listeners.elementAt(i)).statusChanged(e);
555       }
556     }
557   }
558 
559 } // class DocumentFormat
560