1   /*
2    *  RtfDocumentFormat.java
3    *
4    *  Copyright (c) 1998-2004, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU, 26/July/2000
12   *
13   *  $Id: RtfDocumentFormat.java,v 1.19 2004/07/21 17:10:03 akshay Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.io.*;
19  
20  import javax.swing.text.*;
21  import javax.swing.text.rtf.RTFEditorKit;
22  
23  import gate.Resource;
24  import gate.creole.ResourceInstantiationException;
25  import gate.util.DocumentFormatException;
26  //import org.w3c.www.mime.*;
27  
28  /** The format of Documents. Subclasses of DocumentFormat know about
29    * particular MIME types and how to unpack the information in any
30    * markup or formatting they contain into GATE annotations. Each MIME
31    * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
32    * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
33    * with a static index residing here when they are constructed. Static
34    * getDocumentFormat methods can then be used to get the appropriate
35    * format class for a particular document.
36    */
37  public class RtfDocumentFormat extends TextualDocumentFormat{
38  
39    /** Debug flag */
40    private static final boolean DEBUG = false;
41  
42    /** Default construction */
43    public RtfDocumentFormat() { super(); }
44  
45    /** Unpack the markup in the document. This converts markup from the
46      * native format (e.g.RTF) into annotations in GATE format.
47      * Uses the markupElementsMap to determine which elements to convert, and
48      * what annotation type names to use.
49      * It always tryes to parse te doc's content. It doesn't matter if the
50      * sourceUrl is null or not.
51      *
52      * @param doc The gate document you want to parse.
53      *
54      */
55    public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
56  
57      if ( (doc == null) ||
58           (doc.getSourceUrl() == null && doc.getContent() == null)){
59  
60        throw new DocumentFormatException(
61                 "GATE document is null or no content found. Nothing to parse!");
62      }// End if
63  
64      // create a RTF editor kit
65      RTFEditorKit aRtfEditorkit = new RTFEditorKit();
66  
67      // create a Styled Document
68      // NOTE that RTF Kit works only with Systled Document interface
69      StyledDocument styledDoc = new DefaultStyledDocument();
70  
71      // get an Input stream from the gate document
72      InputStream in = new ByteArrayInputStream(
73                                           doc.getContent().toString().getBytes()
74                                           );
75  
76      try {
77        aRtfEditorkit.read(in, styledDoc, 0);
78        // replace the document content with the one without markups
79        doc.setContent(new DocumentContentImpl(
80                                        styledDoc.getText(0,styledDoc.getLength())
81                                              )
82                      );
83      } catch (BadLocationException e) {
84        throw new DocumentFormatException(e);
85      } catch (IOException e){
86        throw new DocumentFormatException("I/O exception for " +
87                                          doc.getSourceUrl().toExternalForm(),e);
88      }
89    } // unpackMarkup(doc)
90  
91    /** Initialise this resource, and return it. */
92    public Resource init() throws ResourceInstantiationException{
93      // Register RTF mime type
94      MimeType mime = new MimeType("text","rtf");
95      // Register the class handler for this mime type
96      mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
97                                                                            this);
98      // Register the mime type with mine string
99      mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
100     // Register file sufixes for this mime type
101     suffixes2mimeTypeMap.put("rtf",mime);
102     // Register magic numbers for this mime type
103     magic2mimeTypeMap.put("{\\rtf1",mime);
104     // Set the mimeType for this language resource
105     setMimeType(mime);
106     return this;
107   }// init()
108 }// class RtfDocumentFormat
109