1   /*
2    *  SgmlDocumentFormat.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU, 4/July/2000
12   *
13   *  $Id: SgmlDocumentFormat.java,v 1.25 2001/11/30 14:38:44 cursu Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.util.*;
19  import java.io.*;
20  import java.net.*;
21  
22  import gate.util.*;
23  import gate.*;
24  import gate.sgml.*;
25  import gate.event.*;
26  import gate.xml.*;
27  import gate.creole.*;
28  
29  import org.w3c.www.mime.*;
30  // xml tools
31  import javax.xml.parsers.*;
32  import org.xml.sax.*;
33  
/** The document format for SGML documents: it handles the MIME type
  * <code>text/sgml</code> and the file suffixes <code>sgm</code> and
  * <code>sgml</code>.
  * <p>
  * Subclasses of DocumentFormat know about
  * particular MIME types and how to unpack the information in any
  * markup or formatting they contain into GATE annotations. Each MIME
  * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
  * RtfDocumentFormat, MpegDocumentFormat. This class registers itself
  * with the MIME type maps inherited from its superclass when
  * <code>init()</code> is called. Static
  * getDocumentFormat methods can then be used to get the appropriate
  * format class for a particular document.
  */
43  public class SgmlDocumentFormat extends TextualDocumentFormat
44  {
45    /** Debug flag */
46    private static final boolean DEBUG = false;
47  
48    /** Default construction */
49    public SgmlDocumentFormat() { super(); }
50  
51    /** Unpack the markup in the document. This converts markup from the
52      * native format (e.g. SGML) into annotations in GATE format.
53      * Uses the markupElementsMap to determine which elements to convert, and
54      * what annotation type names to use.
55      * The doc's content is first converted to a wel formed XML.
56      * If this succeddes then the document is saved into a temp file and parsed
57      * as an XML document.
58      *
59      * @param Document doc The gate document you want to parse.
60      *
61      */
62    public void unpackMarkup(Document doc) throws DocumentFormatException{
63      if ( (doc == null) ||
64           (doc.getSourceUrl() == null && doc.getContent() == null)){
65  
66        throw new DocumentFormatException(
67                 "GATE document is null or no content found. Nothing to parse!");
68      }// End if
69      // Create a status listener
70      StatusListener statusListener = new StatusListener(){
71              public void statusChanged(String text){
72                fireStatusChanged(text);
73              }
74      };
75      XmlDocumentHandler xmlDocHandler = null;
76      try {
77        Sgml2Xml sgml2Xml = new Sgml2Xml(doc);
78  
79        fireStatusChanged("Performing SGML to XML...");
80  
81        // convert the SGML document
82        String xmlUri = sgml2Xml.convert();
83  
84        fireStatusChanged("DONE !");
85  
86        //Out.println("Conversion done..." + xmlUri);
87        //Out.println(sgml2Xml.convert());
88        // Get a parser factory.
89        SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
90        // Set up the factory to create the appropriate type of parser
91  
92        // Set up the factory to create the appropriate type of parser
93        // non validating one
94        saxParserFactory.setValidating(false);
95        // non namesapace aware one
96        saxParserFactory.setNamespaceAware(true);
97  
98        // Create a SAX parser
99        SAXParser parser = saxParserFactory.newSAXParser();
100 
101       // use it
102       if (null != doc){
103         // create a new Xml document handler
104         xmlDocHandler = new XmlDocumentHandler(doc,
105                                                this.markupElementsMap,
106                                                this.element2StringMap);
107 
108         // register a status listener with it
109         xmlDocHandler.addStatusListener(statusListener);
110 
111         parser.parse(xmlUri, xmlDocHandler);
112         ((DocumentImpl) doc).setNextAnnotationId(
113                                           xmlDocHandler.getCustomObjectsId());
114      }// end if
115     } catch (ParserConfigurationException e){
116         throw
117         new DocumentFormatException("XML parser configuration exception ", e);
118     } catch (SAXException e){
119         throw new DocumentFormatException(e);
120     } catch (IOException e){
121         throw new DocumentFormatException("I/O exception for " +
122                                       doc.getSourceUrl().toString());
123     }finally{
124       if (xmlDocHandler != null)
125         xmlDocHandler.removeStatusListener(statusListener);
126     }// End try
127 
128   }// unpackMarkup
129 
130   /** This method converts the document's content from SGML 2 XML.*/
131   private String sgml2Xml(Document doc) {
132     String xmlUri = doc.getSourceUrl().toString ();
133 
134     return xmlUri;
135   }// sgml2Xml()
136 
137   /** Initialise this resource, and return it. */
138   public Resource init() throws ResourceInstantiationException{
139     // Register SGML mime type
140     MimeType mime = new MimeType("text","sgml");
141     // Register the class handler for this mime type
142     mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
143                                                                           this);
144     // Register the mime type with mine string
145     mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
146     // Register file sufixes for this mime type
147     suffixes2mimeTypeMap.put("sgm",mime);
148     suffixes2mimeTypeMap.put("sgml",mime);
149     setMimeType(mime);
150     return this;
151   }// init
152 
153 }//class SgmlDocumentFormat
154