1   /*
2    *  HtmlDocumentFormat.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU, 26/May/2000
12   *
13   *  $Id: HtmlDocumentFormat.java,v 1.29 2002/02/05 12:50:31 nasso Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.util.*;
19  import java.io.*;
20  import java.net.*;
21  
22  // html tools
23  import javax.swing.text.html.*;
24  import javax.swing.text.html.parser.*;
25  import javax.swing.text.html.HTMLEditorKit.*;
26  //import javax.swing.text.*;
27  
28  import gate.util.*;
29  import gate.*;
30  import gate.html.*;
31  import gate.event.*;
32  import gate.creole.*;
33  
34  import org.w3c.www.mime.*;
35  
36  /** The format of Documents. Subclasses of DocumentFormat know about
37    * particular MIME types and how to unpack the information in any
38    * markup or formatting they contain into GATE annotations. Each MIME
39    * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
40    * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
41    * with a static index residing here when they are constructed. Static
42    * getDocumentFormat methods can then be used to get the appropriate
43    * format class for a particular document.
44    */
45  public class HtmlDocumentFormat extends TextualDocumentFormat
46  {
47  
48    /** Debug flag */
49    private static final boolean DEBUG = false;
50  
51    /** Default construction */
52    public HtmlDocumentFormat() { super(); }
53  
54    /** We could collect repositioning information during XML parsing */
55    public Boolean supportsRepositioning() {
56      return new Boolean(true);
57    } // supportsRepositioning
58  
59    /** Old style of unpackMarkup (without collecting of RepositioningInfo) */
60    public void unpackMarkup(Document doc) throws DocumentFormatException {
61      unpackMarkup(doc, (RepositioningInfo) null, (RepositioningInfo) null);
62    } // unpackMarkup
63  
64    /** Unpack the markup in the document. This converts markup from the
65      * native format (e.g. HTML) into annotations in GATE format.
66      * Uses the markupElementsMap to determine which elements to convert, and
67      * what annotation type names to use.
68      * It always tryes to parse te doc's content. It doesn't matter if the
69      * sourceUrl is null or not.
70      *
71      * @param Document doc The gate document you want to parse.
72      *
73      */
74    public void unpackMarkup(Document doc, RepositioningInfo repInfo,
75                RepositioningInfo ampCodingInfo) throws DocumentFormatException{
76      Reader                reader = null;
77      URLConnection         conn = null;
78      PrintWriter           out = null;
79      HTMLEditorKit.Parser  parser = new ParserDelegator();
80  
81      if ( doc == null || doc.getContent() == null ){
82        throw new DocumentFormatException(
83                 "GATE document is null or no content found. Nothing to parse!");
84      }// End if
85  
86      reader = new InputStreamReader(
87               new ByteArrayInputStream(doc.getContent().toString().getBytes()));
88  
89      // create a new Htmldocument handler
90      HtmlDocumentHandler htmlDocHandler = new
91                             HtmlDocumentHandler(doc, this.markupElementsMap);
92      // Create a Status Listener
93      StatusListener statusListener = new StatusListener(){
94        public void statusChanged(String text){
95          fireStatusChanged(text);
96        }
97      };
98      // Register the listener with htmlDocHandler
99      htmlDocHandler.addStatusListener(statusListener);
100     // set repositioning object
101     htmlDocHandler.setRepositioningInfo(repInfo);
102     // set the object with ampersand coding positions
103     htmlDocHandler.setAmpCodingInfo(ampCodingInfo);
104 
105     try{
106       // parse the HTML document
107       parser.parse(reader, htmlDocHandler, true);
108     } catch (IOException e){
109       throw new DocumentFormatException(e);
110     }finally{
111       if (htmlDocHandler != null)
112         htmlDocHandler.removeStatusListener(statusListener);
113     }// End try
114   }//unpackMarkup(doc)
115 
116   /** Initialise this resource, and return it. */
117   public Resource init() throws ResourceInstantiationException{
118     // Register HTML mime type
119     MimeType mime = new MimeType("text","html");
120     // Register the class handler for this mime type
121     mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
122                                                                           this);
123     // Register the mime type with mine string
124     mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
125     // Register file sufixes for this mime type
126     suffixes2mimeTypeMap.put("html",mime);
127     suffixes2mimeTypeMap.put("htm",mime);
128     // Register magic numbers for this mime type
129     magic2mimeTypeMap.put("<html",mime);
130     // Set the mimeType for this language resource
131     setMimeType(mime);
132     return this;
133   }// init()
134 }// class HtmlDocumentFormat
135