|
SgmlDocumentFormat |
|
1 /* 2 * SgmlDocumentFormat.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Cristian URSU, 4/July/2000 12 * 13 * $Id: SgmlDocumentFormat.java,v 1.25 2001/11/30 14:38:44 cursu Exp $ 14 */ 15 16 package gate.corpora; 17 18 import java.util.*; 19 import java.io.*; 20 import java.net.*; 21 22 import gate.util.*; 23 import gate.*; 24 import gate.sgml.*; 25 import gate.event.*; 26 import gate.xml.*; 27 import gate.creole.*; 28 29 import org.w3c.www.mime.*; 30 // xml tools 31 import javax.xml.parsers.*; 32 import org.xml.sax.*; 33 34 /** The format of Documents. Subclasses of DocumentFormat know about 35 * particular MIME types and how to unpack the information in any 36 * markup or formatting they contain into GATE annotations. Each MIME 37 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat, 38 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves 39 * with a static index residing here when they are constructed. Static 40 * getDocumentFormat methods can then be used to get the appropriate 41 * format class for a particular document. 42 */ 43 public class SgmlDocumentFormat extends TextualDocumentFormat 44 { 45 /** Debug flag */ 46 private static final boolean DEBUG = false; 47 48 /** Default construction */ 49 public SgmlDocumentFormat() { super(); } 50 51 /** Unpack the markup in the document. This converts markup from the 52 * native format (e.g. SGML) into annotations in GATE format. 53 * Uses the markupElementsMap to determine which elements to convert, and 54 * what annotation type names to use. 55 * The doc's content is first converted to a wel formed XML. 56 * If this succeddes then the document is saved into a temp file and parsed 57 * as an XML document. 58 * 59 * @param Document doc The gate document you want to parse. 60 * 61 */ 62 public void unpackMarkup(Document doc) throws DocumentFormatException{ 63 if ( (doc == null) || 64 (doc.getSourceUrl() == null && doc.getContent() == null)){ 65 66 throw new DocumentFormatException( 67 "GATE document is null or no content found. Nothing to parse!"); 68 }// End if 69 // Create a status listener 70 StatusListener statusListener = new StatusListener(){ 71 public void statusChanged(String text){ 72 fireStatusChanged(text); 73 } 74 }; 75 XmlDocumentHandler xmlDocHandler = null; 76 try { 77 Sgml2Xml sgml2Xml = new Sgml2Xml(doc); 78 79 fireStatusChanged("Performing SGML to XML..."); 80 81 // convert the SGML document 82 String xmlUri = sgml2Xml.convert(); 83 84 fireStatusChanged("DONE !"); 85 86 //Out.println("Conversion done..." + xmlUri); 87 //Out.println(sgml2Xml.convert()); 88 // Get a parser factory. 89 SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); 90 // Set up the factory to create the appropriate type of parser 91 92 // Set up the factory to create the appropriate type of parser 93 // non validating one 94 saxParserFactory.setValidating(false); 95 // non namesapace aware one 96 saxParserFactory.setNamespaceAware(true); 97 98 // Create a SAX parser 99 SAXParser parser = saxParserFactory.newSAXParser(); 100 101 // use it 102 if (null != doc){ 103 // create a new Xml document handler 104 xmlDocHandler = new XmlDocumentHandler(doc, 105 this.markupElementsMap, 106 this.element2StringMap); 107 108 // register a status listener with it 109 xmlDocHandler.addStatusListener(statusListener); 110 111 parser.parse(xmlUri, xmlDocHandler); 112 ((DocumentImpl) doc).setNextAnnotationId( 113 xmlDocHandler.getCustomObjectsId()); 114 }// end if 115 } catch (ParserConfigurationException e){ 116 throw 117 new DocumentFormatException("XML parser configuration exception ", e); 118 } catch (SAXException e){ 119 throw new DocumentFormatException(e); 120 } catch (IOException e){ 121 throw new DocumentFormatException("I/O exception for " + 122 doc.getSourceUrl().toString()); 123 }finally{ 124 if (xmlDocHandler != null) 125 xmlDocHandler.removeStatusListener(statusListener); 126 }// End try 127 128 }// unpackMarkup 129 130 /** This method converts the document's content from SGML 2 XML.*/ 131 private String sgml2Xml(Document doc) { 132 String xmlUri = doc.getSourceUrl().toString (); 133 134 return xmlUri; 135 }// sgml2Xml() 136 137 /** Initialise this resource, and return it. */ 138 public Resource init() throws ResourceInstantiationException{ 139 // Register SGML mime type 140 MimeType mime = new MimeType("text","sgml"); 141 // Register the class handler for this mime type 142 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(), 143 this); 144 // Register the mime type with mine string 145 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime); 146 // Register file sufixes for this mime type 147 suffixes2mimeTypeMap.put("sgm",mime); 148 suffixes2mimeTypeMap.put("sgml",mime); 149 setMimeType(mime); 150 return this; 151 }// init 152 153 }//class SgmlDocumentFormat 154
|
SgmlDocumentFormat |
|