|
XmlDocumentFormat |
|
1 /* 2 * XmlDocumentFormat.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Cristian URSU, 26/May/2000 12 * 13 * $Id: XmlDocumentFormat.java,v 1.43 2003/01/29 16:42:44 valyt Exp $ 14 */ 15 16 package gate.corpora; 17 18 //import com.sun.xml.parser.* ; 19 import java.util.*; 20 import java.io.*; 21 import java.net.*; 22 23 import gate.util.*; 24 import gate.*; 25 import gate.xml.*; 26 import gate.event.*; 27 import gate.creole.*; 28 29 // xml tools 30 import javax.xml.parsers.*; 31 import org.xml.sax.*; 32 import org.xml.sax.helpers.*; 33 import org.w3c.www.mime.*; 34 35 /** The format of Documents. Subclasses of DocumentFormat know about 36 * particular MIME types and how to unpack the information in any 37 * markup or formatting they contain into GATE annotations. Each MIME 38 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat, 39 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves 40 * with a static index residing here when they are constructed. Static 41 * getDocumentFormat methods can then be used to get the appropriate 42 * format class for a particular document. 43 */ 44 public class XmlDocumentFormat extends TextualDocumentFormat 45 { 46 /** Debug flag */ 47 private static final boolean DEBUG = false; 48 49 /** Default construction */ 50 public XmlDocumentFormat() { super(); } 51 52 /** We could collect repositioning information during XML parsing */ 53 public Boolean supportsRepositioning() { 54 return new Boolean(true); 55 } // supportsRepositioning 56 57 /** Old style of unpackMarkup (without collecting of RepositioningInfo) */ 58 public void unpackMarkup(Document doc) throws DocumentFormatException { 59 unpackMarkup(doc, (RepositioningInfo) null, (RepositioningInfo) null); 60 } // unpackMarkup 61 62 63 /** Unpack the markup in the document. This converts markup from the 64 * native format (e.g. XML) into annotations in GATE format. 65 * Uses the markupElementsMap to determine which elements to convert, and 66 * what annotation type names to use. If the document was created from a 67 * String, then is recomandable to set the doc's sourceUrl to <b>null</b>. 68 * So, if the document has a valid URL, then the parser will try to 69 * parse the XML document pointed by the URL.If the URL is not valid, or 70 * is null, then the doc's content will be parsed. If the doc's content is 71 * not a valid XML then the parser might crash. 72 * 73 * @param Document doc The gate document you want to parse. If 74 * <code>doc.getSourceUrl()</code> returns <b>null</b> then the content of 75 * doc will be parsed. Using a URL is recomended because the parser will 76 * report errors corectlly if the XML document is not well formed. 77 */ 78 public void unpackMarkup(Document doc, RepositioningInfo repInfo, 79 RepositioningInfo ampCodingInfo) throws DocumentFormatException { 80 if( (doc == null) || 81 (doc.getSourceUrl() == null && doc.getContent() == null)){ 82 83 throw new DocumentFormatException( 84 "GATE document is null or no content found. Nothing to parse!"); 85 }// End if 86 87 boolean docHasContentButNoValidURL = false; 88 // This is a test to see if the GATE document has a valid URL or a valid 89 // content. If doesn't has a valid URL then try to parse its content as XML 90 try{ 91 if (doc.getSourceUrl() == null && doc.getContent() != null){ 92 // The doc's url is null but there is a content. 93 docHasContentButNoValidURL = true; 94 }else {URLConnection conn = doc.getSourceUrl().openConnection();} 95 }catch (IOException ex1){ 96 // The URL is not null but is not valid. 97 if(doc.getContent() == null) 98 // The document content is also null. There is nothing we can do. 99 throw new DocumentFormatException("The document doesn't have a" + 100 " valid URL and also no content"); 101 docHasContentButNoValidURL = true; 102 }// End try 103 104 // Create a status listener 105 StatusListener statusListener = new StatusListener(){ 106 public void statusChanged(String text){ 107 // This is implemented in DocumentFormat.java and inherited here 108 fireStatusChanged(text); 109 } 110 }; 111 GateFormatXmlDocumentHandler gateXmlHandler = null; 112 XmlDocumentHandler xmlDocHandler = null; 113 if (docHasContentButNoValidURL) 114 parseDocumentWithoutURL(doc, repInfo, ampCodingInfo); 115 else try { 116 // use Excerces XML parser with JAXP 117 // System.setProperty("javax.xml.parsers.SAXParserFactory", 118 // "org.apache.xerces.jaxp.SAXParserFactoryImpl"); 119 // Get a parser factory. 120 SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); 121 // Set up the factory to create the appropriate type of parser 122 // non validating one 123 saxParserFactory.setValidating(false); 124 // non namesapace aware one 125 saxParserFactory.setNamespaceAware(true); 126 // create it 127 SAXParser xmlParser = saxParserFactory.newSAXParser(); 128 if (isGateXmlDocument){ 129 // Construct the appropiate xml handler for the job. 130 gateXmlHandler = new GateFormatXmlDocumentHandler(doc); 131 // Register a status listener 132 gateXmlHandler.addStatusListener(statusListener); 133 // Parse the Gate Document 134 xmlParser.parse(doc.getSourceUrl().toString(), gateXmlHandler); 135 gateXmlHandler.removeStatusListener(statusListener); 136 }else{ 137 // Create a new Xml document handler 138 xmlDocHandler = new XmlDocumentHandler( doc, 139 this.markupElementsMap, 140 this.element2StringMap); 141 // Register a status listener with it 142 xmlDocHandler.addStatusListener(statusListener); 143 // set repositioning object 144 xmlDocHandler.setRepositioningInfo(repInfo); 145 // set the object with ampersand coding positions 146 xmlDocHandler.setAmpCodingInfo(ampCodingInfo); 147 148 // Parse the document handler 149 /* Angel 150 xmlParser.parse(doc.getSourceUrl().toString(), xmlDocHandler ); 151 Angel */ 152 // try to choose concret parser (Xerces) 153 // Angel - start 154 org.apache.xerces.parsers.SAXParser newxmlParser = 155 new org.apache.xerces.parsers.SAXParser(); 156 // Set up the factory to create the appropriate type of parser 157 // non validating one 158 // http://xml.org/sax/features/validation set to false 159 newxmlParser.setFeature("http://xml.org/sax/features/validation", false); 160 // namesapace aware one 161 // http://xml.org/sax/features/namespaces set to true 162 newxmlParser.setFeature("http://xml.org/sax/features/namespaces", true); 163 newxmlParser.setContentHandler(xmlDocHandler); 164 newxmlParser.setErrorHandler(xmlDocHandler); 165 newxmlParser.setDTDHandler(xmlDocHandler); 166 newxmlParser.setEntityResolver(xmlDocHandler); 167 newxmlParser.setReaderFactory(new StreamingCharFactory()); 168 newxmlParser.parse(doc.getSourceUrl().toString()); 169 // Angel - end 170 ((DocumentImpl) doc).setNextAnnotationId( 171 xmlDocHandler.getCustomObjectsId()); 172 xmlDocHandler.removeStatusListener(statusListener); 173 }// End if 174 } catch (ParserConfigurationException e){ 175 throw 176 new DocumentFormatException("XML parser configuration exception ", e); 177 } catch (SAXException e){ 178 doc.getFeatures().put("parsingError", new Boolean(true)); 179 180 Boolean bThrow = (Boolean) 181 doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME); 182 183 if(bThrow != null && bThrow.booleanValue()) { 184 // the next line is commented to avoid Document creation fail on error 185 throw new DocumentFormatException(e); 186 } 187 else { 188 Out.println("Warning: Document remains unparsed. \n" 189 +"\n Stack Dump: "); 190 e.printStackTrace(Out.getPrintWriter()); 191 } // if 192 193 } catch (IOException e){ 194 throw new DocumentFormatException("I/O exception for " + 195 doc.getSourceUrl().toString()); 196 }finally{ 197 if(gateXmlHandler != null) 198 gateXmlHandler.removeStatusListener(statusListener); 199 if (xmlDocHandler != null) 200 xmlDocHandler.removeStatusListener(statusListener); 201 }// End if else try 202 }// unpackMarkup 203 204 /** Called from unpackMarkup() if the document have been created from a 205 * string 206 */ 207 private void parseDocumentWithoutURL(gate.Document aDocument, 208 RepositioningInfo repInfo, 209 RepositioningInfo ampCodingInfo) 210 throws DocumentFormatException { 211 212 XmlDocumentHandler xmlDocHandler = null; 213 // Create a status listener 214 StatusListener statusList = new StatusListener(){ 215 public void statusChanged(String text){ 216 // this is implemented in DocumentFormat.java and inherited here 217 fireStatusChanged(text); 218 } 219 }; 220 try{ 221 Reader reader = new StringReader(aDocument.getContent().toString()); 222 // 223 // 224 // new InputStreamReader( 225 // new ByteArrayInputStream(aDocument.getContent().toString().getBytes("UTF-8")), 226 // "UTF-8"); 227 InputSource is = new InputSource(reader); 228 229 230 // use Excerces XML parser with JAXP 231 // System.setProperty("javax.xml.parsers.SAXParserFactory", 232 // "org.apache.xerces.jaxp.SAXParserFactoryImpl"); 233 // Get a parser factory. 234 SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); 235 // Set up the factory to create the appropriate type of parser 236 // non validating one 237 saxParserFactory.setValidating(false); 238 // non namesapace aware one 239 saxParserFactory.setNamespaceAware(true); 240 // create it 241 SAXParser xmlParser = saxParserFactory.newSAXParser(); 242 243 // create a new Xml document handler 244 xmlDocHandler = new XmlDocumentHandler(aDocument, 245 this.markupElementsMap, 246 this.element2StringMap); 247 // Regsiter the statusListener with xmlDocHandler 248 xmlDocHandler.addStatusListener(statusList); 249 // set repositioning object 250 xmlDocHandler.setRepositioningInfo(repInfo); 251 // set the object with ampersand coding positions 252 xmlDocHandler.setAmpCodingInfo(ampCodingInfo); 253 // Parse the document handler 254 /* Angel 255 // xmlParser.parse(is, xmlDocHandler); 256 Angel */ 257 258 // Angel - start 259 // try to choose concret parser 260 org.apache.xerces.parsers.SAXParser newxmlParser = 261 new org.apache.xerces.parsers.SAXParser(); 262 // Set up the factory to create the appropriate type of parser 263 // non validating one 264 // http://xml.org/sax/features/validation set to false 265 newxmlParser.setFeature("http://xml.org/sax/features/validation", false); 266 // namesapace aware one 267 // http://xml.org/sax/features/namespaces set to true 268 newxmlParser.setFeature("http://xml.org/sax/features/namespaces", true); 269 newxmlParser.setContentHandler(xmlDocHandler); 270 newxmlParser.setErrorHandler(xmlDocHandler); 271 newxmlParser.setDTDHandler(xmlDocHandler); 272 newxmlParser.setEntityResolver(xmlDocHandler); 273 newxmlParser.setReaderFactory(new StreamingCharFactory()); 274 newxmlParser.parse(is); 275 // Angel - end 276 277 ((DocumentImpl) aDocument).setNextAnnotationId( 278 xmlDocHandler.getCustomObjectsId()); 279 } catch (ParserConfigurationException e){ 280 throw new DocumentFormatException( 281 "XML parser configuration exception ", e); 282 } catch (SAXException e){ 283 throw new DocumentFormatException(e); 284 } catch (IOException e){ 285 throw new DocumentFormatException(e); 286 }finally{ 287 // Remove the statusListener with xmlDocHandler 288 xmlDocHandler.removeStatusListener(statusList); 289 }// End try 290 }// End parseDocumentWithoutURL() 291 292 /** Initialise this resource, and return it. */ 293 public Resource init() throws ResourceInstantiationException{ 294 // Register XML mime type 295 MimeType mime = new MimeType("text","xml"); 296 // Register the class handler for this mime type 297 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(), 298 this); 299 // Register the mime type with mine string 300 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime); 301 //sometimes XML file appear as application/xml 302 mimeString2mimeTypeMap.put("application/xml", mime); 303 // Register file sufixes for this mime type 304 suffixes2mimeTypeMap.put("xml",mime); 305 suffixes2mimeTypeMap.put("xhtm",mime); 306 suffixes2mimeTypeMap.put("xhtml",mime); 307 // Register magic numbers for this mime type 308 magic2mimeTypeMap.put("<?xml",mime); 309 // Set the mimeType for this language resource 310 setMimeType(mime); 311 return this; 312 }// init() 313 314 }//class XmlDocumentFormat 315
|
XmlDocumentFormat |
|