1
15
16 package gate.corpora;
17
18 import java.io.*;
20 import java.net.URLConnection;
21
22 import javax.xml.parsers.*;
23
24 import org.xml.sax.InputSource;
25 import org.xml.sax.SAXException;
26
27 import gate.*;
28 import gate.creole.ResourceInstantiationException;
29 import gate.event.StatusListener;
30 import gate.util.DocumentFormatException;
31 import gate.util.Out;
32 import gate.xml.*;
33
35
44 public class XmlDocumentFormat extends TextualDocumentFormat
45 {
46
47 private static final boolean DEBUG = false;
48
49
50 public XmlDocumentFormat() { super(); }
51
52
53 public Boolean supportsRepositioning() {
54 return new Boolean(true);
55 }
57
58 public void unpackMarkup(Document doc) throws DocumentFormatException {
59 unpackMarkup(doc, (RepositioningInfo) null, (RepositioningInfo) null);
60 }
62
63
78 public void unpackMarkup(Document doc, RepositioningInfo repInfo,
79 RepositioningInfo ampCodingInfo) throws DocumentFormatException {
80 if( (doc == null) ||
81 (doc.getSourceUrl() == null && doc.getContent() == null)){
82
83 throw new DocumentFormatException(
84 "GATE document is null or no content found. Nothing to parse!");
85 }
87 boolean docHasContentButNoValidURL = false;
88 try{
91 if (doc.getSourceUrl() == null && doc.getContent() != null){
92 docHasContentButNoValidURL = true;
94 }else {URLConnection conn = doc.getSourceUrl().openConnection();}
95 }catch (IOException ex1){
96 if(doc.getContent() == null)
98 throw new DocumentFormatException("The document doesn't have a" +
100 " valid URL and also no content");
101 docHasContentButNoValidURL = true;
102 }
104 StatusListener statusListener = new StatusListener(){
106 public void statusChanged(String text){
107 fireStatusChanged(text);
109 }
110 };
111 GateFormatXmlDocumentHandler gateXmlHandler = null;
112 XmlDocumentHandler xmlDocHandler = null;
113 if (docHasContentButNoValidURL)
114 parseDocumentWithoutURL(doc, repInfo, ampCodingInfo);
115 else try {
116 SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
121 saxParserFactory.setValidating(false);
124 saxParserFactory.setNamespaceAware(true);
126 SAXParser xmlParser = saxParserFactory.newSAXParser();
128 if (isGateXmlDocument){
129 gateXmlHandler = new GateFormatXmlDocumentHandler(doc);
131 gateXmlHandler.addStatusListener(statusListener);
133 xmlParser.parse(doc.getSourceUrl().toString(), gateXmlHandler);
135 gateXmlHandler.removeStatusListener(statusListener);
136 }else{
137 xmlDocHandler = new XmlDocumentHandler( doc,
139 this.markupElementsMap,
140 this.element2StringMap);
141 xmlDocHandler.addStatusListener(statusListener);
143 xmlDocHandler.setRepositioningInfo(repInfo);
145 xmlDocHandler.setAmpCodingInfo(ampCodingInfo);
147
148
152
155 org.xml.sax.XMLReader newxmlParser = xmlParser.getXMLReader();
156 newxmlParser.setFeature("http://xml.org/sax/features/validation", false);
162 newxmlParser.setFeature("http://xml.org/sax/features/namespaces", true);
165 newxmlParser.setFeature("http://xml.org/sax/features/namespace-prefixes", true);
166 newxmlParser.setContentHandler(xmlDocHandler);
167 newxmlParser.setErrorHandler(xmlDocHandler);
168 newxmlParser.setDTDHandler(xmlDocHandler);
169 newxmlParser.setEntityResolver(xmlDocHandler);
170 newxmlParser.parse(doc.getSourceUrl().toString());
171 ((DocumentImpl) doc).setNextAnnotationId(
173 xmlDocHandler.getCustomObjectsId());
174 xmlDocHandler.removeStatusListener(statusListener);
175 } } catch (ParserConfigurationException e){
177 throw
178 new DocumentFormatException("XML parser configuration exception ", e);
179 } catch (SAXException e){
180 doc.getFeatures().put("parsingError", new Boolean(true));
181
182 Boolean bThrow = (Boolean)
183 doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
184
185 if(bThrow != null && bThrow.booleanValue()) {
186 throw new DocumentFormatException(e);
188 }
189 else {
190 Out.println("Warning: Document remains unparsed. \n"
191 +"\n Stack Dump: ");
192 e.printStackTrace(Out.getPrintWriter());
193 }
195 } catch (IOException e){
196 throw new DocumentFormatException("I/O exception for " +
197 doc.getSourceUrl().toString());
198 }finally{
199 if(gateXmlHandler != null)
200 gateXmlHandler.removeStatusListener(statusListener);
201 if (xmlDocHandler != null)
202 xmlDocHandler.removeStatusListener(statusListener);
203 } }
206
209 private void parseDocumentWithoutURL(gate.Document aDocument,
210 RepositioningInfo repInfo,
211 RepositioningInfo ampCodingInfo)
212 throws DocumentFormatException {
213
214 XmlDocumentHandler xmlDocHandler = null;
215 StatusListener statusList = new StatusListener(){
217 public void statusChanged(String text){
218 fireStatusChanged(text);
220 }
221 };
222 try{
223 Reader reader = new StringReader(aDocument.getContent().toString());
224 InputSource is = new InputSource(reader);
230
231
232 SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
237 saxParserFactory.setValidating(false);
240 saxParserFactory.setNamespaceAware(true);
242 SAXParser xmlParser = saxParserFactory.newSAXParser();
244
245 xmlDocHandler = new XmlDocumentHandler(aDocument,
247 this.markupElementsMap,
248 this.element2StringMap);
249 xmlDocHandler.addStatusListener(statusList);
251 xmlDocHandler.setRepositioningInfo(repInfo);
253 xmlDocHandler.setAmpCodingInfo(ampCodingInfo);
255
259
260 org.xml.sax.XMLReader newxmlParser = xmlParser.getXMLReader();
263 newxmlParser.setFeature("http://xml.org/sax/features/validation", false);
269 newxmlParser.setFeature("http://xml.org/sax/features/namespaces", true);
272 newxmlParser.setFeature("http://xml.org/sax/features/namespace-prefixes", true);
273 newxmlParser.setContentHandler(xmlDocHandler);
274 newxmlParser.setErrorHandler(xmlDocHandler);
275 newxmlParser.setDTDHandler(xmlDocHandler);
276 newxmlParser.setEntityResolver(xmlDocHandler);
277 newxmlParser.parse(is);
278
280 ((DocumentImpl) aDocument).setNextAnnotationId(
281 xmlDocHandler.getCustomObjectsId());
282 } catch (ParserConfigurationException e){
283 throw new DocumentFormatException(
284 "XML parser configuration exception ", e);
285 } catch (SAXException e){
286 throw new DocumentFormatException(e);
287 } catch (IOException e){
288 throw new DocumentFormatException(e);
289 }finally{
290 xmlDocHandler.removeStatusListener(statusList);
292 } }
295
296 public Resource init() throws ResourceInstantiationException{
297 MimeType mime = new MimeType("text","xml");
299 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
301 this);
302 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
304 mimeString2mimeTypeMap.put("application/xml", mime);
306 suffixes2mimeTypeMap.put("xml",mime);
308 suffixes2mimeTypeMap.put("xhtm",mime);
309 suffixes2mimeTypeMap.put("xhtml",mime);
310 magic2mimeTypeMap.put("<?xml",mime);
312 setMimeType(mime);
314 return this;
315 }
317 }