package com.digitalpebble.rasp2.parser; import gate.Annotation; import gate.AnnotationSet; import gate.Factory; import gate.FeatureMap; import gate.util.OffsetComparator; import java.io.StringReader; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import javax.xml.parsers.DocumentBuilder; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; public class ParserXMLoutputAnalyser { private AnnotationSet outputAS; Iterator sentenceIterator; AnnotationSet wordformAS; ParserXMLoutputAnalyser(AnnotationSet in, AnnotationSet out) { this.outputAS = out; List sents = new ArrayList(in.get("Sentence")); java.util.Collections.sort(sents, new OffsetComparator()); sentenceIterator = sents.iterator(); wordformAS = in.get("WordForm"); } // we have obtained a RASP XML output which we will parse public void parseRASPOutput(Document rasp) throws Exception { NodeList nl = rasp.getElementsByTagName("sentence"); for (int i = 0; i < nl.getLength(); i++) { Element sentenceElement = (Element) nl.item(i); Annotation sentence = (Annotation) sentenceIterator.next(); parseRASPSentence(sentenceElement, sentence); } } /*************************************************************************** * Parse a specific sentence - this is used when the sentences are sent to * the parser one by one **************************************************************************/ public void parseRASPOutputSingleSentence(String input, DocumentBuilder builder) throws Exception { // we get the next sentence in any case - even if this one crashes we want // to be able to process the next one Annotation sentence = (Annotation) sentenceIterator.next(); Document rasp = builder.parse(new InputSource(new StringReader(input))); NodeList nl = rasp.getElementsByTagName("sentence"); Element sentenceElement = (Element) nl.item(0); parseRASPSentence(sentenceElement, sentence); } private void parseRASPSentence(Element sentence, Annotation sentenceAnn) throws Exception { // matches the lemmas returned by the parser with // original annotations in cas HashMap<String, Annotation> mappingIntegerAnnotation = new HashMap<String, Annotation>(); HashMap<String, Annotation> tempmappingUIMAidAnnotation = new HashMap<String, Annotation>(); // for this sentence => get all the wordForms // and put them in the map mappingIntegerAnnotation Iterator allwf = wordformAS.getContained(sentenceAnn.getStartNode().getOffset(), sentenceAnn.getEndNode().getOffset()).iterator(); while (allwf.hasNext()) { Annotation wordForm = (Annotation) allwf.next(); tempmappingUIMAidAnnotation.put(Integer.toString(wordForm .getId()), wordForm); } NodeList nl = sentence.getElementsByTagName("lemma-list"); if (nl.getLength() > 0) { // get all the lemma elements NodeList lemmalist = sentence.getElementsByTagName("lemma"); for (int lemmanum = 0; lemmalist != null && lemmanum < lemmalist.getLength(); lemmanum++) { Element lemma = (Element) lemmalist.item(lemmanum); // the the value of the attributes num and wtag String num = lemma.getAttribute("num"); String wtag = lemma.getAttribute("wtag"); wtag = wtag.substring(7); int pos = wtag.indexOf("\""); wtag = wtag.substring(0, pos); // get the corresponding annotation from the cas Annotation wf = tempmappingUIMAidAnnotation.get(wtag); if (wf == null) throw new Exception("No annotation found for " + lemma.toString()); mappingIntegerAnnotation.put(num, wf); } } // now that we have the mappings between the numbers returned by RASP // and the original annotations we can analyse the information about the // dependencies NodeList grl = sentence.getElementsByTagName("gr-list"); if (grl.getLength() > 0) { // get all the gr elements elements NodeList grlist = sentence.getElementsByTagName("gr"); for (int grnum = 0; grlist != null && grnum < grlist.getLength(); grnum++) { Element gr = (Element) grlist.item(grnum); String grtype = gr.getAttribute("type"); String grsubtype = gr.getAttribute("subtype"); // TODO fix problem in RASP instead of patching output if (grsubtype.startsWith("<w id=")) grsubtype = ""; String head = gr.getAttribute("head"); String dep = gr.getAttribute("dep"); Annotation wfhead = mappingIntegerAnnotation.get(head); Annotation wfdep = mappingIntegerAnnotation.get(dep); if (wfdep == null && wfhead == null) throw new Exception("No head and no dep for this relation"); // some relations such as 'passive' don't have a dep if (wfdep == null) { FeatureMap fm = Factory.newFeatureMap(); fm.put("head", wfhead); fm.put("type", grtype); fm.put("subtype", grsubtype); outputAS.add(wfhead.getStartNode().getOffset(), wfhead.getEndNode().getOffset(), "Dependency", fm); continue; } // other relations such as 'ta' don't have a head if (wfhead == null) { FeatureMap fm = Factory.newFeatureMap(); fm.put("dep", wfdep); fm.put("type", grtype); fm.put("subtype", grsubtype); outputAS.add(wfdep.getStartNode().getOffset(), wfdep.getEndNode().getOffset(), "Dependency", fm); continue; } // create a new annotation of type Dependency // and add it to the cas Long startDependency = wfhead.getStartNode().getOffset(); Long endDependency = wfdep.getEndNode().getOffset(); if (wfdep.getStartNode().getOffset() < startDependency) { startDependency = wfdep.getStartNode().getOffset(); endDependency = wfhead.getEndNode().getOffset(); } FeatureMap fm = Factory.newFeatureMap(); fm.put("dep", wfdep); fm.put("head", wfhead); fm.put("type", grtype); fm.put("subtype", grsubtype); outputAS.add(startDependency, endDependency, "Dependency", fm); } } } }