package com.digitalpebble.rasp2.parser; import gate.Annotation; import gate.AnnotationSet; import gate.FeatureMap; import gate.Resource; import gate.creole.AbstractLanguageAnalyser; import gate.creole.ExecutionException; import gate.creole.ResourceInstantiationException; import gate.util.InvalidOffsetException; import gate.util.OffsetComparator; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileWriter; import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.StringReader; import java.net.URL; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import org.w3c.dom.Document; import org.xml.sax.InputSource; import com.digitalpebble.util.StreamReader; import com.digitalpebble.util.StreamWriter; import com.digitalpebble.util.Utilities; /** * Wrapper for the RASP Parser, which generates dependencies between Tokens but * also a syntactic analysis. This annotator requires a preliminary analysis * with the POS tagger and morphological analyser */ public class ParserAnnotator extends AbstractLanguageAnalyser { // static String[] outputparamvalues = new String[] { "-oa", "-ot", "-og", // "-otg", "-ogio", "-ogw", "-otgio" }; static String[] outputparamvalues = new String[] {"-og","-ogio", "-ogw" }; // parameters specified via the GUI // one of the values specified above private String outputFormat; private Integer parseNum; private Integer time; private Boolean subcategorisation; private Boolean phrasalVerbs; private String inputASName; private String outputASName; private String charset = "ISO-8859-1"; private boolean debug = false; private URL raspHome = null; // built from the value of rasphome private String parserScript; DocumentBuilder builder; /** * Send each sentence to the parser instead of the entire document Can be * slower but is also safer and avoids memory issues. -1 indicates that the * entire document has to be sent in one batch, 1 or more indicates that n * sentences have to be sent TODO add an option to modify that */ private Integer sentenceBatch = new Integer(-1); public Resource init() throws ResourceInstantiationException { if (getRaspHome() == null) { throw new ResourceInstantiationException(new Exception( "location of rasp not set")); } parserScript = getRaspHome().getFile() + File.separator + "scripts" + File.separator + "rasp_parse.sh"; // check that the file exists File scriptfile = new File(parserScript); if (scriptfile.exists() == false) throw new ResourceInstantiationException(new Exception("Script " + scriptfile.getAbsolutePath() + " does not exist")); DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); try { builder = factory.newDocumentBuilder(); } catch (ParserConfigurationException e) { throw new ResourceInstantiationException(e); } return super.init(); } public void execute() throws ExecutionException { AnnotationSet inputAS = (inputASName == null || inputASName.equals("")) ? document .getAnnotations() : document.getAnnotations(inputASName); AnnotationSet outputAS = (outputASName == null || outputASName.equals("")) ? document .getAnnotations() : document.getAnnotations(outputASName); boolean noWordForms = inputAS.get("WordForm").isEmpty(); if (noWordForms){ System.err.println("RASP Parser needs annotations of type WordForm - skipping document"); } String encoding = getCharset(); if (encoding != null && Charset.isSupported(encoding)) charset = encoding; // check that the output format is in the list boolean inList = java.util.Arrays.asList(outputparamvalues).contains( getOutputFormat()); if (inList == false) { // unknown format -> ignored throw new ExecutionException("Value of output param not allowed [" + getOutputFormat() + "]"); } // do we call the parser for each sentence // or the whole document if (sentenceBatch == 1) { processBySentence(inputAS,outputAS); } else processAll(inputAS,outputAS); } public void processAll(AnnotationSet input, AnnotationSet output) throws ExecutionException { File tempForms; try { tempForms = java.io.File.createTempFile("rasp", ".data"); this.generateInputForParser(input, tempForms, -1); } catch (Exception e) { throw new ExecutionException(e); } // the next step consists in calling the morpho scripts // and modify the annotations in the CAS accordingly ParserXMLoutputAnalyser parser = new ParserXMLoutputAnalyser(input,output); callExternalCommand(parser, tempForms, false); if (!debug) tempForms.delete(); } public void processBySentence(AnnotationSet input, AnnotationSet output) throws ExecutionException { ParserXMLoutputAnalyser parser = new ParserXMLoutputAnalyser(input,output); List sents = new ArrayList(input.get("Sentence")); java.util.Collections.sort(sents, new OffsetComparator()); Iterator sentenceIterator = sents.iterator(); int sentNum = 0; while (sentenceIterator.hasNext()) { sentNum++; sentenceIterator.next(); File tempForms; try { tempForms = java.io.File.createTempFile("rasp", ".data_" + sentNum); this.generateInputForParser(input, tempForms, sentNum); } catch (Exception e) { throw new ExecutionException(e); } // the next step consists in calling the morpho scripts // and modify the annotations in the CAS accordingly callExternalCommand(parser, tempForms, true); if (!debug) tempForms.delete(); } } // We want to generate things like // ^ ^_^:1 // This This_DD1 &rasp_colon;1 // is is_VBZ &rasp_colon;1 // a a_ZZ1 &rasp_colon;4.96223e-05 a_II &rasp_colon;0.000225492 a_AT1 // &rasp_colon;0.999725 // test test_NN1 &rasp_colon;0.994738 test_VV0 &rasp_colon;0.00526216 // @returns a list of Token annotations // @param sentenceNumber = number of the sentence to generate; -1 to // generate them all private List<Annotation> generateInputForParser(AnnotationSet inputAS, File outputFile, int sentenceNumber) throws IOException, InvalidOffsetException { OutputStream fout = new FileOutputStream(outputFile); OutputStreamWriter out = new OutputStreamWriter(fout, charset); BufferedWriter writer = new BufferedWriter(out); // for each sentence dump the information about the WordForms for each // token List sents = new ArrayList(inputAS.get("Sentence")); java.util.Collections.sort(sents, new OffsetComparator()); Iterator sentenceIterator = sents.iterator(); AnnotationSet wfs = inputAS.get("WordForm"); List<Annotation> rewordforms = new ArrayList<Annotation>(wfs.size()); // AnnotationIndex wordformAnnotationIndex = // cas.getAnnotationIndex(WordForm.type); // FSIterator wordFormIterator = wordformAnnotationIndex.iterator(); int sentNum = 0; while (sentenceIterator.hasNext()) { sentNum++; Annotation sentence = (Annotation) sentenceIterator.next(); // check that this is the one we want if (sentenceNumber != -1) { if (sentNum < sentenceNumber) { continue; } if (sentNum > sentenceNumber) { break; } } writer.append("^ ^_^:1\n"); // get the Tokens (or word forms?) located under that sentence ArrayList wordForms = new ArrayList(wfs.getContained(sentence .getStartNode().getOffset(), sentence.getEndNode() .getOffset())); java.util.Collections.sort(wordForms, new OffsetComparator()); Iterator<Annotation> iter = wordForms.iterator(); // create a single entry for word forms located at the same position Long previousstart = null; Long previousend = null; boolean isFirst = true; while (iter.hasNext()) { Annotation a = iter.next(); FeatureMap fm = a.getFeatures(); String form = (String) fm.get("string"); String pos = (String) fm.get("pos"); Double prob = (Double) fm.get("probability"); String lemma = (String) fm.get("lemma"); String suffix = (String) fm.get("suffix"); Long laststartoffset = a.getStartNode().getOffset(); Long lastendoffset = a.getEndNode().getOffset(); // do we have a new entity? if (laststartoffset != previousstart || lastendoffset != previousend) { // finish the line if (isFirst == false) { writer.newLine(); } isFirst = false; // dump the form as found in the text String formToken = getDocument().getContent().getContent( laststartoffset, lastendoffset).toString(); writer.append(formToken); } // add the rest anyway writer.append(" "); writer.append("<w id=\"" + a.getId()).append("\">"); writer.append(lemma); writer.append(suffix).append("_").append(pos); writer.append("</w>"); writer.append(":" + Double.toString(prob)); rewordforms.add(a); previousstart = laststartoffset; previousend = lastendoffset; } writer.newLine(); } writer.flush(); writer.close(); return rewordforms; } private void callExternalCommand(ParserXMLoutputAnalyser parser, File inputFile, boolean perSentence) { File RASPexec = new File(this.parserScript); List params = new ArrayList(); if (time!=null){ params.add( "-t"); params.add(this.time.toString()); } if (this.parseNum!=null){ params.add( "-n"); params.add(this.parseNum.toString()); } params.add(outputFormat); // build the command line String[] parameters = new String[params.size()]; parameters = (String[]) params.toArray(parameters); int lengthParams = parameters.length; // subcategorisation if (subcategorisation.booleanValue() == true) lengthParams++; // no phrasal verbs? if (phrasalVerbs.booleanValue() == false) lengthParams++; // need to add the executable + specify XML format String[] cmdline = new String[lengthParams + 2]; System.arraycopy(parameters, 0, cmdline, 1, parameters.length); cmdline[0] = RASPexec.getAbsolutePath(); if (subcategorisation.booleanValue() == true) cmdline[parameters.length + 1] = "-s"; if (phrasalVerbs.booleanValue() == false) cmdline[parameters.length + 2] = "-x"; cmdline[cmdline.length - 1] = "-y"; // run the lemmer and convert output into annotations Process process = null; Thread sw = null; Thread srt = null; try { process = Runtime.getRuntime().exec(cmdline); // pass the content of the file to the buffer of the process // in a different Thread FileInputStream tempin = new FileInputStream(inputFile); sw = new Thread(new StreamWriter(tempin, process.getOutputStream())); StreamReader sr = new StreamReader(process.getInputStream(), this.charset); sw.start(); // read the information returned by the application // in a different Thread srt = new Thread(sr); srt.start(); // wait for the process to finish process.waitFor(); // need to wait for the reader to get everything else sw.join(); srt.join(); // dump the content of the buffer into a file (for debug) if (debug) dumpToFile(sr.getBuffer()); // get the buffer from the StreamReader String xmloutput = sr.getBuffer().toString(); // updates the cas with the information contained in the XML if (!perSentence) { Document domDoc = this.builder.parse(new InputSource( new StringReader(xmloutput))); parser.parseRASPOutput(domDoc); } else { parser.parseRASPOutputSingleSentence(xmloutput, builder); } } catch (Exception err) { // System.out.println("Problem when calling the executable"); // throw new AnalysisEngineProcessException(err); // Just log the exception message and continue with the annotation // of the documents String message = "Exception thrown in ParserAnnotator: " + err.getMessage(); System.err.println(message); // copy the input file under a different name so that // we can trace the problem File errorFile = new File(inputFile.getAbsolutePath() + ".error"); try { errorFile.createNewFile(); Utilities.copyFile(inputFile, errorFile); } catch (IOException e) { } } finally { // destroy the process process.destroy(); try { sw.join(); srt.join(); } catch (InterruptedException e) { } } } private final void dumpToFile(StringBuffer buffer) { File tempForms; try { tempForms = java.io.File.createTempFile("rasp", ".parse"); BufferedWriter writer = new BufferedWriter( new FileWriter(tempForms)); writer.write(buffer.toString()); writer.close(); } catch (IOException e) { e.printStackTrace(); } } public String getCharset() { return charset; } public void setCharset(String charset) { this.charset = charset; } public String getInputASName() { return inputASName; } public void setInputASName(String inputASName) { this.inputASName = inputASName; } public String getOutputFormat() { return outputFormat; } public void setOutputFormat(String outputFormat) { this.outputFormat = outputFormat; } public Integer getParseNum() { return parseNum; } public void setParseNum(Integer parseNum) { this.parseNum = parseNum; } public Boolean getPhrasalVerbs() { return phrasalVerbs; } public void setPhrasalVerbs(Boolean phrasalVerbs) { this.phrasalVerbs = phrasalVerbs; } public URL getRaspHome() { return raspHome; } public void setRaspHome(URL raspHome) { this.raspHome = raspHome; } public Boolean getSubcategorisation() { return subcategorisation; } public void setSubcategorisation(Boolean subcategorisation) { this.subcategorisation = subcategorisation; } public Integer getTime() { return time; } public void setTime(Integer time) { this.time = time; } public void setDebug(Boolean debug) { this.debug = debug; } public Boolean getDebug() { return debug; } public String getOutputASName() { return outputASName; } public void setOutputASName(String outputASName) { this.outputASName = outputASName; } }