package com.digitalpebble.rasp2.morph; import gate.Annotation; import gate.AnnotationSet; import gate.FeatureMap; import gate.Resource; import gate.creole.AbstractLanguageAnalyser; import gate.creole.ExecutionException; import gate.creole.ResourceInstantiationException; import gate.util.InvalidOffsetException; import gate.util.OffsetComparator; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileWriter; import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.net.URL; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.digitalpebble.util.StreamReader; import com.digitalpebble.util.StreamWriter; /** * Takes a document annotated with Sentences and WordForms gets the * morphological information. WordForms must have : - a String feature - a POS * feature they will get a lemma and a suffix feature * * @author julien */ public class MorphoAnnotator extends AbstractLanguageAnalyser { private String inputASName; private String charset = "ISO-8859-1"; private boolean debug = false; private URL raspHome = null; // these two values are set up dynamically from the rasphome parameter private String lemmatiserExecutable; private String morpherRoot; public Resource init() throws ResourceInstantiationException { if (getRaspHome() == null) { throw new ResourceInstantiationException(new Exception( "location of rasp not set")); } morpherRoot = raspHome.getFile() + File.separator + "morph"; lemmatiserExecutable = "morpha." + com.digitalpebble.util.Utilities.getArch(); // check that the file exists File scriptfile = new File(morpherRoot, lemmatiserExecutable); if (scriptfile.exists() == false) throw new ResourceInstantiationException(new Exception("Script " + scriptfile.getAbsolutePath() + " does not exist")); return super.init(); } public void execute() throws ExecutionException { AnnotationSet inputAS = (inputASName == null || inputASName.equals("")) ? document .getAnnotations() : document.getAnnotations(inputASName); String encoding = getCharset(); if (encoding != null && Charset.isSupported(encoding)) charset = encoding; // at this stage we know we need to analyse // Tokens containing one or more POS tags // the first step consists in generating a text representation // of the input similar to content of RASP .forms files // (which is what the morpher takes as input and not the .tags) File tempForms; List wordFormList = null; try { tempForms = java.io.File.createTempFile("rasp", ".forms"); tempForms.deleteOnExit(); wordFormList = this.generateInputForMorpher(inputAS, tempForms); } catch (Exception e) { throw new ExecutionException(e); } // if no word forms then simply skip that step if (wordFormList.size()==0){ throw new ExecutionException("No annotations of WordForms found"); } // the next step consists in calling the morpho scripts // and modify the annotations in the CAS accordingly callExternalCommand(tempForms, wordFormList); // delete the input file if not in debug mode if (!debug) tempForms.delete(); } // We want to generate things like // ^ ^_^:1 // This This_DD1 &rasp_colon;1 // is is_VBZ &rasp_colon;1 // a a_ZZ1 &rasp_colon;4.96223e-05 a_II &rasp_colon;0.000225492 a_AT1 // &rasp_colon;0.999725 // test test_NN1 &rasp_colon;0.994738 test_VV0 &rasp_colon;0.00526216 // @returns a list of Token annotations private List<Annotation> generateInputForMorpher(AnnotationSet inputAS, File outputFile) throws Exception { OutputStream fout = new FileOutputStream(outputFile); OutputStreamWriter out = new OutputStreamWriter(fout, charset); BufferedWriter writer = new BufferedWriter(out); Iterator sentences = inputAS.get("Sentence").iterator(); AnnotationSet wfs = inputAS.get("WordForm"); List<Annotation> wordforms = new ArrayList<Annotation>(wfs.size()); // * We generate things like a a_AT1 &rasp_colon;0.999748 a_ZZ1 // * &rasp_colon;2.77533e-05 a_II &rasp_colon;0.000223815 while (sentences.hasNext()) { writer.append("^ ^_^:1\n"); Annotation sentence = (Annotation) sentences.next(); AnnotationSet wfinsentence = wfs.getContained(sentence .getStartNode().getOffset(), sentence.getEndNode() .getOffset()); // sort them List<Annotation> sortedWordForms = new ArrayList<Annotation>( wfinsentence); java.util.Collections.sort(sortedWordForms, new OffsetComparator()); Iterator<Annotation> iter = sortedWordForms.iterator(); // create a single entry for word forms located at the same position Long previousstart = null; Long previousend = null; boolean isFirst = true; while (iter.hasNext()) { Annotation a = iter.next(); FeatureMap fm = a.getFeatures(); String form = (String) fm.get("string"); String pos = (String) fm.get("pos"); Double prob = (Double) fm.get("probability"); Long laststartoffset = a.getStartNode().getOffset(); Long lastendoffset = a.getEndNode().getOffset(); // do we have a new entity? if (laststartoffset != previousstart || lastendoffset != previousend) { // finish the line if (isFirst == false) { writer.newLine(); } isFirst = false; // dump the form as found in the text String formToken = getDocument().getContent().getContent( laststartoffset, lastendoffset).toString(); writer.append(formToken); } // add the rest anyway writer.append(" ").append(form).append("_"); writer.append(pos).append(" &rasp_colon;"); writer.append(Double.toString(prob)); wordforms.add(a); previousstart = laststartoffset; previousend = lastendoffset; } writer.newLine(); } writer.flush(); writer.close(); return wordforms; } private void callExternalCommand(File tempToken, List tokenList) throws ExecutionException { File lemmRoot = new File(this.morpherRoot); File lemmer = new File(lemmRoot, this.lemmatiserExecutable); // parameters : -actf /usr/local/bin/RASP/verbstem.list String[] cmdline = new String[3]; cmdline[0] = lemmer.getAbsolutePath(); cmdline[1] = "-actf"; cmdline[2] = "verbstem.list"; // run the lemmer and convert output into annotations Process process = null; Thread sw = null; Thread srt = null; try { process = Runtime.getRuntime().exec(cmdline, null, lemmRoot); // pass the content of the file to the buffer of the process // in a different Thread FileInputStream tempin = new FileInputStream(tempToken); sw = new Thread(new StreamWriter(tempin, process.getOutputStream())); StreamReader sr = new StreamReader(process.getInputStream()); sw.start(); // read the information returned by the application // in a different Thread srt = new Thread(sr); srt.start(); // wait for the process to finish process.waitFor(); // need to wait for the reader to get everything else sw.join(); srt.join(); // dump the content of the buffer into a file (for debug) if (debug) dumpToFile(sr.getBuffer()); // get the buffer from the StreamReader processTokens(sr.getBuffer().toString(), tokenList); } catch (Exception err) { System.out.println("Problem when calling the executable"); throw new ExecutionException(err); } finally { // destroy the process process.destroy(); try { sw.join(); srt.join(); } catch (InterruptedException e) { } } } private void processTokens(String buffer, List wordForms) throws ExecutionException, InvalidOffsetException { String[] lines = buffer.split("\n"); Iterator<Annotation> iter = wordForms.iterator(); Pattern splitpattern = Pattern .compile(" (.+?)_(.+?) &rasp_colon;(.+?)"); String line = null; // read output from lemmer for (int i = 0; i < lines.length; i++) { line = lines[i]; // skip sentence separators if (line.startsWith("^ ^_^")) continue; if (iter.hasNext() == false) { // out of sync throw new ExecutionException( "Impossible to synchronise tokens with output of POS tagger"); } // extract information about lemmas and suffix // e.g. a a_AT1 &rasp_colon;0.997075 a_ZZ1 &rasp_colon;0.0022923 // a_II &rasp_colon;0.00063225 // and store it in the WordForms // detect patterns in the line Matcher match = splitpattern.matcher(line); while (match.find()) { // we want match 1 String lemma = match.group(1); String suffix = ""; // separate the lemma from the suffix int suffix_offset = lemma.lastIndexOf("+"); if (suffix_offset > 0) { suffix = lemma.substring(suffix_offset); lemma = lemma.substring(0, suffix_offset); } Annotation wf = iter.next(); wf.getFeatures().put("lemma", lemma); wf.getFeatures().put("suffix", suffix); } } // more tokens available? if (iter.hasNext()) { // out of synch throw new ExecutionException( "Impossible to synchronise tokens with output of POS tagger"); } } private final void dumpToFile(StringBuffer buffer) { File tempForms; try { tempForms = java.io.File.createTempFile("rasp", ".lemmas"); BufferedWriter writer = new BufferedWriter( new FileWriter(tempForms)); writer.write(buffer.toString()); writer.close(); } catch (IOException e) { e.printStackTrace(); } } public String getCharset() { return charset; } public void setCharset(String charset) { this.charset = charset; } public Boolean getDebug() { return new Boolean(debug); } public void setDebug(Boolean debug) { this.debug = debug; } public String getInputASName() { return inputASName; } public void setInputASName(String inputASName) { this.inputASName = inputASName; } public URL getRaspHome() { return raspHome; } public void setRaspHome(URL raspHome) { this.raspHome = raspHome; } }