// GATE.ac.uk - gate/plugins/Parser_RASP/src/com/digitalpebble/rasp2/parser/ParserAnnotator.java

package com.digitalpebble.rasp2.parser;

import gate.Annotation;
import gate.AnnotationSet;
import gate.FeatureMap;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.util.InvalidOffsetException;
import gate.util.OffsetComparator;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.w3c.dom.Document;
import org.xml.sax.InputSource;

import com.digitalpebble.util.StreamReader;
import com.digitalpebble.util.StreamWriter;
import com.digitalpebble.util.Utilities;

/**
 * Wrapper for the RASP Parser, which generates dependencies between Tokens but
 * also a syntactic analysis. This annotator requires a preliminary analysis
 * with the POS tagger and morphological analyser
 */
public class ParserAnnotator extends AbstractLanguageAnalyser {

	// static String[] outputparamvalues = new String[] { "-oa", "-ot", "-og",
	// 		"-otg", "-ogio", "-ogw", "-otgio" };

	// output format switches accepted by execute(); any other value of
	// outputFormat is rejected there with an ExecutionException
	static String[] outputparamvalues = new String[] {"-og","-ogio", "-ogw" };
	
	// parameters specified via the GUI

	// one of the values specified above; passed as-is on the command line of
	// the parser script
	private String outputFormat;

	// when set, passed to the parser script as the value of the -n switch
	private Integer parseNum;

	// when set, passed to the parser script as the value of the -t switch
	private Integer time;

	// when true, the -s switch is added to the command line
	private Boolean subcategorisation;

	// when false, the -x switch is added to the command line
	private Boolean phrasalVerbs;

	// name of the annotation set holding the Sentence and WordForm
	// annotations; null or empty selects the default annotation set
	private String inputASName;

	// name of the annotation set handed to the output analyser; null or
	// empty selects the default annotation set
	private String outputASName;

	// encoding used to write the parser input file and to decode the output
	// of the parser process
	private String charset = "ISO-8859-1";

	// when true, the temporary input files are kept and the raw output of
	// the parser is dumped into a temporary file
	private boolean debug = false;

	// location of the RASP installation; init() fails when it is not set
	private URL raspHome = null;

	// built from the value of rasphome: <raspHome>/scripts/rasp_parse.sh
	private String parserScript;

	// created in init() and used to parse the XML returned by the parser
	DocumentBuilder builder;

	/**
	 * Controls whether the parser is called once per sentence or once for the
	 * entire document. Calling it per sentence can be slower but is also safer
	 * and avoids memory issues. Only the value 1 currently triggers the
	 * per-sentence mode in {@link #execute()}; any other value (including the
	 * default -1) sends the entire document in one batch.
	 * TODO add an option to modify that and support batches of n sentences
	 */
	private Integer sentenceBatch = Integer.valueOf(-1);

	/**
	 * Checks the configuration of the resource: locates the
	 * <code>scripts/rasp_parse.sh</code> script under the RASP home and
	 * creates the DOM builder used to read the XML output of the parser.
	 * 
	 * @return the value returned by <code>super.init()</code>
	 * @throws ResourceInstantiationException
	 *             if the RASP home is not set, if the parser script does not
	 *             exist or if no DOM builder can be created
	 */
	public Resource init() throws ResourceInstantiationException {

		URL home = getRaspHome();
		if (home == null) {
			throw new ResourceInstantiationException(new Exception(
					"location of rasp not set"));
		}

		// URL.getFile() keeps URL escapes such as %20, so an installation
		// directory containing e.g. spaces would never be found. Going
		// through a URI gives the decoded path; the raw value is used as a
		// fallback when the URL is not a well-formed file: URI.
		File homeDir;
		try {
			homeDir = new File(home.toURI());
		} catch (java.net.URISyntaxException e) {
			homeDir = new File(home.getFile());
		} catch (IllegalArgumentException e) {
			homeDir = new File(home.getFile());
		}

		File scriptfile = new File(new File(homeDir, "scripts"),
				"rasp_parse.sh");
		parserScript = scriptfile.getPath();

		// check that the file exists
		if (!scriptfile.exists()) {
			throw new ResourceInstantiationException(new Exception("Script "
					+ scriptfile.getAbsolutePath() + " does not exist"));
		}

		DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
		try {
			builder = factory.newDocumentBuilder();
		} catch (ParserConfigurationException e) {
			throw new ResourceInstantiationException(e);
		}

		return super.init();
	}

	/**
	 * Runs the RASP parser over the current document. The Sentence and
	 * WordForm annotations are read from the input annotation set and the
	 * output of the parser is handed over together with the output annotation
	 * set. Documents without any WordForm annotation are skipped.
	 * 
	 * @throws ExecutionException
	 *             if the output format is not one of the allowed values or if
	 *             the input for the parser cannot be generated
	 */
	public void execute() throws ExecutionException {
		AnnotationSet inputAS = (inputASName == null || inputASName.equals("")) ? document
				.getAnnotations() : document.getAnnotations(inputASName);

		AnnotationSet outputAS = (outputASName == null || outputASName.equals("")) ? document
				.getAnnotations() : document.getAnnotations(outputASName);

		// check that the output format is in the list; done first so that a
		// configuration problem is reported whatever the document contains
		String format = getOutputFormat();
		if (!java.util.Arrays.asList(outputparamvalues).contains(format)) {
			throw new ExecutionException("Value of output param not allowed ["
					+ format + "]");
		}

		// without word forms there is nothing to send to the parser: really
		// skip the document instead of only announcing it
		AnnotationSet wordForms = inputAS.get("WordForm");
		if (wordForms == null || wordForms.isEmpty()) {
			System.err.println("RASP Parser needs annotations of type WordForm - skipping document");
			return;
		}

		// make sure the encoding can be used, otherwise fall back to the
		// default one
		boolean charsetUsable;
		try {
			charsetUsable = charset != null && Charset.isSupported(charset);
		} catch (IllegalArgumentException e) {
			// the name itself is not a legal charset name
			charsetUsable = false;
		}
		if (!charsetUsable) {
			System.err.println("RASP Parser: charset [" + charset
					+ "] is not supported - using ISO-8859-1");
			charset = "ISO-8859-1";
		}

		// do we call the parser for each sentence
		// or the whole document
		if (sentenceBatch.intValue() == 1) {
			processBySentence(inputAS, outputAS);
		} else {
			processAll(inputAS, outputAS);
		}
	}

	/**
	 * Sends all the sentences of the document to the parser in a single call.
	 * 
	 * @param input
	 *            annotation set holding the Sentence and WordForm annotations
	 * @param output
	 *            annotation set given to the analyser of the parser output
	 * @throws ExecutionException
	 *             if the input file for the parser cannot be generated
	 */
	public void processAll(AnnotationSet input, AnnotationSet output) throws ExecutionException {
		File tempForms = null;
		try {
			try {
				tempForms = java.io.File.createTempFile("rasp", ".data");
				this.generateInputForParser(input, tempForms, -1);
			} catch (Exception e) {
				throw new ExecutionException(e);
			}
			// the next step consists in calling the parser script
			// and handing its output to the XML output analyser
			ParserXMLoutputAnalyser parser = new ParserXMLoutputAnalyser(input,
					output);
			callExternalCommand(parser, tempForms, false);
		} finally {
			// the temporary file is only kept in debug mode, and must not be
			// left behind when the generation of the input failed
			if (!debug && tempForms != null)
				tempForms.delete();
		}
	}

	/**
	 * Calls the parser once per Sentence annotation, each call receiving a
	 * temporary file which contains the word forms of a single sentence.
	 * 
	 * @param input
	 *            annotation set holding the Sentence and WordForm annotations
	 * @param output
	 *            annotation set given to the analyser of the parser output
	 * @throws ExecutionException
	 *             if the input file for a sentence cannot be generated
	 */
	public void processBySentence(AnnotationSet input, AnnotationSet output)
			throws ExecutionException {
		ParserXMLoutputAnalyser parser = new ParserXMLoutputAnalyser(input,
				output);

		// only the number of sentences is needed here: generateInputForParser
		// orders the sentences itself and selects the one with the rank given
		AnnotationSet sentences = input.get("Sentence");
		int sentenceCount = (sentences == null) ? 0 : sentences.size();

		for (int sentNum = 1; sentNum <= sentenceCount; sentNum++) {
			File tempForms = null;
			try {
				try {
					tempForms = java.io.File.createTempFile("rasp", ".data_"
							+ sentNum);
					this.generateInputForParser(input, tempForms, sentNum);
				} catch (Exception e) {
					throw new ExecutionException(e);
				}
				// the next step consists in calling the parser script
				// and handing its output to the XML output analyser
				callExternalCommand(parser, tempForms, true);
			} finally {
				// the temporary file is only kept in debug mode, and must not
				// be left behind when the generation of the input failed
				if (!debug && tempForms != null)
					tempForms.delete();
			}
		}
	}

	/**
	 * Writes the input file for the parser. For every selected sentence the
	 * line <code>^ ^_^:1</code> is written, followed by one line per distinct
	 * text span covered by word forms. Such a line starts with the text of the
	 * span as found in the document, followed for every WordForm annotation
	 * located at that span by
	 * <code> &lt;w id="ID"&gt;lemma+suffix_pos&lt;/w&gt;:probability</code>
	 * where the values come from the features "lemma", "suffix", "pos" and
	 * "probability" of the annotation and ID is the id of the annotation.
	 * 
	 * @param inputAS
	 *            annotation set holding the Sentence and WordForm annotations
	 * @param outputFile
	 *            file to write, encoded with the configured charset
	 * @param sentenceNumber
	 *            rank (starting from 1, in offset order) of the only sentence
	 *            to generate; -1 to generate them all
	 * @return the WordForm annotations written to the file, in output order
	 * @throws IOException
	 *             if the file cannot be written
	 * @throws InvalidOffsetException
	 *             if the text of a word form cannot be read from the document
	 */
	private List<Annotation> generateInputForParser(AnnotationSet inputAS,
			File outputFile, int sentenceNumber) throws IOException,
			InvalidOffsetException {
		// for each sentence dump the information about the WordForms for each
		// token
		List<Annotation> sents = new ArrayList<Annotation>(inputAS
				.get("Sentence"));
		java.util.Collections.sort(sents, new OffsetComparator());

		AnnotationSet wfs = inputAS.get("WordForm");
		List<Annotation> rewordforms = new ArrayList<Annotation>(wfs.size());

		OutputStream fout = new FileOutputStream(outputFile);
		BufferedWriter writer = null;
		try {
			writer = new BufferedWriter(new OutputStreamWriter(fout, charset));

			int sentNum = 0;
			for (Annotation sentence : sents) {
				sentNum++;
				// check that this is the one we want
				if (sentenceNumber != -1) {
					if (sentNum < sentenceNumber) {
						continue;
					}
					if (sentNum > sentenceNumber) {
						break;
					}
				}
				writer.append("^ ^_^:1\n");

				// get the word forms located under that sentence
				List<Annotation> wordForms = new ArrayList<Annotation>(wfs
						.getContained(sentence.getStartNode().getOffset(),
								sentence.getEndNode().getOffset()));
				java.util.Collections.sort(wordForms, new OffsetComparator());

				// create a single entry for word forms located at the same
				// position
				Long previousStart = null;
				Long previousEnd = null;
				boolean isFirst = true;

				for (Annotation a : wordForms) {
					FeatureMap fm = a.getFeatures();
					String pos = (String) fm.get("pos");
					Double prob = (Double) fm.get("probability");
					String lemma = (String) fm.get("lemma");
					String suffix = (String) fm.get("suffix");

					Long start = a.getStartNode().getOffset();
					Long end = a.getEndNode().getOffset();

					// do we have a new entity? The offsets are boxed values
					// and must be compared with equals(): == only compares
					// the references
					if (!start.equals(previousStart)
							|| !end.equals(previousEnd)) {
						// finish the line
						if (!isFirst) {
							writer.newLine();
						}
						isFirst = false;
						// dump the form as found in the text
						String formToken = getDocument().getContent()
								.getContent(start, end).toString();
						writer.append(formToken);
					}
					// add the rest anyway
					writer.append(" ");
					writer.append("<w id=\"" + a.getId()).append("\">");
					writer.append(lemma);
					writer.append(suffix).append("_").append(pos);
					writer.append("</w>");
					writer.append(":" + Double.toString(prob));

					rewordforms.add(a);
					previousStart = start;
					previousEnd = end;
				}
				writer.newLine();
			}
			writer.flush();
		} finally {
			// release the file whatever happened; closing the writer also
			// closes the underlying stream
			if (writer != null) {
				writer.close();
			} else {
				fout.close();
			}
		}
		return rewordforms;
	}

	/**
	 * Runs the parser script on the content of a file and hands the output of
	 * the script to the analyser. The content of the file is written to the
	 * standard input of the process and its standard output is read, each in
	 * its own thread. Problems are reported on System.err and do not stop the
	 * processing: the input file is then copied to a file with the same name
	 * followed by ".error" so that the problem can be traced.
	 * 
	 * @param parser
	 *            analyser receiving the output of the parser
	 * @param inputFile
	 *            file generated for the parser
	 * @param perSentence
	 *            true if the file contains a single sentence, in which case
	 *            the raw output is given to
	 *            <code>parseRASPOutputSingleSentence</code>; otherwise the
	 *            output is parsed as one XML document and given to
	 *            <code>parseRASPOutput</code>
	 */
	private void callExternalCommand(ParserXMLoutputAnalyser parser,
			File inputFile, boolean perSentence) {
		File RASPexec = new File(this.parserScript);

		// build the command line: executable, optional switches with their
		// values, output format, optional flags and finally -y. Using a list
		// guarantees that no slot is left empty or overwritten whatever the
		// combination of options.
		List<String> command = new ArrayList<String>();
		command.add(RASPexec.getAbsolutePath());

		if (this.time != null) {
			command.add("-t");
			command.add(this.time.toString());
		}

		if (this.parseNum != null) {
			command.add("-n");
			command.add(this.parseNum.toString());
		}

		command.add(outputFormat);

		// subcategorisation; an unset parameter is treated as false
		if (Boolean.TRUE.equals(subcategorisation))
			command.add("-s");

		// no phrasal verbs? an unset parameter adds nothing
		if (Boolean.FALSE.equals(phrasalVerbs))
			command.add("-x");

		// specify XML format
		command.add("-y");

		String[] cmdline = command.toArray(new String[command.size()]);

		// run the parser and convert output into annotations
		Process process = null;
		FileInputStream tempin = null;
		Thread sw = null;
		Thread srt = null;
		try {
			process = Runtime.getRuntime().exec(cmdline);
			// pass the content of the file to the buffer of the process
			// in a different Thread
			tempin = new FileInputStream(inputFile);
			sw = new Thread(new StreamWriter(tempin, process.getOutputStream()));

			StreamReader sr = new StreamReader(process.getInputStream(),
					this.charset);
			sw.start();
			// read the information returned by the application
			// in a different Thread
			srt = new Thread(sr);
			srt.start();
			// wait for the process to finish
			process.waitFor();
			// need to wait for the reader to get everything else
			sw.join();
			srt.join();
			// dump the content of the buffer into a file (for debug)
			if (debug)
				dumpToFile(sr.getBuffer());

			// get the buffer from the StreamReader
			String xmloutput = sr.getBuffer().toString();

			// hand over the information contained in the XML
			if (!perSentence) {
				Document domDoc = this.builder.parse(new InputSource(
						new StringReader(xmloutput)));
				parser.parseRASPOutput(domDoc);
			} else {
				parser.parseRASPOutputSingleSentence(xmloutput, builder);
			}
		} catch (Exception err) {
			// keep the interruption visible to the caller of this thread
			if (err instanceof InterruptedException) {
				Thread.currentThread().interrupt();
			}
			// Just log the exception message and continue with the annotation
			// of the documents
			String message = "Exception thrown in ParserAnnotator: "
					+ err.getMessage();
			System.err.println(message);
			// copy the input file under a different name so that
			// we can trace the problem
			File errorFile = new File(inputFile.getAbsolutePath() + ".error");
			try {
				errorFile.createNewFile();
				Utilities.copyFile(inputFile, errorFile);
			} catch (IOException e) {
				// best effort only: the copy is a debugging aid
				System.err.println("Could not create "
						+ errorFile.getAbsolutePath() + ": " + e.getMessage());
			}

		} finally {
			// destroy the process; it does not exist if it failed to start
			if (process != null) {
				process.destroy();
			}
			try {
				if (sw != null)
					sw.join();
				if (srt != null)
					srt.join();
			} catch (InterruptedException e) {
				Thread.currentThread().interrupt();
			}
			// release the input file
			if (tempin != null) {
				try {
					tempin.close();
				} catch (IOException e) {
					// nothing more can be done about it
				}
			}
		}
	}

	/**
	 * Writes the content of a buffer into a new temporary file named
	 * rasp*.parse, using the configured charset. Used in debug mode to keep
	 * the raw output of the parser. Problems are only reported via their
	 * stack trace.
	 * 
	 * @param buffer
	 *            content to dump
	 */
	private final void dumpToFile(StringBuffer buffer) {
		OutputStream fout = null;
		try {
			File tempForms = java.io.File.createTempFile("rasp", ".parse");

			fout = new FileOutputStream(tempForms);
			// same encoding as the one used to decode the output of the
			// parser, instead of the default encoding of the platform
			BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
					fout, charset));
			writer.write(buffer.toString());
			writer.close();

		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// closes the file if something went wrong before the writer was
			// closed; closing an already closed stream has no effect
			if (fout != null) {
				try {
					fout.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * @return the name of the encoding used to write the input of the parser
	 *         and to decode its output
	 */
	public String getCharset() {
		return charset;
	}

	/**
	 * @param charset
	 *            name of the encoding used to write the input of the parser
	 *            and to decode its output
	 */
	public void setCharset(String charset) {
		this.charset = charset;
	}

	/**
	 * @return the name of the annotation set read by the parser; null or
	 *         empty stands for the default annotation set
	 */
	public String getInputASName() {
		return inputASName;
	}

	/**
	 * @param inputASName
	 *            name of the annotation set read by the parser; null or empty
	 *            stands for the default annotation set
	 */
	public void setInputASName(String inputASName) {
		this.inputASName = inputASName;
	}

	/**
	 * @return the output format switch passed to the parser script
	 */
	public String getOutputFormat() {
		return outputFormat;
	}

	/**
	 * @param outputFormat
	 *            output format switch passed to the parser script; execute()
	 *            rejects values which are not listed in outputparamvalues
	 */
	public void setOutputFormat(String outputFormat) {
		this.outputFormat = outputFormat;
	}

	/**
	 * @return the value passed to the parser script with the -n switch, or
	 *         null if the switch is not used
	 */
	public Integer getParseNum() {
		return parseNum;
	}

	/**
	 * @param parseNum
	 *            value passed to the parser script with the -n switch; null
	 *            to leave the switch out
	 */
	public void setParseNum(Integer parseNum) {
		this.parseNum = parseNum;
	}

	/**
	 * @return false if the -x switch is passed to the parser script
	 */
	public Boolean getPhrasalVerbs() {
		return phrasalVerbs;
	}

	/**
	 * @param phrasalVerbs
	 *            false to pass the -x switch to the parser script
	 */
	public void setPhrasalVerbs(Boolean phrasalVerbs) {
		this.phrasalVerbs = phrasalVerbs;
	}

	/**
	 * @return the location of the RASP installation
	 */
	public URL getRaspHome() {
		return raspHome;
	}

	/**
	 * @param raspHome
	 *            location of the RASP installation, which is expected to
	 *            contain scripts/rasp_parse.sh; required by init()
	 */
	public void setRaspHome(URL raspHome) {
		this.raspHome = raspHome;
	}

	/**
	 * @return true if the -s switch is passed to the parser script
	 */
	public Boolean getSubcategorisation() {
		return subcategorisation;
	}

	/**
	 * @param subcategorisation
	 *            true to pass the -s switch to the parser script
	 */
	public void setSubcategorisation(Boolean subcategorisation) {
		this.subcategorisation = subcategorisation;
	}

	/**
	 * @return the value passed to the parser script with the -t switch, or
	 *         null if the switch is not used
	 */
	public Integer getTime() {
		return time;
	}

	/**
	 * @param time
	 *            value passed to the parser script with the -t switch; null
	 *            to leave the switch out
	 */
	public void setTime(Integer time) {
		this.time = time;
	}

	/**
	 * Activates or deactivates the debug mode, in which the temporary input
	 * files are kept and the raw output of the parser is dumped into a
	 * temporary file.
	 * 
	 * @param debug
	 *            true to activate the debug mode; null is treated as false
	 *            instead of failing when the value is unboxed
	 */
	public void setDebug(Boolean debug) {
		this.debug = Boolean.TRUE.equals(debug);
	}

	/**
	 * @return true if the debug mode is active, i.e. the temporary input
	 *         files are kept and the raw output of the parser is dumped into
	 *         a temporary file
	 */
	public Boolean getDebug() {
		return debug;
	}

	/**
	 * @return the name of the annotation set given to the analyser of the
	 *         parser output; null or empty stands for the default annotation
	 *         set
	 */
	public String getOutputASName() {
		return outputASName;
	}

	/**
	 * @param outputASName
	 *            name of the annotation set given to the analyser of the
	 *            parser output; null or empty stands for the default
	 *            annotation set
	 */
	public void setOutputASName(String outputASName) {
		this.outputASName = outputASName;
	}

}