GATE.ac.uk - gate/plugins/Parser_RASP/src/com/digitalpebble/rasp2/token/RASPTokenizer.java

package com.digitalpebble.rasp2.token;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceData;
import gate.creole.ResourceInstantiationException;
import gate.util.Files;
import gate.util.InvalidOffsetException;
import gate.util.OffsetComparator;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

import com.digitalpebble.util.StreamReader;
import com.digitalpebble.util.StreamWriter;

/*******************************************************************************
 * GATE language analyser that tokenises a document by piping its sentences
 * through the external RASP tokeniser executable shipped in the plugin's
 * <code>resources/tokenise</code> directory.
 * <p>
 * Input: annotations of type <code>Sentence</code> from the input annotation
 * set. Output: annotations of type <code>Token</code> in the output annotation
 * set, each carrying the features <code>string</code> (the token text) and
 * <code>length</code> (the number of characters in the token, as an Integer).
 ******************************************************************************/
public class RASPTokenizer extends AbstractLanguageAnalyser {

	/** Charset used when the configured one is missing or not supported. */
	private static final String DEFAULT_CHARSET = "ISO-8859-1";

	/** Name of the annotation set holding the Sentence annotations (null/empty = default set). */
	private String inputASName;

	/** Name of the annotation set receiving the Token annotations (null/empty = default set). */
	private String outputASName;

	/** Charset used to write the tokeniser input and to decode its output. */
	private String charset = DEFAULT_CHARSET;

	/** When true, intermediate files are kept and the raw tokeniser output is dumped to a temp file. */
	private boolean debug = false;

	/** Absolute path of the platform-specific tokeniser executable, resolved in {@link #init()}. */
	private String tokenExecutable;

	/**
	 * Locates the tokeniser executable relative to the plugin's creole.xml and
	 * checks that it exists.
	 *
	 * @return the initialised resource, as returned by the superclass
	 * @throws ResourceInstantiationException
	 *             if the plugin is not registered, was not loaded from a
	 *             file: URL, or the executable is missing
	 */
	public Resource init() throws ResourceInstantiationException {
		ResourceData thisRD = (ResourceData) Gate.getCreoleRegister().get(
				this.getClass().getName());
		if (thisRD == null) {
			throw new ResourceInstantiationException(
					"No CREOLE registration found for "
							+ this.getClass().getName());
		}
		URL myCreoleXML = thisRD.getXmlFileUrl();
		// the executable has to live on the local file system to be runnable
		if (myCreoleXML == null || !"file".equals(myCreoleXML.getProtocol())) {
			throw new ResourceInstantiationException(
					"Tokenizer plugin must be loaded from a file: URL");
		}
		File myCreoleXMLFile = Files.fileFromURL(myCreoleXML);

		// <plugin dir>/resources/tokenise/token.<arch>
		tokenExecutable = myCreoleXMLFile.getParent() + File.separator
				+ "resources" + File.separator + "tokenise" + File.separator
				+ "token." + com.digitalpebble.util.Utilities.getArch();

		// check that the file exists
		File scriptfile = new File(tokenExecutable);
		if (!scriptfile.exists()) {
			throw new ResourceInstantiationException(new Exception(
					"Executable " + scriptfile.getAbsolutePath()
							+ " does not exist"));
		}
		return super.init();
	}

	/**
	 * Writes every Sentence annotation (in offset order) to a temporary file,
	 * feeds that file to the tokeniser executable and converts the returned
	 * text into Token annotations.
	 * <p>
	 * If the input set contains no Sentence annotation a message is printed on
	 * System.err and nothing else happens.
	 *
	 * @throws ExecutionException
	 *             if the temporary file cannot be written, the executable
	 *             fails to run, the thread is interrupted, or a returned token
	 *             cannot be located in the document
	 */
	public void execute() throws ExecutionException {
		AnnotationSet inputAS = (inputASName == null || inputASName.equals("")) ? document
				.getAnnotations()
				: document.getAnnotations(inputASName);

		AnnotationSet outputAS = (outputASName == null || outputASName
				.equals("")) ? document.getAnnotations() : document
				.getAnnotations(outputASName);

		String effectiveCharset = resolveCharset();

		// collect the sentences; tolerate a null result for "no annotations"
		List<Annotation> sentences = new ArrayList<Annotation>();
		AnnotationSet sentenceAS = inputAS.get("Sentence");
		if (sentenceAS != null) {
			sentences.addAll(sentenceAS);
		}

		// no sentences? skip the processing before any file or process is
		// created, so there is nothing to clean up
		if (sentences.isEmpty()) {
			System.err.println("RASP Tokenizer needs annotations of type Sentence as input");
			return;
		}
		Collections.sort(sentences, new OffsetComparator());

		File tempSentences;
		try {
			tempSentences = java.io.File.createTempFile("rasp", ".sent");
		} catch (IOException e) {
			throw new ExecutionException(e);
		}

		// run the tokeniser and convert its output into annotations
		Process process = null;
		Thread sw = null;
		Thread srt = null;
		FileInputStream tempin = null;

		try {
			// generate an input file for the Tokeniser
			writeSentences(sentences, tempSentences, effectiveCharset);

			File exec = new File(tokenExecutable);
			process = Runtime.getRuntime().exec(
					new String[] { exec.toString() });

			// pass the content of the file to the buffer of the process
			// in a different Thread
			tempin = new FileInputStream(tempSentences);
			sw = new Thread(new StreamWriter(tempin, process.getOutputStream()));
			// read the information returned by the application
			// in a different Thread
			StreamReader sr = new StreamReader(process.getInputStream(),
					effectiveCharset);
			srt = new Thread(sr);
			sw.start();
			srt.start();

			// wait for the process to finish
			process.waitFor();

			// need to wait for the reader to get everything else; the buffer
			// must not be used before the reader thread has terminated
			sw.join();
			srt.join();

			if (debug)
				dumpToFile(sr.getBuffer());

			// get the buffer from the StreamReader
			processTokens(outputAS, sr.getBuffer().toString());

		} catch (ExecutionException err) {
			// already the right type (e.g. unmatched token): do not re-wrap
			throw err;
		} catch (InterruptedException err) {
			// keep the interrupt visible to the caller
			Thread.currentThread().interrupt();
			throw new ExecutionException(err);
		} catch (Exception err) {
			System.out.println("Problem when calling the executable");
			throw new ExecutionException(err);
		} finally {
			// destroy the process (it may never have been started)
			if (process != null)
				process.destroy();
			joinQuietly(sw);
			joinQuietly(srt);
			if (tempin != null) {
				try {
					tempin.close();
				} catch (IOException ignored) {
					// nothing useful can be done about a failed close here
				}
			}
			// the input file is only kept for inspection in debug mode
			if (!debug)
				tempSentences.delete();
		}

	}

	/**
	 * Returns the charset to use for talking to the tokeniser: the configured
	 * one when it is supported by this JVM, {@link #DEFAULT_CHARSET} otherwise.
	 */
	private String resolveCharset() {
		String requested = getCharset();
		try {
			if (requested != null && Charset.isSupported(requested))
				return requested;
		} catch (IllegalArgumentException e) {
			// syntactically illegal charset name: fall through to the default
		}
		System.err.println("RASP Tokenizer: unsupported charset '" + requested
				+ "', using " + DEFAULT_CHARSET);
		return DEFAULT_CHARSET;
	}

	/**
	 * Writes one line per sentence to <code>target</code>. Each line starts
	 * with the marker "^ " and newlines inside a sentence are replaced by
	 * spaces so that a sentence never spans several lines.
	 */
	private void writeSentences(List<Annotation> sentences, File target,
			String encoding) throws IOException, InvalidOffsetException {
		OutputStream fout = new FileOutputStream(target);
		try {
			BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
					fout, encoding));
			for (Annotation sentence : sentences) {
				String sentenceT = getDocument().getContent().getContent(
						sentence.getStartNode().getOffset(),
						sentence.getEndNode().getOffset()).toString();
				writer.append("^ ");
				writer.append(sentenceT.replace('\n', ' ')).append("\n");
			}
			// flush explicitly so that write errors are reported to the caller
			writer.flush();
		} finally {
			fout.close();
		}
	}

	/** Waits for <code>worker</code> to end; restores the interrupt flag if interrupted. */
	private static void joinQuietly(Thread worker) {
		if (worker == null)
			return;
		try {
			worker.join();
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
		}
	}

	/**
	 * Converts the tokeniser output into Token annotations.
	 * <p>
	 * Each line corresponds to a sentence and each token is separated by a
	 * space character; the first element of a line is the sentence marker and
	 * is skipped. Tokens are located in the document text by searching forward
	 * from the end of the previously matched token.
	 *
	 * @param out
	 *            annotation set receiving the Token annotations
	 * @param buffer
	 *            raw text returned by the tokeniser
	 * @throws ExecutionException
	 *             if a token cannot be found in the remaining document text or
	 *             the computed offsets are rejected by the annotation set
	 */
	private void processTokens(AnnotationSet out, String buffer)
			throws ExecutionException {
		String originalText = getDocument().getContent().toString();

		String[] lines = buffer.split("\n");
		int lastPos = 0;
		// read output from tokeniser
		for (int i = 0; i < lines.length; i++) {
			String[] splits = lines[i].split(" ");
			// skip the sentence marker
			for (int s = 1; s < splits.length; s++) {
				String target = splits[s];
				// skip empty strings
				if (target.length() == 0)
					continue;
				int start = originalText.indexOf(target, lastPos);
				if (start == -1) {
					throw new ExecutionException(new Exception(
							"Can't match token " + target));
				}
				int end = start + target.length();
				lastPos = end;
				// create new annotation
				FeatureMap features = Factory.newFeatureMap();
				features.put("string", originalText.substring(start, end));
				// length of this token, not of the whole document
				features.put("length", Integer.valueOf(end - start));
				try {
					out.add(Long.valueOf(start), Long.valueOf(end), "Token",
							features);
				} catch (InvalidOffsetException e) {
					throw new ExecutionException(e);
				}
			}
		}
	}

	/**
	 * Debug helper: writes the raw tokeniser output to a new temporary file
	 * (prefix "rasp", suffix ".tokenoutput"). Failures are only reported on
	 * the error stream, they never interrupt the processing.
	 */
	private void dumpToFile(StringBuffer buffer) {
		try {
			File tempForms = java.io.File.createTempFile("rasp",
					".tokenoutput");
			BufferedWriter writer = new BufferedWriter(
					new FileWriter(tempForms));
			try {
				writer.write(buffer.toString());
			} finally {
				writer.close();
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/** @return the name of the charset used to communicate with the tokeniser */
	public String getCharset() {
		return charset;
	}

	/** @param charset name of the charset used to communicate with the tokeniser */
	public void setCharset(String charset) {
		this.charset = charset;
	}

	/** @return whether debug mode is enabled */
	public Boolean getDebug() {
		return debug;
	}

	/** @param debug enables debug mode; null is treated as false */
	public void setDebug(Boolean debug) {
		this.debug = debug != null && debug.booleanValue();
	}

	/** @return the name of the input annotation set, null or empty for the default set */
	public String getInputASName() {
		return inputASName;
	}

	/** @param inputASName name of the input annotation set, null or empty for the default set */
	public void setInputASName(String inputASName) {
		this.inputASName = inputASName;
	}

	/** @return the name of the output annotation set, null or empty for the default set */
	public String getOutputASName() {
		return outputASName;
	}

	/** @param outputASName name of the output annotation set, null or empty for the default set */
	public void setOutputASName(String outputASName) {
		this.outputASName = outputASName;
	}

}