package com.digitalpebble.rasp2.token;
import gate.Annotation;
import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceData;
import gate.creole.ResourceInstantiationException;
import gate.util.Files;
import gate.util.InvalidOffsetException;
import gate.util.OffsetComparator;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import com.digitalpebble.util.StreamReader;
import com.digitalpebble.util.StreamWriter;
/*******************************************************************************
* Simple tokenizer which is partially compatible with the default GATE one.
* Generates Tokens which have a single feature 'string'. Should be faster than
* the default GATE component.
******************************************************************************/
public class RASPTokenizer extends AbstractLanguageAnalyser {

    /** Name of the annotation set read for Sentence annotations (null/"" = default set). */
    private String inputASName;

    /** Name of the annotation set receiving Token annotations (null/"" = default set). */
    private String outputASName;

    /** Character encoding used when exchanging text with the external tokeniser. */
    private String charset = "ISO-8859-1";

    /** When true, temporary files are kept and the raw tokeniser output is dumped. */
    private boolean debug = false;

    /** Absolute path to the platform-specific tokeniser executable, set in init(). */
    private String tokenExecutable;

    /**
     * Locates the platform-specific tokeniser executable shipped in the
     * plugin's resources directory and verifies that it exists.
     *
     * @return this resource, fully initialised
     * @throws ResourceInstantiationException if the plugin was not loaded from
     *             a file: URL or the executable cannot be found
     */
    public Resource init() throws ResourceInstantiationException {
        // The executable lives next to the creole.xml that declared this
        // resource, so the plugin must have been loaded from a local file.
        ResourceData thisRD = (ResourceData) Gate.getCreoleRegister().get(
                this.getClass().getName());
        URL myCreoleXML = thisRD.getXmlFileUrl();
        if (!"file".equals(myCreoleXML.getProtocol())) {
            throw new ResourceInstantiationException(
                    "Tokenizer plugin must be loaded from a file: URL");
        }
        File myCreoleXMLFile = Files.fileFromURL(myCreoleXML);
        tokenExecutable = myCreoleXMLFile.getParent() + File.separator
                + "resources" + File.separator + "tokenise" + File.separator
                + "token." + com.digitalpebble.util.Utilities.getArch();
        // fail early if the binary for this architecture is missing
        File scriptfile = new File(tokenExecutable);
        if (!scriptfile.exists())
            throw new ResourceInstantiationException(new Exception(
                    "Executable " + scriptfile.getAbsolutePath()
                            + " does not exist"));
        return super.init();
    }

    /**
     * Runs the external tokeniser over every Sentence annotation of the input
     * set and creates Token annotations (features "string" and "length") in
     * the output set.
     *
     * @throws ExecutionException if the temporary file cannot be created, the
     *             external process fails, or its output cannot be aligned
     *             with the document text
     */
    public void execute() throws ExecutionException {
        AnnotationSet inputAS = (inputASName == null || inputASName.equals(""))
                ? document.getAnnotations()
                : document.getAnnotations(inputASName);
        AnnotationSet outputAS = (outputASName == null || outputASName
                .equals("")) ? document.getAnnotations() : document
                .getAnnotations(outputASName);
        // only keep the configured charset when this JVM actually supports it
        if (getCharset() != null && Charset.isSupported(getCharset()))
            charset = getCharset();
        File exec = new File(tokenExecutable);
        String[] cmdline = new String[] { exec.toString() };
        // run the tokeniser and convert its output into annotations
        Process process = null;
        Thread sw = null;
        Thread srt = null;
        File tempSentences;
        try {
            tempSentences = java.io.File.createTempFile("rasp", ".sent");
        } catch (IOException e) {
            throw new ExecutionException(e);
        }
        try {
            List<Annotation> sentences = new ArrayList<Annotation>(inputAS
                    .get("Sentence"));
            // no sentences? skip the processing (and clean up the temp file)
            if (sentences.isEmpty()) {
                System.err.println("RASP Tokenizer needs annotations of type Sentence as input");
                if (!debug)
                    tempSentences.delete();
                return;
            }
            Collections.sort(sentences, new OffsetComparator());
            // generate the input file for the tokeniser: one sentence per
            // line, each prefixed with the "^ " sentence marker
            BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(tempSentences), charset));
            try {
                for (Annotation sentence : sentences) {
                    writer.append("^ ");
                    String sentenceT = getDocument().getContent().getContent(
                            sentence.getStartNode().getOffset(),
                            sentence.getEndNode().getOffset()).toString();
                    // the external process expects one sentence per line
                    sentenceT = sentenceT.replaceAll("\n", " ");
                    writer.append(sentenceT).append("\n");
                }
            } finally {
                // FIX: close even when reading the document content fails,
                // so the file handle is not leaked
                writer.close();
            }
            process = Runtime.getRuntime().exec(cmdline);
            // pass the content of the file to the stdin of the process
            // in a different thread
            FileInputStream tempin = new FileInputStream(tempSentences);
            sw = new Thread(new StreamWriter(tempin, process.getOutputStream()));
            // read the information returned by the application
            // in a different thread
            StreamReader sr = new StreamReader(process.getInputStream(),
                    charset);
            sw.start();
            srt = new Thread(sr);
            srt.start();
            // wait for the process to finish
            process.waitFor();
            if (debug)
                dumpToFile(sr.getBuffer());
            // need to wait for the reader to get everything else
            sw.join();
            srt.join();
            // get the buffer from the StreamReader
            processTokens(outputAS, sr.getBuffer().toString());
            if (!debug)
                tempSentences.delete();
        } catch (Exception err) {
            System.out.println("Problem when calling the executable");
            throw new ExecutionException(err);
        } finally {
            // FIX: null-guard everything here — previously any failure (or
            // the "no sentences" early return) occurring before the process
            // and threads were created caused a NullPointerException in this
            // finally block, masking the real outcome
            if (process != null)
                process.destroy();
            try {
                if (sw != null)
                    sw.join();
                if (srt != null)
                    srt.join();
            } catch (InterruptedException e) {
                // restore the interrupt status instead of swallowing it
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * Converts the raw tokeniser output into Token annotations. Each output
     * line corresponds to one sentence; tokens are separated by a single
     * space and the first field of each line is the sentence marker.
     *
     * @param out annotation set receiving the Token annotations
     * @param buffer raw output of the tokeniser process
     * @throws ExecutionException if a token cannot be located in the document
     *             text or an annotation offset is invalid
     */
    private void processTokens(AnnotationSet out, String buffer)
            throws ExecutionException {
        String originalText = getDocument().getContent().toString();
        String[] lines = buffer.split("\n");
        // position in the document just after the last matched token;
        // searching from here keeps repeated strings aligned with the
        // correct occurrence
        int lastPos = 0;
        for (int i = 0; i < lines.length; i++) {
            String[] splits = lines[i].split(" ");
            // start at 1 to skip the sentence marker
            for (int s = 1; s < splits.length; s++) {
                String target = splits[s];
                // skip empty strings produced by consecutive separators
                if (target.length() == 0)
                    continue;
                int start = originalText.indexOf(target, lastPos);
                // FIX: test for a failed match *before* deriving offsets
                // from it (previously lastPos was corrupted first)
                if (start == -1) {
                    throw new ExecutionException(new Exception(
                            "Can't match token " + target));
                }
                int end = start + target.length();
                lastPos = end;
                // create the new annotation
                FeatureMap features = Factory.newFeatureMap();
                features.put("string", originalText.substring(start, end));
                // FIX: record the token's own length — the original stored
                // the length of the whole document on every token
                features.put("length", Integer.valueOf(end - start));
                try {
                    out.add(Long.valueOf(start), Long.valueOf(end), "Token",
                            features);
                } catch (InvalidOffsetException e) {
                    throw new ExecutionException(e);
                }
            }
        }
    }

    /**
     * Debugging helper: writes the raw tokeniser output to a temporary file
     * (rasp*.tokenoutput). I/O problems are reported but not propagated.
     */
    private final void dumpToFile(StringBuffer buffer) {
        try {
            File tempForms = java.io.File.createTempFile("rasp",
                    ".tokenoutput");
            BufferedWriter writer = new BufferedWriter(
                    new FileWriter(tempForms));
            try {
                writer.write(buffer.toString());
            } finally {
                // FIX: always release the file handle
                writer.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** @return the character encoding used with the external tokeniser */
    public String getCharset() {
        return charset;
    }

    /** @param charset character encoding to use with the external tokeniser */
    public void setCharset(String charset) {
        this.charset = charset;
    }

    /** @return whether debug mode (keep temp files, dump raw output) is on */
    public Boolean getDebug() {
        return debug;
    }

    /** @param debug enables keeping temp files and dumping raw output */
    public void setDebug(Boolean debug) {
        this.debug = debug;
    }

    /** @return name of the annotation set read for Sentence annotations */
    public String getInputASName() {
        return inputASName;
    }

    /** @param inputASName annotation set to read Sentence annotations from */
    public void setInputASName(String inputASName) {
        this.inputASName = inputASName;
    }

    /** @return name of the annotation set receiving Token annotations */
    public String getOutputASName() {
        return outputASName;
    }

    /** @param outputASName annotation set to write Token annotations into */
    public void setOutputASName(String outputASName) {
        this.outputASName = outputASName;
    }
}