// GATE.ac.uk - releases/gate-7.0-build4195-ALL/plugins/OpenNLP/src/gate/opennlp/OpenNlpSentenceSplit.java

package gate.opennlp;

import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.util.InvalidOffsetException;

import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.zip.GZIPInputStream;

import opennlp.maxent.MaxentModel;
import opennlp.maxent.io.BinaryGISModelReader;
import opennlp.tools.sentdetect.SentenceDetectorME;

import org.apache.log4j.Logger;

/**
 * Wrapper for the OpenNLP sentence splitter.
 * <p>
 * {@link #init()} loads a gzip-compressed binary GIS model from the
 * {@link #getModel() model} URL and builds a {@link SentenceDetectorME} from
 * it. {@link #execute()} runs that detector over the text of the current
 * document and adds one "Sentence" annotation (feature
 * {@code source=openNLP}) per detected sentence to the annotation set named
 * by {@link #getAnnotationSetName()}, or to the default annotation set when
 * that name is null or empty.
 * 
 * @author <A
 *         HREF="mailto:georgiev@ontotext.com">georgi.georgiev@ontotext.com</A>
 *         Created: Thu Dec 11 16:25:59 EET 2008
 */
public class OpenNlpSentenceSplit extends AbstractLanguageAnalyser {

	public static final long serialVersionUID = 1L;

	private static final Logger logger = Logger
			.getLogger(OpenNlpSentenceSplit.class);

	/** Type of the annotations created by {@link #execute()}. */
	private static final String SENTENCE_TYPE = "Sentence";

	/**
	 * Input annotation set name. It is only stored and returned by its
	 * accessors; {@link #execute()} does not read it.
	 */
	String inputASName;

	/** Name of the annotation set the sentences are written to. */
	private String annotationSetName = null;

	/** The OpenNLP detector, created by {@link #init()}. */
	SentenceDetectorME splitter = null;

	/** Location of the gzip-compressed binary GIS model. */
	URL model;

	public String getInputASName() {
		return inputASName;
	}

	public void setInputASName(String inputASName) {
		this.inputASName = inputASName;
	}

	/**
	 * Splits the text of the current document into sentences and annotates
	 * them.
	 * <p>
	 * Every offset returned by the detector closes a sentence that starts at
	 * the previous offset (or at 0 for the first one). Leading and trailing
	 * whitespace is excluded from each annotation. When the detector returns
	 * no offsets at all, a single annotation covering the whole text is
	 * added instead.
	 * 
	 * @throws ExecutionException
	 *             if no document is set, if the splitter has not been
	 *             initialised, or if an annotation cannot be added because
	 *             its offsets are rejected by the annotation set
	 */
	@Override
	public void execute() throws ExecutionException {
		if (document == null) {
			throw new ExecutionException("No document to process!");
		}
		if (splitter == null) {
			throw new ExecutionException(
					"Sentence Splitter is not initialized!");
		}

		AnnotationSet annotations;
		if (annotationSetName != null && annotationSetName.length() > 0) {
			annotations = document.getAnnotations(annotationSetName);
		} else {
			annotations = document.getAnnotations();
		}

		String text = document.getContent().toString();
		int[] spans = splitter.sentPosDetect(text);

		try {
			// NOTE(review): text located after the last offset returned by
			// sentPosDetect is not annotated here - confirm against the
			// OpenNLP version in use whether that can leave a trailing
			// sentence uncovered.
			int prevSpan = 0;
			for (int i = 0; i < spans.length; i++) {
				addTrimmedSentence(annotations, text, prevSpan, spans[i]);
				prevSpan = spans[i];
			}

			if (spans.length == 0) {
				// nothing detected: treat the whole text as one sentence
				annotations.add(Long.valueOf(0),
						Long.valueOf(text.length()), SENTENCE_TYPE,
						newSentenceFeatures());
			}
		} catch (InvalidOffsetException e) {
			// report through the declared exception type instead of
			// printing the stack trace or silently dropping the failure
			throw new ExecutionException(e);
		}
	}

	/**
	 * Adds a sentence annotation for {@code text[from, to)} after shrinking
	 * the range so that it neither starts nor ends with whitespace.
	 */
	private static void addTrimmedSentence(AnnotationSet annotations,
			String text, int from, int to) throws InvalidOffsetException {
		int start = from;
		while (start < to && Character.isWhitespace(text.charAt(start))) {
			start++;
		}

		int end = to;
		while (end > start && Character.isWhitespace(text.charAt(end - 1))) {
			end--;
		}

		annotations.add(Long.valueOf(start), Long.valueOf(end),
				SENTENCE_TYPE, newSentenceFeatures());
	}

	/** Creates the feature map attached to every sentence annotation. */
	private static FeatureMap newSentenceFeatures() {
		FeatureMap fm = Factory.newFeatureMap();
		fm.put("source", "openNLP");
		return fm;
	}

	/**
	 * Counts the whitespace characters that start at index
	 * {@code lastSpan - 1} of the document text, looking no further than
	 * index {@code lastSpan + 50} (exclusive) or the end of the text. Indices
	 * outside the text are clamped, so an out-of-range argument yields 0 or a
	 * count from the start of the text rather than an exception.
	 * 
	 * @param lastSpan
	 *            offset whose preceding character starts the scan
	 * @return number of consecutive whitespace characters found
	 */
	int countSpaces(int lastSpan) {
		String text = document.getContent().toString();

		int from = Math.max(lastSpan - 1, 0);
		// long arithmetic so that lastSpan + 50 cannot overflow
		int to = (int) Math.min((long) lastSpan + 50, (long) text.length());

		int ws = 0;
		for (int i = from; i < to && Character.isWhitespace(text.charAt(i)); i++) {
			ws++;
		}
		return ws;
	}

	/**
	 * Loads the model from the {@link #getModel() model} URL and creates the
	 * sentence detector.
	 * 
	 * @return this resource
	 * @throws ResourceInstantiationException
	 *             if no model URL is set, the model cannot be read, or the
	 *             detector cannot be constructed from it
	 */
	@Override
	public Resource init() throws ResourceInstantiationException {
		if (model == null) {
			throw new ResourceInstantiationException(
					"Sentence Splitter cannot be initialized: no model URL was provided!");
		}

		try {
			MaxentModel maxentModel = getModel(model);
			if (maxentModel == null) {
				// getModel has already logged the underlying I/O failure
				throw new ResourceInstantiationException(
						"Sentence Splitter cannot be initialized: the model could not be read from "
								+ model);
			}
			splitter = new SentenceDetectorME(maxentModel);
		} catch (RuntimeException e) {
			logger.error("Sentence Splitter can not be initialized!", e);
			throw new ResourceInstantiationException(
					"Sentence Splitter cannot be initialized!", e);
		}

		logger.info("Sentence split initialized!");

		return this;
	}

	@Override
	public void reInit() throws ResourceInstantiationException {
		init();
	}

	/**
	 * Reads a gzip-compressed binary GIS model from the given URL. The
	 * stream opened on the URL is always closed before this method returns.
	 * 
	 * @author joro
	 * @param name
	 *            URL of the model file
	 * @return the model, or {@code null} if an {@link IOException} occurs
	 *         while opening or reading it (the failure is logged)
	 */
	public static MaxentModel getModel(URL name) {
		InputStream in = null;
		try {
			in = name.openStream();
			DataInputStream data = new DataInputStream(
					new GZIPInputStream(in));
			// from here on closing the outermost stream releases everything
			in = data;
			return new BinaryGISModelReader(data).getModel();
		} catch (IOException e) {
			logger.error("Could not read the sentence model from " + name, e);
			return null;
		} finally {
			if (in != null) {
				try {
					in.close();
				} catch (IOException ignored) {
					// the model has been read or has already failed;
					// a failure to close adds nothing actionable
				}
			}
		}
	}

	/* getters and setters for the PR */

	public void setAnnotationSetName(String a) {
		annotationSetName = a;
	}

	public String getAnnotationSetName() {
		return annotationSetName;
	}

	public URL getModel() {
		return model;
	}

	public void setModel(URL model) {
		this.model = model;
	}

}