// Page navigation residue from the source browser: Log in | Help | Print
// Home / releases / gate-7.0-build4195-ALL / plugins / OpenNLP / src / gate / opennlp 〉 OpenNlpPOS.java
 
package gate.opennlp;

import gate.Annotation;
import gate.AnnotationSet;
import gate.FeatureMap;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.util.BomStrippingInputStreamReader;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import opennlp.maxent.MaxentModel;
import opennlp.maxent.io.BinaryGISModelReader;
import opennlp.tools.postag.POSDictionary;
import opennlp.tools.postag.POSTaggerME;

import org.apache.log4j.Logger;

/**
 * GATE processing resource wrapping the OpenNLP maximum-entropy
 * part-of-speech tagger.
 * <p>
 * For every {@code Sentence} annotation in the configured annotation set the
 * {@code Token} annotations it spans are collected in offset order, their
 * {@code string} features are passed to the tagger, and the resulting tag is
 * stored on each token as the {@code category} feature.
 *
 * @author <A
 *         HREF="mailto:georgiev@ontotext.com">georgi.georgiev@ontotext.com</A>
 *         Created: Thu Dec 11 16:25:59 EET 2008
 */
public class OpenNlpPOS extends AbstractLanguageAnalyser {

	public static final long serialVersionUID = 1L;

	private static final Logger logger = Logger.getLogger(OpenNlpPOS.class);

	/** Compiled once; used to strip all whitespace from token strings. */
	private static final Pattern WHITESPACE = Pattern.compile("\\s+");

	/** Encoding used for the dictionary when none has been configured. */
	private static final String DEFAULT_DICTIONARY_ENCODING = "UTF-8";

	/**
	 * Input annotation set name parameter. Note that {@link #execute()} does
	 * not read this field; the set it works on is selected by
	 * {@link #setAnnotationSetName(String)}.
	 */
	String inputASName;

	/** @return the value last passed to {@link #setInputASName(String)} */
	public String getInputASName() {
		return inputASName;
	}

	/** @param inputASName the input annotation set name to store */
	public void setInputASName(String inputASName) {
		this.inputASName = inputASName;
	}

	/**
	 * Name of the annotation set holding the Sentence and Token annotations;
	 * {@code null} or empty selects the document's default set.
	 */
	private String annotationSetName = null;
	/** The tagger instance, created by {@link #init()}. */
	POSTaggerME pos = null;
	/** Location of the gzipped binary GIS model. */
	URL model;
	/** Location of the tag dictionary. */
	URL dictionary;
	/** Character encoding used to read the dictionary. */
	private String dictionaryEncoding = DEFAULT_DICTIONARY_ENCODING;

	/**
	 * Tags every sentence of the current document and writes the tag of each
	 * token into its {@code category} feature.
	 * <p>
	 * A sentence on which the tagger fails, or for which it returns a number
	 * of tags different from the number of tokens, is logged and skipped so
	 * that one bad sentence does not abort the whole document.
	 *
	 * @throws ExecutionException
	 *             if no document is set or the tagger was never initialised
	 */
	@Override
	public void execute() throws ExecutionException {
		if (document == null) {
			throw new ExecutionException("No document to process!");
		}
		if (pos == null) {
			throw new ExecutionException(
					"OpenNLP POS tagger has not been initialized!");
		}

		AnnotationSet annotations;
		if (annotationSetName != null && annotationSetName.length() > 0) {
			annotations = document.getAnnotations(annotationSetName);
		} else {
			annotations = document.getAnnotations();
		}

		// Only used as a fallback for tokens lacking a "string" feature.
		String text = document.getContent().toString();

		for (Annotation sentence : inOffsetOrder(annotations.get("Sentence"))) {
			List<Annotation> tokens = inOffsetOrder(annotations.get("Token",
					sentence.getStartNode().getOffset(),
					sentence.getEndNode().getOffset()));
			if (tokens.isEmpty()) {
				// Nothing to tag in this sentence.
				continue;
			}

			String[] words = new String[tokens.size()];
			int i = 0;
			for (Annotation token : tokens) {
				words[i++] = tokenString(token, text);
			}

			String[] postags;
			try {
				postags = pos.tag(words);
			} catch (Exception e) {
				// Deliberately broad: a failure inside the tagger must not
				// break processing of the remaining sentences.
				logger.error("OpenNLP POS tagger failed on sentence: "
						+ describe(words), e);
				continue;
			}

			if (postags == null || postags.length != words.length) {
				logger.error("OpenNLP POS tagger returned "
						+ (postags == null ? "no" : String.valueOf(postags.length))
						+ " tags for " + words.length + " tokens in sentence: "
						+ describe(words));
				continue;
			}

			int j = 0;
			for (Annotation token : tokens) {
				FeatureMap fm = token.getFeatures();
				fm.put("category", postags[j++]);
				// Kept from the original implementation.
				// NOTE(review): probably redundant if getFeatures() returns
				// the live map - confirm before removing.
				token.setFeatures(fm);
			}
		}
	}

	/**
	 * Copies an annotation set into a list sorted with
	 * {@code gate.util.OffsetComparator}.
	 *
	 * @param set
	 *            the annotations to order; {@code null} is treated as empty
	 * @return a new, sorted, possibly empty list
	 */
	private static List<Annotation> inOffsetOrder(AnnotationSet set) {
		if (set == null) {
			return new ArrayList<Annotation>();
		}
		List<Annotation> ordered = new ArrayList<Annotation>(set);
		Collections.sort(ordered, new gate.util.OffsetComparator());
		return ordered;
	}

	/**
	 * Returns the text handed to the tagger for one token: its {@code string}
	 * feature with all whitespace removed and then trimmed.
	 * <p>
	 * When the feature is missing, the text covered by the token's offsets is
	 * taken from the document content instead of failing with a
	 * NullPointerException.
	 *
	 * @param token
	 *            the token annotation
	 * @param text
	 *            the full document text; assumed to be indexable by the
	 *            annotation offsets (TODO confirm for non-textual content)
	 * @return the cleaned token string, possibly empty
	 */
	private static String tokenString(Annotation token, String text) {
		Object value = token.getFeatures().get("string");
		String raw;
		if (value != null) {
			raw = value.toString();
		} else {
			int start = token.getStartNode().getOffset().intValue();
			int end = token.getEndNode().getOffset().intValue();
			raw = text.substring(start, end);
		}
		return WHITESPACE.matcher(raw).replaceAll("").trim();
	}

	/**
	 * Renders a token array for diagnostics, each token followed by
	 * {@code @@@}. Built only when a problem is actually reported.
	 */
	private static String describe(String[] words) {
		StringBuilder buf = new StringBuilder();
		for (String word : words) {
			buf.append(word).append("@@@");
		}
		return buf.toString();
	}

	/**
	 * Loads the model and the tag dictionary and creates the tagger.
	 * <p>
	 * The dictionary stream is always closed before this method returns.
	 *
	 * @return this resource
	 * @throws ResourceInstantiationException
	 *             if the model or dictionary URL is missing, the model cannot
	 *             be loaded, or the dictionary cannot be read
	 */
	@Override
	public Resource init() throws ResourceInstantiationException {
		if (model == null) {
			throw new ResourceInstantiationException(
					"OpenNLP POS can not be initialized: no model URL provided!");
		}
		if (dictionary == null) {
			throw new ResourceInstantiationException(
					"OpenNLP POS can not be initialized: no dictionary URL provided!");
		}

		// getModel() signals failure by returning null; fail here with a
		// clear message rather than handing null to the tagger.
		MaxentModel maxentModel = getModel(model);
		if (maxentModel == null) {
			throw new ResourceInstantiationException(
					"OpenNLP POS can not be initialized: unable to load model from "
							+ model);
		}

		String encoding = dictionaryEncoding;
		if (encoding == null || encoding.length() == 0) {
			encoding = DEFAULT_DICTIONARY_ENCODING;
		}

		InputStream dictionaryStream = null;
		BufferedReader dictionaryReader = null;
		try {
			dictionaryStream = dictionary.openStream();
			dictionaryReader = new BomStrippingInputStreamReader(
					dictionaryStream, encoding);
			// NOTE(review): closing the reader below assumes POSDictionary
			// consumes it fully inside its constructor - confirm against the
			// bundled OpenNLP version.
			pos = new POSTaggerME(maxentModel, new POSDictionary(
					dictionaryReader, true));
		} catch (IOException e) {
			logger.error("OpenNLP POS can not be initialized!", e);
			throw new ResourceInstantiationException(
					"OpenNLP POS can not be initialized!", e);
		} finally {
			// Closing the reader closes the underlying stream; fall back to
			// the raw stream if the reader was never created.
			if (dictionaryReader != null) {
				closeQuietly(dictionaryReader);
			} else {
				closeQuietly(dictionaryStream);
			}
		}
		logger.info("OpenNLP POS initialized!");
		return this;
	}

	/**
	 * Re-initialises the resource by running {@link #init()} again.
	 *
	 * @throws ResourceInstantiationException
	 *             if {@link #init()} fails
	 */
	@Override
	public void reInit() throws ResourceInstantiationException {
		init();
	}

	/**
	 * Reads a gzipped binary GIS model from the given location. The stream is
	 * closed before returning.
	 *
	 * @author georgiev
	 * @param name
	 *            URL of the gzipped binary model
	 * @return the loaded model, or {@code null} if it could not be read
	 */
	public static MaxentModel getModel(URL name) {
		InputStream raw = null;
		DataInputStream in = null;
		try {
			raw = name.openStream();
			in = new DataInputStream(new GZIPInputStream(raw));
			return new BinaryGISModelReader(in).getModel();
		} catch (IOException e) {
			logger.error("Unable to load OpenNLP model from " + name, e);
			return null;
		} finally {
			// If wrapping failed (e.g. not gzip data) only the raw stream
			// exists and must still be released.
			if (in != null) {
				closeQuietly(in);
			} else {
				closeQuietly(raw);
			}
		}
	}

	/** Closes a resource, logging rather than propagating any failure. */
	private static void closeQuietly(Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException e) {
			logger.debug("Failed to close stream", e);
		}
	}

	/* getters and setters for the PR */
	/* public members */

	/**
	 * @param a
	 *            name of the annotation set to process; {@code null} or empty
	 *            selects the default set
	 */
	public void setAnnotationSetName(String a) {
		annotationSetName = a;
	}

	/** @return the name of the annotation set to process */
	public String getAnnotationSetName() {
		return annotationSetName;
	}

	/** @return the URL of the model */
	public URL getModel() {
		return model;
	}

	/** @param model URL of the gzipped binary model */
	public void setModel(URL model) {
		this.model = model;
	}

	/** @return the URL of the tag dictionary */
	public URL getDictionary() {
		return dictionary;
	}

	/** @param dictionary URL of the tag dictionary */
	public void setDictionary(URL dictionary) {
		this.dictionary = dictionary;
	}

	/** @param a character encoding used to read the dictionary */
	public void setDictionaryEncoding(String a) {
		dictionaryEncoding = a;
	}

	/** @return the character encoding used to read the dictionary */
	public String getDictionaryEncoding() {
		return dictionaryEncoding;
	}

}