Log in Help
Print
Homereleasesgate-5.1-beta2-build3402-ALLpluginsOpenNLPsrcgateopennlp 〉 OpenNlpChunker.java
 
package gate.opennlp;

import gate.Annotation;
import gate.AnnotationSet;
import gate.FeatureMap;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;

import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.zip.GZIPInputStream;

import org.apache.log4j.Logger;

import opennlp.maxent.MaxentModel;
import opennlp.maxent.io.BinaryGISModelReader;
import opennlp.tools.chunker.ChunkerME;

/**
 * Wrapper for the opennlp chuncker
 * @author <A HREF="mailto:georgiev@ontotext.com>georgi.georgiev@ontotext.com</A>
 * Created: Thu Dec 11 16:25:59 EET 2008
 */


public @SuppressWarnings("all") class OpenNlpChunker extends AbstractLanguageAnalyser {

	public static final long serialVersionUID = 1L;
	
	private static final Logger logger = Logger.getLogger(OpenNlpChunker.class);

	/**
	 * @author georgi georgiev
	 * @return MaxentModel
	 * @param String
	 *            path to MaxentModel
	 */
	public static MaxentModel getModel(File name) {
		try {
			return new BinaryGISModelReader(new DataInputStream(
					new GZIPInputStream(new FileInputStream(name)))).getModel();
		} catch (IOException E) {
			E.printStackTrace();
			return null;
		}
	}
	// private members
	private String inputASName = null;
	ChunkerME chunker = null;

	private java.net.URL model;

	@SuppressWarnings("unchecked")
	@Override
	public void execute() throws ExecutionException {
		// text doc annotations
		AnnotationSet annotations;
		if (inputASName != null && inputASName.length() > 0) {
			annotations = document.getAnnotations(inputASName);
		} else {
			annotations = document.getAnnotations();
		}

		// getdoc.get text
		//String text = document.getContent().toString();

		// get sentence annotations
		AnnotationSet sentences = annotations.get("Sentence");

		//order them
		List<Annotation> sentList = new LinkedList<Annotation>();

		for (Iterator iterator = sentences.iterator(); iterator.hasNext();) {
			sentList.add( (Annotation) iterator.next());

		}

		java.util.Collections.sort(sentList, new gate.util.OffsetComparator());

		// for each sentence get token annotations
		for (Iterator iterator = sentList.iterator(); iterator.hasNext();) {
			Annotation annotation = (Annotation) iterator.next();

			AnnotationSet sentenceTokens = annotations.get(
					"Token", annotation.getStartNode().getOffset(),
					annotation.getEndNode().getOffset());

			//create a list

			List<Annotation> annList = new LinkedList<Annotation>();

			for (Iterator<Annotation> iterator2 = sentenceTokens.iterator(); iterator2
					.hasNext();) {
				annList.add(iterator2.next());

			}

			//order on offset
			Collections.sort(annList, new gate.util.OffsetComparator());

			// make the array be string[] sentence
			String[] tokens = new String[sentenceTokens.size()];
			String[] postags = new String[sentenceTokens.size()];
			int i = 0;
			for (Iterator iterator3 = annList.iterator(); iterator3
					.hasNext();) {

				Annotation token = (Annotation) iterator3.next();

				tokens[i] = token.getFeatures().get("string").toString();
				postags[i] = token.getFeatures().get("category").toString();

				i++;
			}

			// run pos chunker
			String[] chunks = chunker.chunk(tokens, postags);

			// add tohose chunk tags to token annotations

			int j = 0;

			for (Iterator iterator4 =annList.iterator(); iterator4
					.hasNext();) {
				Annotation token = (Annotation) iterator4.next();

				FeatureMap fm = token.getFeatures();
				fm.put("chunk", chunks[j]);

				token.setFeatures(fm);

				j++;

			}
		}
	}

	public String getInputASName() {
		return inputASName;
	}/* getters and setters for the PR */

	public java.net.URL getModel() {
		return model;
	}


	/* getters and setters for the PR */
	/* public members */

	@Override
	public Resource init() throws ResourceInstantiationException {
//		logger.error("Chunker url is: " + model.getFile());
		try {
			String file = null;
			if (model == null)
				file = "plugins/openNLP/models/english/chunker/EnglishChunk.bin.gz";
			else
				file = model.getFile();
			
		chunker = new ChunkerME(getModel(new File(file)));
				//"/home/joro/work/m_learning/models/Chunker_Genia.bin.gz"));
		}catch (Exception e){
			e.printStackTrace();
			logger.error("Chunker can not be initialized!");
			throw new RuntimeException("Sentence Splitter cannot be initialized!", e);
		}

		return this;

	}

	@Override
	public void reInit() throws ResourceInstantiationException {
		init();
	}

	public void setInputASName(String a) {
		inputASName = a;
	}

	public void setModel(java.net.URL model) {
		this.model = model;
	}

}