// Source listing: Home / releases / gate-5.1-beta2-build3402-ALL / plugins / OpenNLP / src / gate / opennlp / OpenNlpPOS.java
 
package gate.opennlp;

import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import org.apache.log4j.Logger;

import opennlp.maxent.MaxentModel;
import opennlp.maxent.io.BinaryGISModelReader;
import opennlp.tools.postag.POSDictionary;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.util.Span;
import gate.Annotation;
import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.util.InvalidOffsetException;


/**
 * GATE processing resource that wraps the OpenNLP part-of-speech tagger.
 * <p>
 * For every <code>Sentence</code> annotation in the input annotation set, the
 * <code>Token</code> annotations lying inside it are collected in offset
 * order, their text is handed to the tagger, and the tag returned for each
 * token is stored on that token as the <code>category</code> feature.
 * <p>
 * Created: Thu Dec 11 16:25:59 EET 2008
 *
 * @author <A HREF="mailto:georgiev@ontotext.com">georgi.georgiev@ontotext.com</A>
 */
public class OpenNlpPOS extends AbstractLanguageAnalyser {

	public static final long serialVersionUID = 1L;

	private static final Logger logger = Logger.getLogger(OpenNlpPOS.class);

	/** Model path used when the model/dictionary pair is not fully configured. */
	private static final String DEFAULT_MODEL = "plugins/openNLP/models/english/postag/EnglishPOS.bin.gz";

	/** Tag dictionary path used when the model/dictionary pair is not fully configured. */
	private static final String DEFAULT_DICTIONARY = "plugins/openNLP/models/english/postag/tagdict";

	/** Compiled once; applied to every token string before tagging. */
	private static final Pattern WHITESPACE = Pattern.compile("\\s+");

	/**
	 * Name of the annotation set holding the Sentence and Token annotations;
	 * <code>null</code> or empty selects the document's default set.
	 */
	private String inputASName = null;

	/** The tagger, created by {@link #init()}. */
	POSTaggerME pos = null;

	/** Location of the gzipped binary maxent model. */
	URL model;

	/** Location of the tag dictionary. */
	URL dictionary;

	/**
	 * Tags the tokens of every sentence of the current document.
	 * <p>
	 * Sentences and tokens are read from the annotation set selected by
	 * {@link #getInputASName()}. A sentence the tagger fails on is logged and
	 * skipped; the remaining sentences are still processed.
	 *
	 * @throws ExecutionException
	 *             if no document has been set or the tagger has not been
	 *             initialized
	 */
	@Override
	public void execute() throws ExecutionException {
		if (document == null) {
			throw new ExecutionException("No document to process!");
		}
		if (pos == null) {
			throw new ExecutionException("OpenNLP POS tagger is not initialized!");
		}

		AnnotationSet annotations;
		if (inputASName != null && inputASName.length() > 0) {
			annotations = document.getAnnotations(inputASName);
		} else {
			annotations = document.getAnnotations();
		}

		// Only used as a fallback for tokens that carry no "string" feature.
		String text = document.getContent().toString();

		// Read from the selected set, not unconditionally from the default one,
		// so that inputASName is actually honoured.
		List<Annotation> sentList = sortedByOffset(annotations.get("Sentence"));

		for (Annotation sentence : sentList) {
			List<Annotation> tokenList = sortedByOffset(annotations.get("Token",
					sentence.getStartNode().getOffset(),
					sentence.getEndNode().getOffset()));

			// Nothing to tag; do not bother the tagger with an empty sequence.
			if (tokenList.isEmpty()) {
				continue;
			}

			String[] words = tokenStrings(tokenList, text);

			String[] postags;
			try {
				postags = pos.tag(words);
			} catch (Exception e) {
				// Deliberately broad: one bad sentence must not abort the
				// whole document.
				logger.error("There is a problem....\n with this sentence\n"
						+ joinTokens(words), e);
				continue;
			}

			if (postags == null || postags.length != words.length) {
				logger.error("OpenNLP POS returned an unexpected number of tags for sentence: "
						+ joinTokens(words));
				continue;
			}

			int j = 0;
			for (Annotation token : tokenList) {
				FeatureMap fm = token.getFeatures();
				if (fm == null) {
					fm = Factory.newFeatureMap();
				}
				fm.put("category", postags[j]);
				token.setFeatures(fm);
				j++;
			}
		}
	}

	/**
	 * Copies the given annotations into a list ordered with
	 * <code>gate.util.OffsetComparator</code>.
	 *
	 * @param set
	 *            annotations to order; may be <code>null</code>
	 * @return a new, possibly empty, sorted list
	 */
	private static List<Annotation> sortedByOffset(AnnotationSet set) {
		List<Annotation> sorted = new ArrayList<Annotation>();
		if (set != null) {
			sorted.addAll(set);
		}
		Collections.sort(sorted, new gate.util.OffsetComparator());
		return sorted;
	}

	/**
	 * Builds the tagger input: one whitespace-free string per token.
	 *
	 * @param tokens
	 *            tokens of one sentence, in offset order
	 * @param text
	 *            full document text, used when a token has no
	 *            <code>string</code> feature
	 * @return array parallel to <code>tokens</code>
	 */
	private static String[] tokenStrings(List<Annotation> tokens, String text) {
		String[] words = new String[tokens.size()];
		int i = 0;
		for (Annotation token : tokens) {
			FeatureMap features = token.getFeatures();
			Object string = features == null ? null : features.get("string");
			String word;
			if (string != null) {
				word = string.toString();
			} else {
				// Fall back to the text covered by the token.
				int start = token.getStartNode().getOffset().intValue();
				int end = token.getEndNode().getOffset().intValue();
				word = text.substring(start, end);
			}
			words[i] = WHITESPACE.matcher(word).replaceAll("").trim();
			i++;
		}
		return words;
	}

	/** Renders a token array for diagnostics, each token followed by "@@@". */
	private static String joinTokens(String[] words) {
		StringBuffer buf = new StringBuffer();
		for (int j = 0; j < words.length; j++) {
			buf.append(words[j]).append("@@@");
		}
		return buf.toString();
	}

	/**
	 * Loads the model and tag dictionary and creates the tagger.
	 * <p>
	 * If either the model or the dictionary URL is unset, the bundled English
	 * model and dictionary paths are used for both.
	 *
	 * @return this resource
	 * @throws ResourceInstantiationException
	 *             if the model or the dictionary cannot be loaded
	 */
	@Override
	public Resource init() throws ResourceInstantiationException {
		String file;
		String lexicon;
		if (model == null || dictionary == null) {
			if (model != null || dictionary != null) {
				logger.warn("OpenNLP POS: only one of model/dictionary is set; using the defaults for both");
			}
			file = DEFAULT_MODEL;
			lexicon = DEFAULT_DICTIONARY;
		} else {
			file = model.getFile();
			lexicon = dictionary.getFile();
		}

		// getModel reports failure by returning null; fail here with a clear
		// message instead of handing null to the tagger.
		MaxentModel maxent = getModel(new File(file));
		if (maxent == null) {
			logger.error("OpenNLP POS can not be initialized!");
			throw new ResourceInstantiationException(
					"OpenNLP POS can not be initialized! Could not load model: " + file);
		}

		try {
			pos = new POSTaggerME(maxent, new POSDictionary(lexicon));
		} catch (IOException e) {
			logger.error("OpenNLP POS can not be initialized!", e);
			throw new ResourceInstantiationException(e);
		}
		logger.warn("OpenNLP POS initialized!");
		return this;
	}

	/** Re-creates the tagger by running {@link #init()} again. */
	@Override
	public void reInit() throws ResourceInstantiationException {
		init();
	}

	/**
	 * Reads a gzipped binary GIS model from a file.
	 *
	 * @author georgiev
	 * @param name
	 *            the model file
	 * @return the model, or <code>null</code> if the file could not be read
	 */
	public static MaxentModel getModel(File name) {
		InputStream in = null;
		try {
			in = new FileInputStream(name);
			// Re-assign so that the finally block closes the outermost stream
			// that was successfully created.
			in = new DataInputStream(new GZIPInputStream(in));
			return new BinaryGISModelReader((DataInputStream) in).getModel();
		} catch (IOException e) {
			logger.error("Could not read OpenNLP model from " + name, e);
			return null;
		} finally {
			if (in != null) {
				try {
					in.close();
				} catch (IOException ignored) {
					// nothing useful can be done if close fails
				}
			}
		}
	}

	/* getters and setters for the PR */

	/**
	 * @param a
	 *            name of the annotation set to read Sentence and Token
	 *            annotations from; <code>null</code> or empty for the default set
	 */
	public void setInputASName(String a) {
		inputASName = a;
	}

	/** @return the name of the input annotation set, possibly <code>null</code> */
	public String getInputASName() {
		return inputASName;
	}

	/** @return the URL of the model file, possibly <code>null</code> */
	public URL getModel() {
		return model;
	}

	/**
	 * @param model
	 *            URL of the gzipped binary model file
	 */
	public void setModel(URL model) {
		this.model = model;
	}

	/** @return the URL of the tag dictionary, possibly <code>null</code> */
	public URL getDictionary() {
		return dictionary;
	}

	/**
	 * @param dictionary
	 *            URL of the tag dictionary
	 */
	public void setDictionary(URL dictionary) {
		this.dictionary = dictionary;
	}

}