// GATE.ac.uk - releases/gate-7.0-build4195-ALL/plugins/OpenNLP/src/gate/opennlp/OpenNLPNameFin.java

package gate.opennlp;

import java.io.DataInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.zip.GZIPInputStream;

import org.apache.log4j.Logger;

import opennlp.maxent.MaxentModel;
import opennlp.maxent.io.BinaryGISModelReader;
import opennlp.tools.namefind.NameFinderME;
import gate.Annotation;
import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.util.InvalidOffsetException;
import gate.util.OffsetComparator;

/**
 * Wrapper for the opennlp namefinder
 * @author <A HREF="mailto:georgiev@ontotext.com">georgi.georgiev@ontotext.com</A>
 * Created: Thu Dec 11 16:25:59 EET 2008
 */

// Raw-type warnings stay suppressed because the OpenNLP find(...) call below is
// handed the raw Collections.EMPTY_MAP constant.
@SuppressWarnings("all")
public class OpenNLPNameFin extends AbstractLanguageAnalyser {

	public static final long serialVersionUID = 1L;

	private static final Logger logger = Logger.getLogger(OpenNLPNameFin.class);

	/** Outcome that execute() treats as "token is not part of an entity". */
	private static final String OUTCOME_OTHER = "other";

	/** Outcome that execute() treats as "token continues the current entity". */
	private static final String OUTCOME_CONTINUE = "cont";

	/** Model files resolved against the model directory URL, in load order. */
	private static final String[] MODEL_FILES = {
		"person.bin.gz",
		"money.bin.gz",
		"time.bin.gz",
		"percentage.bin.gz",
		"location.bin.gz",
		"date.bin.gz",
		"organization.bin.gz"
	};

	/** Name of the annotation set Sentence/Token annotations are read from; null or empty selects the default set. */
	private String inputASName = null;

	/** Name of the annotation set entity annotations are written to; null or empty selects the default set. */
	private String outputASName = null;

	/** Not used by this class; kept because the field is package-visible. */
	NameFinderME namefin = null;

	/** URL of the directory that contains the model files. */
	URL model;

	/** One loaded name finder per model file; populated by init(). */
	private List<NameFinder> finder;

	/**
	 * Runs every loaded name finder over each sentence of the current document.
	 * <p>
	 * For each "Sentence" annotation in the input set, the "Token" annotations
	 * between the sentence offsets are sorted with an OffsetComparator and the
	 * values of their "string" features are handed to each name finder. Every
	 * detected entity becomes one annotation in the output set, typed after the
	 * model it came from.
	 *
	 * @throws ExecutionException if there is no document, if init() has not
	 *             populated the finders, or if an annotation cannot be created
	 *             because of invalid offsets
	 */
	@Override
	public void execute() throws ExecutionException {
		if (document == null) {
			throw new ExecutionException("No document to process!");
		}
		if (finder == null) {
			throw new ExecutionException("OpenNLP Name Finder has not been initialized");
		}

		AnnotationSet inputAnnots = annotationSetFor(inputASName);
		AnnotationSet outputAnnots = annotationSetFor(outputASName);

		for (Annotation sentence : inputAnnots.get("Sentence")) {
			// Random-access list: the tokens are indexed by position further down.
			List<Annotation> tokenList = new java.util.ArrayList<Annotation>(
					inputAnnots.get("Token",
							sentence.getStartNode().getOffset(),
							sentence.getEndNode().getOffset()));
			if (tokenList.isEmpty()) {
				// Nothing to tag in this sentence.
				continue;
			}
			Collections.sort(tokenList, new OffsetComparator());

			String[] tokens = new String[tokenList.size()];
			int k = 0;
			for (Annotation token : tokenList) {
				tokens[k++] = (String) token.getFeatures().get("string");
			}

			for (NameFinder tagger : finder) {
				String[] outcomes = tagger.model.find(tokens, Collections.EMPTY_MAP);
				annotateEntities(tagger.name, outcomes, tokenList, outputAnnots);
			}
		}
	}

	/**
	 * Returns the named annotation set of the current document, or the default
	 * set when the name is null or empty.
	 */
	private AnnotationSet annotationSetFor(String name) {
		if (name != null && name.length() > 0) {
			return document.getAnnotations(name);
		}
		return document.getAnnotations();
	}

	/**
	 * Turns the per-token outcomes of one name finder into annotations.
	 * <p>
	 * Any outcome other than "other" opens an entity; the entity is extended
	 * over all directly following "cont" outcomes, including one on the very
	 * last token of the sentence.
	 *
	 * @param type         annotation type and value of the "type" feature
	 * @param outcomes     one outcome per token, parallel to tokenList
	 * @param tokenList    the sentence tokens in offset order
	 * @param outputAnnots the set that receives the new annotations
	 * @throws ExecutionException if the computed offsets are rejected
	 */
	private void annotateEntities(String type, String[] outcomes,
			List<Annotation> tokenList, AnnotationSet outputAnnots)
			throws ExecutionException {
		for (int i = 0; i < outcomes.length; i++) {
			if (outcomes[i].equals(OUTCOME_OTHER)) {
				continue;
			}

			int start = i;
			// Advance i to the last token of this entity; the bounds check makes
			// the look-ahead safe at the end of the sentence.
			while (i + 1 < outcomes.length
					&& OUTCOME_CONTINUE.equals(outcomes[i + 1])) {
				i++;
			}

			Long startOffset = tokenList.get(start).getStartNode().getOffset();
			Long endOffset = tokenList.get(i).getEndNode().getOffset();

			try {
				FeatureMap fm = Factory.newFeatureMap();
				fm.put("source", "openNLP");
				fm.put("type", type);
				// Covered document text, from the first to the last entity token.
				fm.put("string", document.getContent()
						.getContent(startOffset, endOffset).toString());

				outputAnnots.add(startOffset, endOffset, type, fm);
			} catch (InvalidOffsetException e) {
				throw new ExecutionException(e);
			}
		}
	}

	/**
	 * Loads one name finder per entry of MODEL_FILES from the model directory.
	 * A trailing slash is appended to the model URL when it is missing so that
	 * the file names resolve inside the directory.
	 *
	 * @return this resource
	 * @throws ResourceInstantiationException if no model URL is set, a model
	 *             URL is malformed, or a model cannot be loaded
	 */
	@Override
	public Resource init() throws ResourceInstantiationException {
		if (model == null) {
			throw new ResourceInstantiationException(
				"OpenNLP Name Finder could not be initialized: no model URL set");
		}
		try {
			URL modelDir = model;
			if (!modelDir.toExternalForm().endsWith("/")) {
				modelDir = new URL(modelDir.toExternalForm() + "/");
			}

			// Build into a local list so that a failed (re)initialisation does not
			// leave a partially filled finder list behind.
			List<NameFinder> loaded = new LinkedList<NameFinder>();
			for (String modelFile : MODEL_FILES) {
				loaded.add(new NameFinder(new URL(modelDir, modelFile)));
				logger.info("Initialized: " + modelFile);
			}
			finder = loaded;

			logger.warn("OpenNLP Name Finder initialized!");

			return this;
		} catch (MalformedURLException e) {
			throw new ResourceInstantiationException(
				"OpenNLP Name Finder could not be initialized",
				e);
		} catch (RuntimeException e) {
			// getModel(URL) reports I/O failures as RuntimeException; surface them
			// through the exception type this method declares.
			throw new ResourceInstantiationException(
				"OpenNLP Name Finder could not be initialized",
				e);
		}
	}

	/**
	 * Re-initialises the resource by running init() again.
	 *
	 * @throws ResourceInstantiationException if init() fails
	 */
	@Override
	public void reInit() throws ResourceInstantiationException {
		init();
	}

	/**
	 * Reads a gzip-compressed binary GIS model from the given URL. The stream
	 * opened on the URL is closed before this method returns.
	 *
	 * @author joro
	 * @param name URL of the model file
	 * @return the model read from the URL
	 * @throws RuntimeException if the model cannot be read; the IOException is
	 *             attached as the cause
	 */
	public static MaxentModel getModel(URL name) {
		java.io.InputStream raw = null;
		DataInputStream data = null;
		try {
			raw = name.openStream();
			data = new DataInputStream(new GZIPInputStream(raw));
			return new BinaryGISModelReader(data).getModel();
		} catch (IOException e) {
			logger.error("OpenNLP NameFinder can not be initialized!", e);
			throw new RuntimeException("OpenNLP NameFinder can not be initialized!", e);
		} finally {
			// Close the outermost stream that was successfully created; closing it
			// also closes the streams it wraps.
			java.io.InputStream toClose = (data != null) ? data : raw;
			if (toClose != null) {
				try {
					toClose.close();
				} catch (IOException ignored) {
					// Failure to close must not mask the result or the original error.
				}
			}
		}
	}

	/**
	 * A loaded name finder model together with the entity type it produces.
	 */
	private static class NameFinder {
		/** Entity type, derived from the model file name. */
		final String name;
		/** The OpenNLP name finder backed by the loaded model. */
		final NameFinderME model;

		/**
		 * Loads the model at the given URL.
		 *
		 * @param url location of the model file
		 */
		NameFinder(URL url) {
			name = typeNameOf(url.toExternalForm());
			model = new NameFinderME(OpenNLPNameFin.getModel(url));
		}

		/**
		 * Returns the text between the last '/' and the first '.' after it, or the
		 * whole path when there is no such '.'.
		 */
		private static String typeNameOf(String path) {
			int nameStart = path.lastIndexOf("/") + 1;
			int firstDot = path.indexOf(".", nameStart);
			if (firstDot < 0) {
				return path;
			}
			return path.substring(nameStart, firstDot);
		}
	}

	/* getters and setters for the PR */

	/** Sets the name of the input annotation set. */
	public void setInputASName(String a) {
		inputASName = a;
	}

	/** Returns the name of the input annotation set. */
	public String getInputASName() {
		return inputASName;
	}

	/** Sets the name of the output annotation set. */
	public void setOutputASName(String a) {
		outputASName = a;
	}

	/** Returns the name of the output annotation set. */
	public String getOutputASName() {
		return outputASName;
	}

	/** Returns the URL of the model directory. */
	public URL getModel() {
		return model;
	}

	/** Sets the URL of the model directory; it is read by init(). */
	public void setModel(URL model) {
		this.model = model;
	}

}