package gate.opennlp;

import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import org.apache.log4j.Logger;

import opennlp.maxent.MaxentModel;
import opennlp.maxent.io.BinaryGISModelReader;
import opennlp.tools.postag.POSDictionary;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.util.Span;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.util.InvalidOffsetException;

/**
 * Wrapper for the OpenNLP part-of-speech tagger.
 *
 * <p>For every {@code Sentence} annotation in the input annotation set, the
 * {@code Token} annotations it spans are collected in offset order, their
 * {@code string} features are handed to the tagger, and the resulting tag is
 * stored on each token as the {@code category} feature.
 *
 * <p>Both {@code Sentence} and {@code Token} annotations are read from the
 * annotation set named by {@link #setInputASName(String)}; the default
 * annotation set is used when that name is {@code null} or empty.
 *
 * @author <A HREF="mailto:georgiev@ontotext.com">georgi.georgiev@ontotext.com</A>
 * Created: Thu Dec 11 16:25:59 EET 2008
 */
public class OpenNlpPOS extends AbstractLanguageAnalyser {

  public static final long serialVersionUID = 1L;

  private static final Logger logger = Logger.getLogger(OpenNlpPOS.class);

  /** Model used when no model/dictionary pair has been configured. */
  private static final String DEFAULT_MODEL_PATH =
      "plugins/openNLP/models/english/postag/EnglishPOS.bin.gz";

  /** Tag dictionary used when no model/dictionary pair has been configured. */
  private static final String DEFAULT_DICTIONARY_PATH =
      "plugins/openNLP/models/english/postag/tagdict";

  /** Compiled once; applied to every token string before tagging. */
  private static final Pattern WHITESPACE = Pattern.compile("\\s+");

  /** Separator used when a failing sentence is written to the log. */
  private static final String TOKEN_LOG_SEPARATOR = "@@@";

  // private members
  private String inputASName = null;
  POSTaggerME pos = null;
  URL model;
  URL dictionary;

  /**
   * Tags every token of every sentence of the current document.
   *
   * <p>A sentence on which the tagger fails, or for which it returns a
   * number of tags different from the number of tokens, is logged and
   * skipped; the remaining sentences are still processed.
   *
   * @throws ExecutionException if the text of a token without a
   *         {@code string} feature cannot be read from the document content
   */
  @Override
  public void execute() throws ExecutionException {
    // Resolve the annotation set once and use it for BOTH sentences and
    // tokens, so that the inputASName parameter is actually honoured.
    AnnotationSet annotations;
    if (inputASName != null && inputASName.length() > 0) {
      annotations = document.getAnnotations(inputASName);
    } else {
      annotations = document.getAnnotations();
    }

    List<Annotation> sentList = sortedByOffset(annotations.get("Sentence"));

    for (Annotation sentenceAnnotation : sentList) {
      List<Annotation> tokenList = sortedByOffset(annotations.get("Token",
          sentenceAnnotation.getStartNode().getOffset(),
          sentenceAnnotation.getEndNode().getOffset()));
      if (tokenList.isEmpty()) {
        // nothing to tag in this sentence
        continue;
      }

      String[] sentence = new String[tokenList.size()];
      for (int i = 0; i < sentence.length; i++) {
        sentence[i] = tokenText(tokenList.get(i));
      }

      // The broad catch is deliberate: one sentence the tagger cannot handle
      // must not abort processing of the rest of the document.
      String[] postags;
      try {
        postags = pos.tag(sentence);
      } catch (Exception e) {
        logger.error("OpenNLP POS tagger failed on sentence: "
            + describe(sentence), e);
        continue;
      }

      if (postags == null || postags.length != sentence.length) {
        logger.warn("OpenNLP POS tagger returned "
            + (postags == null ? "no" : String.valueOf(postags.length))
            + " tags for " + sentence.length + " tokens; skipping sentence: "
            + describe(sentence));
        continue;
      }

      // write the tags back onto the token annotations
      for (int j = 0; j < postags.length; j++) {
        Annotation token = tokenList.get(j);
        FeatureMap fm = token.getFeatures();
        if (fm == null) {
          fm = Factory.newFeatureMap();
        }
        fm.put("category", postags[j]);
        token.setFeatures(fm);
      }
    }
  }

  /**
   * Loads the model and tag dictionary and creates the tagger.
   *
   * <p>The built-in English model and dictionary are used unless BOTH the
   * {@code model} and the {@code dictionary} parameters are set.
   *
   * @return this resource
   * @throws ResourceInstantiationException if the model or the dictionary
   *         cannot be loaded
   */
  @Override
  public Resource init() throws ResourceInstantiationException {
    String file;
    String lexicon;
    if (model == null || dictionary == null) {
      file = DEFAULT_MODEL_PATH;
      lexicon = DEFAULT_DICTIONARY_PATH;
    } else {
      file = toLocalPath(model);
      lexicon = toLocalPath(dictionary);
    }

    // getModel() reports failure by returning null; fail here rather than
    // building a tagger that would throw on every sentence later on.
    MaxentModel maxentModel = getModel(new File(file));
    if (maxentModel == null) {
      logger.error("OpenNLP POS can not be initialized!");
      throw new ResourceInstantiationException(
          "OpenNLP POS can not be initialized: unable to load model from "
          + file);
    }

    try {
      pos = new POSTaggerME(maxentModel, new POSDictionary(lexicon));
    } catch (IOException e) {
      logger.error("OpenNLP POS can not be initialized!", e);
      throw new ResourceInstantiationException(e);
    }

    logger.info("OpenNLP POS initialized!");
    return this;
  }

  /**
   * Re-initialises the resource by running {@link #init()} again.
   *
   * @throws ResourceInstantiationException if {@link #init()} fails
   */
  @Override
  public void reInit() throws ResourceInstantiationException {
    init();
  }

  /**
   * Reads a gzip-compressed binary GIS maxent model from a file.
   *
   * @author georgiev
   * @param name the model file
   * @return the model, or {@code null} if the file could not be read
   */
  public static MaxentModel getModel(File name) {
    InputStream in = null;
    try {
      in = new FileInputStream(name);
      return new BinaryGISModelReader(new DataInputStream(
          new GZIPInputStream(in))).getModel();
    } catch (IOException e) {
      logger.error("Unable to read maxent model from " + name, e);
      return null;
    } finally {
      // Closing the file stream directly also covers the case where the
      // GZIP wrapper could not be constructed.
      if (in != null) {
        try {
          in.close();
        } catch (IOException ignored) {
          // nothing useful can be done if close() fails
        }
      }
    }
  }

  /**
   * Copies an annotation set into a list ordered with
   * {@code gate.util.OffsetComparator}.
   *
   * @param set the annotations to order; may be {@code null}
   * @return a new, sorted list; empty when {@code set} is {@code null}
   */
  // NOTE(review): the suppression only matters if AnnotationSet or
  // OffsetComparator are raw types in the GATE version in use - confirm and
  // drop it if they are generic.
  @SuppressWarnings("unchecked")
  private static List<Annotation> sortedByOffset(AnnotationSet set) {
    if (set == null) {
      return new ArrayList<Annotation>();
    }
    List<Annotation> sorted = new ArrayList<Annotation>(set);
    Collections.sort(sorted, new gate.util.OffsetComparator());
    return sorted;
  }

  /**
   * Returns the text of a token with all whitespace removed.
   *
   * <p>The {@code string} feature is used when present; otherwise the text
   * is read from the document content between the token's offsets.
   *
   * @param token the token annotation
   * @return the whitespace-free token text
   * @throws ExecutionException if the token offsets are rejected by the
   *         document content
   */
  private String tokenText(Annotation token) throws ExecutionException {
    FeatureMap features = token.getFeatures();
    Object value = (features == null) ? null : features.get("string");
    String raw;
    if (value != null) {
      raw = value.toString();
    } else {
      try {
        raw = document.getContent().getContent(
            token.getStartNode().getOffset(),
            token.getEndNode().getOffset()).toString();
      } catch (InvalidOffsetException e) {
        throw new ExecutionException(e);
      }
    }
    return WHITESPACE.matcher(raw).replaceAll("").trim();
  }

  /**
   * Joins the tokens of a sentence for use in a log message.
   *
   * @param tokens the token strings
   * @return every token followed by {@code "@@@"}
   */
  private static String describe(String[] tokens) {
    StringBuilder buf = new StringBuilder();
    for (int i = 0; i < tokens.length; i++) {
      buf.append(tokens[i]).append(TOKEN_LOG_SEPARATOR);
    }
    return buf.toString();
  }

  /**
   * Converts a URL to a file-system path, decoding percent-escapes such as
   * {@code %20}.
   *
   * <p>Falls back to {@link URL#getFile()} when the URL is not a valid
   * hierarchical {@code file:} URI.
   *
   * @param url the URL to convert
   * @return the path to pass to the file-based OpenNLP loaders
   */
  private static String toLocalPath(URL url) {
    try {
      return new File(url.toURI()).getPath();
    } catch (URISyntaxException e) {
      return url.getFile();
    } catch (IllegalArgumentException e) {
      return url.getFile();
    }
  }

  /* getters and setters for the PR */

  /**
   * @param a name of the annotation set holding the Sentence and Token
   *        annotations; {@code null} or empty selects the default set
   */
  public void setInputASName(String a) {
    inputASName = a;
  }

  /** @return the name of the input annotation set, possibly {@code null} */
  public String getInputASName() {
    return inputASName;
  }

  /** @return the URL of the POS model, possibly {@code null} */
  public URL getModel() {
    return model;
  }

  /** @param model the URL of the POS model */
  public void setModel(URL model) {
    this.model = model;
  }

  /** @return the URL of the tag dictionary, possibly {@code null} */
  public URL getDictionary() {
    return dictionary;
  }

  /** @param dictionary the URL of the tag dictionary */
  public void setDictionary(URL dictionary) {
    this.dictionary = dictionary;
  }
}