/*
 *  StandAloneAnnie.java
 *
 *
 * Copyright (c) 2000-2001, The University of Sheffield.
 *
 * This file is part of GATE (see http://gate.ac.uk/), and is free
 * software, licenced under the GNU Library General Public License,
 * Version 2, June1991.
 *
 * A copy of this licence is included in the distribution in the file
 * licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
 *
 *  hamish, 29/1/2002
 *
 *  $Id: StandAloneAnnie.java,v 1.6 2006/01/09 16:43:22 ian Exp $
 */
package sheffield.examples;

import java.util.*;
import java.io.*;
import java.net.*;

import gate.*;
import gate.creole.*;
import gate.util.*;
import gate.util.persistence.PersistenceManager;
import gate.corpora.RepositioningInfo;

/**
 * This class illustrates how to use ANNIE as a sausage machine
 * in another application - put ingredients in one end (URLs pointing
 * to documents) and get sausages (e.g. Named Entities) out the
 * other end.
 * <P><B>NOTE:</B><BR>
 * For simplicity's sake, we don't do any exception handling.
 */
public class StandAloneAnnie {

  /** Opening fragment of the highlight tag, up to the annotation ID. */
  private static final String START_TAG_PART_1 = "<span GateID=\"";

  /** Fragment placed between the annotation ID and the annotation type. */
  private static final String START_TAG_PART_2 = "\" title=\"";

  /** Closing fragment of the opening highlight tag. */
  private static final String START_TAG_PART_3 = "\" style=\"background:Red;\">";

  /** Tag that closes a highlighted span. */
  private static final String END_TAG = "</span>";

  /** The Corpus Pipeline application to contain ANNIE */
  private CorpusController annieController;

  /**
   * Initialise the ANNIE system. This creates a "corpus pipeline"
   * application that can be used to run sets of documents through
   * the extraction system.
   *
   * @throws GateException if the saved application cannot be restored
   * @throws IOException if the saved application file cannot be read
   */
  public void initAnnie() throws GateException, IOException {
    Out.prln("Initialising ANNIE...");

    // load the ANNIE application from the saved state in plugins/ANNIE
    File pluginsHome = Gate.getPluginsHome();
    File anniePlugin = new File(pluginsHome, "ANNIE");
    File annieGapp = new File(anniePlugin, "ANNIE_with_defaults.gapp");
    annieController =
      (CorpusController) PersistenceManager.loadObjectFromFile(annieGapp);

    Out.prln("...ANNIE loaded");
  } // initAnnie()

  /**
   * Tell ANNIE's controller about the corpus you want to run on.
   *
   * @param corpus the corpus handed to the controller
   */
  public void setCorpus(Corpus corpus) {
    annieController.setCorpus(corpus);
  } // setCorpus

  /**
   * Run ANNIE over the corpus previously passed to
   * {@link #setCorpus(Corpus)}.
   *
   * @throws GateException if the controller's execution fails
   */
  public void execute() throws GateException {
    Out.prln("Running ANNIE...");
    annieController.execute();
    Out.prln("...ANNIE complete");
  } // execute()

  /**
   * Run from the command-line, with a list of URLs as argument.
   * For every document two files are written to the working directory:
   * <code>StANNIE_&lt;n&gt;.HTML</code> (the original content with Person and
   * Location annotations wrapped in <code>span</code> tags, only when the
   * original content was preserved) and
   * <code>StANNIE_toXML_&lt;n&gt;.HTML</code> (the result of
   * <code>Document.toXml</code> for those annotations).
   * <P><B>NOTE:</B><BR>
   * This code will run with all the documents in memory - if you
   * want to unload each from memory after use, add code to store
   * the corpus in a DataStore.
   *
   * @param args one URL per document to process
   * @throws GateException if GATE or ANNIE fail to initialise or run
   * @throws IOException if a URL is malformed or an output file cannot
   *         be written
   */
  public static void main(String[] args) throws GateException, IOException {
    // initialise the GATE library
    Out.prln("Initialising GATE...");
    Gate.init();
    Out.prln("...GATE initialised");

    // initialise ANNIE (this may take several minutes)
    StandAloneAnnie annie = new StandAloneAnnie();
    annie.initAnnie();

    // create a GATE corpus and add a document for each command-line
    // argument
    Corpus corpus = Factory.newCorpus("StandAloneAnnie corpus");
    for (int i = 0; i < args.length; i++) {
      URL u = new URL(args[i]);
      FeatureMap params = Factory.newFeatureMap();
      params.put("sourceUrl", u);
      params.put("preserveOriginalContent", Boolean.TRUE);
      params.put("collectRepositioningInfo", Boolean.TRUE);
      Out.prln("Creating doc for " + u);
      Document doc = (Document)
        Factory.createResource("gate.corpora.DocumentImpl", params);
      corpus.add(doc);
    } // for each of args

    // tell the pipeline about the corpus and run it
    annie.setCorpus(corpus);
    annie.execute();

    // the annotation types of interest are the same for every document
    Set<String> annotTypesRequired = new HashSet<String>();
    annotTypesRequired.add("Person");
    annotTypesRequired.add("Location");

    // for each document, get an XML document with the
    // person and location names added
    int count = 0;
    for (Object member : corpus) {
      Document doc = (Document) member;

      // NOTE(review): guard against a null result - some GATE releases are
      // believed to return null rather than an empty set; confirm.
      AnnotationSet matches = doc.getAnnotations().get(annotTypesRequired);
      Set<Annotation> peopleAndPlaces = new HashSet<Annotation>();
      if (matches != null) {
        peopleAndPlaces.addAll(matches);
      }

      FeatureMap features = doc.getFeatures();
      String originalContent = (String)
        features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
      RepositioningInfo info = (RepositioningInfo)
        features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);

      ++count;
      File file = new File("StANNIE_" + count + ".HTML");
      Out.prln("File name: '" + file.getAbsolutePath() + "'");

      if (originalContent == null) {
        // nothing to mark up: report what was (not) available
        Out.prln("Content : " + originalContent);
        Out.prln("Repositioning: " + info);
      } else {
        if (info != null) {
          Out.prln("OrigContent and reposInfo existing. Generate file...");
        } else {
          Out.prln("OrigContent existing. Generate file...");
        }
        writeFile(file,
          markUpAnnotations(originalContent, info, peopleAndPlaces));
      }

      String xmlDocument = doc.toXml(peopleAndPlaces, false);
      writeFile(new File("StANNIE_toXML_" + count + ".HTML"), xmlDocument);
    } // for each doc
  } // main

  /**
   * Wraps every non-overlapping annotation in a highlight
   * <code>span</code> inside a copy of the original content.
   *
   * @param originalContent the text the tags are inserted into
   * @param info maps annotation offsets back to positions in the original
   *        content; when <code>null</code> the annotation offsets are
   *        used as they are
   * @param annotations the annotations to highlight; an annotation that
   *        overlaps one already accepted is skipped
   * @return the content with the tags inserted
   */
  private static String markUpAnnotations(String originalContent,
                                          RepositioningInfo info,
                                          Set<Annotation> annotations) {
    SortedAnnotationList sortedAnnotations = new SortedAnnotationList();
    for (Annotation annotation : annotations) {
      sortedAnnotations.addSortedExclusive(annotation);
    }

    Out.prln("Unsorted annotations count: " + annotations.size());
    Out.prln("Sorted annotations count: " + sortedAnnotations.size());

    StringBuilder editableContent = new StringBuilder(originalContent);

    // insert annotation tags backward, so that the offsets of annotations
    // still to be processed are not shifted by tags already inserted
    for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
      Annotation currAnnot = sortedAnnotations.get(i);
      long insertPositionStart =
        currAnnot.getStartNode().getOffset().longValue();
      long insertPositionEnd =
        currAnnot.getEndNode().getOffset().longValue();
      if (info != null) {
        insertPositionStart = info.getOriginalPos(insertPositionStart);
        insertPositionEnd = info.getOriginalPos(insertPositionEnd, true);
      }

      // -1 is treated as "no usable position"; such annotations are skipped
      if (insertPositionEnd != -1 && insertPositionStart != -1) {
        // closing tag first: inserting it does not move the start position
        editableContent.insert((int) insertPositionEnd, END_TAG);
        editableContent.insert((int) insertPositionStart,
          START_TAG_PART_1 + currAnnot.getId()
          + START_TAG_PART_2 + currAnnot.getType()
          + START_TAG_PART_3);
      } // if
    } // for

    return editableContent.toString();
  } // markUpAnnotations

  /**
   * Writes the text to the file, closing the writer even when the
   * write fails.
   *
   * @param file the file to create or overwrite
   * @param content the text to write
   * @throws IOException if the file cannot be opened or written
   */
  private static void writeFile(File file, String content)
      throws IOException {
    FileWriter writer = new FileWriter(file);
    try {
      writer.write(content);
    } finally {
      writer.close();
    }
  } // writeFile

  /**
   * A list of annotations kept in ascending order of start offset in
   * which no two members overlap.
   */
  public static class SortedAnnotationList extends Vector<Annotation> {

    /** Creates an empty list. */
    public SortedAnnotationList() {
      super();
    } // SortedAnnotationList

    /**
     * Inserts the annotation at its sorted position unless it overlaps
     * an annotation that is already in the list.
     *
     * @param annot the annotation to insert
     * @return <code>true</code> if the annotation was inserted,
     *         <code>false</code> if it overlaps an existing member and
     *         was therefore left out
     */
    public boolean addSortedExclusive(Annotation annot) {
      long annotStart = annot.getStartNode().getOffset().longValue();
      int total = size();

      // one pass: reject on any overlap and remember the first member
      // that starts after the new annotation
      int insertAt = total;
      for (int i = 0; i < total; ++i) {
        Annotation currAnot = get(i);
        if (annot.overlaps(currAnot)) {
          return false;
        } // if
        if (insertAt == total
            && annotStart < currAnot.getStartNode().getOffset().longValue()) {
          insertAt = i;
        } // if
      } // for

      insertElementAt(annot, insertAt);
      return true;
    } // addSorted
  } // SortedAnnotationList
} // class StandAloneAnnie