/* ********************************************************************** * Chemistry Tagger - A GATE Processing Resource * * Copyright (C) 2004-2009 The University of Sheffield * * Developed by Mark Greenwood <m.greenwood@dcs.shef.ac.uk> * * Modifications by Ian Roberts <i.roberts@dcs.shef.ac.uk> * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU Lesser General Public License as * * published by the Free Software Foundation; either version 2.1 of the * * License, or (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU Lesser General Public * * License along with this program; if not, write to the Free Software * * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * ************************************************************************/ package mark.chemistry; import java.io.BufferedReader; import java.io.InputStreamReader; import java.io.Serializable; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.net.URL; import gate.Annotation; import gate.AnnotationSet; import gate.Document; import gate.Factory; import gate.FeatureMap; import gate.Gate; import gate.LanguageAnalyser; import gate.ProcessingResource; import gate.Resource; import gate.creole.ResourceInstantiationException; import gate.creole.AbstractLanguageAnalyser; import gate.creole.ExecutionException; import gate.util.BomStrippingInputStreamReader; import gate.util.InvalidOffsetException; /** * A tagger for chemical elements and compounds. */ public class Tagger extends AbstractLanguageAnalyser implements ProcessingResource, Serializable { private LanguageAnalyser gazc = null; private LanguageAnalyser gazo = null; private LanguageAnalyser net = null; private String annotationSetName = null; // // Init parameters //// /** * The URL of the gazetteer lists definition for spotting elements as * part of compounds. */ private URL compoundListsURL; public void setCompoundListsURL(URL newValue) { compoundListsURL = newValue; } public URL getCompoundListsURL() { return compoundListsURL; } public void setAnnotationSetName(String name) { annotationSetName = name; } public String getAnnotationSetName() { return annotationSetName; } /** * The URL of the gazetteer lists definition for spotting elements on * their own. */ private URL elementListsURL; public void setElementListsURL(URL newValue) { elementListsURL = newValue; } public URL getElementListsURL() { return elementListsURL; } /** * URL of the JAPE grammar. */ private URL transducerGrammarURL; public void setTransducerGrammarURL(URL newValue) { transducerGrammarURL = newValue; } public URL getTransducerGrammarURL() { return transducerGrammarURL; } private Boolean removeElements; public void setRemoveElements(Boolean newValue) { removeElements = newValue; } public Boolean getRemoveElements() { return removeElements; } private URL elementMapURL; public void setElementMapURL(URL newValue) { elementMapURL = newValue; } public URL getElementMapURL() { return elementMapURL; } private List<String> elementSymbol, elementName; /** * Create the tagger by creating the various gazetteers and JAPE * transducers it uses. */ @Override public Resource init() throws ResourceInstantiationException { // sanity check parameters if(compoundListsURL == null) { throw new ResourceInstantiationException( "Compound lists URL must be specified"); } if(elementListsURL == null) { throw new ResourceInstantiationException( "Element lists URL must be specified"); } if(transducerGrammarURL == null) { throw new ResourceInstantiationException( "Transducer grammar URL must be specified"); } elementSymbol = new ArrayList<String>(); elementName = new ArrayList<String>(); try { BufferedReader in = new BomStrippingInputStreamReader( elementMapURL.openStream()); String symbol = in.readLine(); while(symbol != null) { symbol = symbol.trim(); String name = in.readLine().trim(); elementSymbol.add(symbol); elementName.add(name.toLowerCase()); symbol = in.readLine(); } } catch(Exception e) { throw new ResourceInstantiationException("Malformed element map file"); } FeatureMap hidden = Factory.newFeatureMap(); Gate.setHiddenAttribute(hidden, true); FeatureMap params = Factory.newFeatureMap(); params.put("listsURL", compoundListsURL); params.put("wholeWordsOnly", Boolean.FALSE); if(gazc == null) { gazc = (LanguageAnalyser)Factory.createResource( "gate.creole.gazetteer.DefaultGazetteer", params, hidden); } else { gazc.setParameterValues(params); gazc.reInit(); } params = Factory.newFeatureMap(); params.put("listsURL", elementListsURL); if(gazo == null) { gazo = (LanguageAnalyser)Factory.createResource( "gate.creole.gazetteer.DefaultGazetteer", params, hidden); } else { gazo.setParameterValues(params); gazo.reInit(); } params = Factory.newFeatureMap(); params.put("grammarURL", transducerGrammarURL); if(net == null) { net = (LanguageAnalyser)Factory.createResource("gate.creole.Transducer", params, hidden); } else { net.setParameterValues(params); net.reInit(); } return this; } public void cleanup() { Factory.deleteResource(gazc); Factory.deleteResource(gazo); Factory.deleteResource(net); } @Override public void execute() throws ExecutionException { Document doc = getDocument(); try { gazc.setDocument(doc); gazc.setParameterValue("annotationSetName", annotationSetName); gazo.setDocument(doc); gazo.setParameterValue("annotationSetName", annotationSetName); net.setDocument(doc); net.setParameterValue("inputASName", annotationSetName); net.setParameterValue("outputASName", annotationSetName); } catch(ResourceInstantiationException rie) { throw new ExecutionException(rie); } try { gazc.execute(); gazo.execute(); net.execute(); // This lot used to be in the clean.jape file but it was slowing // things down a lot as what I really wanted would have required // the brill style to do what it is meant to do. AnnotationSet docAS = doc.getAnnotations(annotationSetName); FeatureMap params = Factory.newFeatureMap(); AnnotationSet temp = docAS.get("NotACompound", params); if(temp != null) docAS.removeAll(temp); params.put("majorType", "CTelement"); temp = docAS.get("Lookup", params); if(temp != null) docAS.removeAll(temp); params.put("majorType", "chemTaggerSymbols"); temp = docAS.get("Lookup", params); if(temp != null) docAS.removeAll(temp); if(removeElements.booleanValue()) { params = Factory.newFeatureMap(); AnnotationSet compounds = docAS.get("ChemicalCompound", params); if(compounds != null) { Iterator<Annotation> cit = compounds.iterator(); while(cit.hasNext()) { Annotation compound = cit.next(); AnnotationSet elements = docAS.get("ChemicalElement", compound .getStartNode().getOffset(), compound.getEndNode() .getOffset()); if(elements != null) { docAS.removeAll(elements); } } } } params = Factory.newFeatureMap(); AnnotationSet elements = docAS.get("ChemicalElement", params); if(elements != null) { Iterator<Annotation> eit = elements.iterator(); while(eit.hasNext()) { Annotation element = eit.next(); try { String span = doc .getContent() .getContent(element.getStartNode().getOffset(), element.getEndNode().getOffset()).toString(); FeatureMap feats = element.getFeatures(); String type = (String)feats.get("kind"); if(type.equalsIgnoreCase("symbol")) { feats.put("symbol", span); int index = elementSymbol.indexOf(span); if(index != -1) { feats.put("name", elementName.get(index)); } feats.put("uri", "http://www.daml.org/2003/01/periodictable/PeriodicTable.owl#" + span); } else if(type.equalsIgnoreCase("name")) { feats.put("name", span); int index = elementName.indexOf(span.toLowerCase()); if(index != -1) { String symbol = elementSymbol.get(index); feats.put("symbol", symbol); feats.put("uri", "http://www.daml.org/2003/01/periodictable/PeriodicTable.owl#" + symbol); } } } catch(InvalidOffsetException ioe) { } } } } finally { // make sure document references are released after use gazc.setDocument(null); gazo.setDocument(null); net.setDocument(null); } } }