// Log in | Help
// Print
// Home 〉 gate 〉 plugins 〉 Tagger_Chemistry 〉 src 〉 mark 〉 chemistry 〉 Tagger.java
 
/* **********************************************************************
 *           Chemistry Tagger - A GATE Processing Resource              *
 *         Copyright (C) 2004-2009 The University of Sheffield          *
 *       Developed by Mark Greenwood <m.greenwood@dcs.shef.ac.uk>       *
 *       Modifications by Ian Roberts <i.roberts@dcs.shef.ac.uk>        *
 *                                                                      *
 * This program is free software; you can redistribute it and/or modify *
 * it under the terms of the GNU Lesser General Public License as       *
 * published by the Free Software Foundation; either version 2.1 of the *
 * License, or (at your option) any later version.                      *
 *                                                                      *
 * This program is distributed in the hope that it will be useful,      *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of       *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        *
 * GNU General Public License for more details.                         *
 *                                                                      *
 * You should have received a copy of the GNU Lesser General Public     *
 * License along with this program; if not, write to the Free Software  *
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.            *
 ************************************************************************/
package mark.chemistry;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.LanguageAnalyser;
import gate.ProcessingResource;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.util.BomStrippingInputStreamReader;
import gate.util.InvalidOffsetException;

/**
 * A tagger for chemical elements and compounds.
 * <p>
 * The tagger drives three hidden sub-resources which it creates in
 * {@link #init()}: a gazetteer built from the compound lists (matching
 * inside words as well as whole words), a gazetteer built from the
 * element lists, and a JAPE transducer. On {@link #execute()} they are
 * run in that order over the current document, temporary annotations
 * are removed, and every remaining <code>ChemicalElement</code>
 * annotation is given <code>symbol</code>, <code>name</code> and
 * <code>uri</code> features where they can be determined.
 */
public class Tagger extends AbstractLanguageAnalyser implements
                                                    ProcessingResource,
                                                    Serializable {
  /** Prefix of the URI stored in the <code>uri</code> feature. */
  private static final String ELEMENT_URI_PREFIX =
          "http://www.daml.org/2003/01/periodictable/PeriodicTable.owl#";

  /** Class name of the gazetteer used for both list-based lookups. */
  private static final String GAZETTEER_CLASS =
          "gate.creole.gazetteer.DefaultGazetteer";

  /** Gazetteer built from {@link #compoundListsURL}. */
  private LanguageAnalyser gazc = null;

  /** Gazetteer built from {@link #elementListsURL}. */
  private LanguageAnalyser gazo = null;

  /** Transducer built from {@link #transducerGrammarURL}. */
  private LanguageAnalyser net = null;

  /** Name of the annotation set read from and written to. */
  private String annotationSetName = null;

  // // Init parameters ////
  /**
   * The URL of the gazetteer lists definition for spotting elements as
   * part of compounds.
   */
  private URL compoundListsURL;

  public void setCompoundListsURL(URL newValue) {
    compoundListsURL = newValue;
  }

  public URL getCompoundListsURL() {
    return compoundListsURL;
  }

  public void setAnnotationSetName(String name) {
    annotationSetName = name;
  }

  public String getAnnotationSetName() {
    return annotationSetName;
  }

  /**
   * The URL of the gazetteer lists definition for spotting elements on
   * their own.
   */
  private URL elementListsURL;

  public void setElementListsURL(URL newValue) {
    elementListsURL = newValue;
  }

  public URL getElementListsURL() {
    return elementListsURL;
  }

  /**
   * URL of the JAPE grammar.
   */
  private URL transducerGrammarURL;

  public void setTransducerGrammarURL(URL newValue) {
    transducerGrammarURL = newValue;
  }

  public URL getTransducerGrammarURL() {
    return transducerGrammarURL;
  }

  /**
   * When true, <code>ChemicalElement</code> annotations selected by the
   * offsets of a <code>ChemicalCompound</code> annotation are removed.
   * A <code>null</code> value is treated as false.
   */
  private Boolean removeElements;

  public void setRemoveElements(Boolean newValue) {
    removeElements = newValue;
  }

  public Boolean getRemoveElements() {
    return removeElements;
  }

  /**
   * URL of the element map file: alternating lines holding an element
   * symbol followed by the corresponding element name.
   */
  private URL elementMapURL;

  public void setElementMapURL(URL newValue) {
    elementMapURL = newValue;
  }

  public URL getElementMapURL() {
    return elementMapURL;
  }

  /**
   * Parallel lists loaded from the element map: the symbol at index i
   * belongs to the (lower-cased) name at index i.
   */
  private List<String> elementSymbol, elementName;

  /**
   * Create the tagger by creating the various gazetteers and JAPE
   * transducers it uses. If the sub-resources already exist they are
   * given the current parameter values and re-initialised instead.
   *
   * @return this resource
   * @throws ResourceInstantiationException if a required URL parameter
   *           is missing, the element map cannot be read or is
   *           truncated, or a sub-resource fails to initialise
   */
  @Override
  public Resource init() throws ResourceInstantiationException {
    // sanity check parameters
    if(compoundListsURL == null) {
      throw new ResourceInstantiationException(
              "Compound lists URL must be specified");
    }
    if(elementListsURL == null) {
      throw new ResourceInstantiationException(
              "Element lists URL must be specified");
    }
    if(transducerGrammarURL == null) {
      throw new ResourceInstantiationException(
              "Transducer grammar URL must be specified");
    }
    if(elementMapURL == null) {
      throw new ResourceInstantiationException(
              "Element map URL must be specified");
    }

    loadElementMap();

    FeatureMap hidden = Factory.newFeatureMap();
    Gate.setHiddenAttribute(hidden, true);

    FeatureMap params = Factory.newFeatureMap();
    params.put("listsURL", compoundListsURL);
    // elements have to be found inside longer compound strings
    params.put("wholeWordsOnly", Boolean.FALSE);
    gazc = createOrReInit(gazc, GAZETTEER_CLASS, params, hidden);

    params = Factory.newFeatureMap();
    params.put("listsURL", elementListsURL);
    gazo = createOrReInit(gazo, GAZETTEER_CLASS, params, hidden);

    params = Factory.newFeatureMap();
    params.put("grammarURL", transducerGrammarURL);
    net = createOrReInit(net, "gate.creole.Transducer", params, hidden);

    return this;
  }

  /**
   * Reads the element map into {@link #elementSymbol} and
   * {@link #elementName}. The fields are only replaced once the whole
   * file has been read successfully, and the reader is always closed.
   */
  private void loadElementMap() throws ResourceInstantiationException {
    List<String> symbols = new ArrayList<String>();
    List<String> names = new ArrayList<String>();
    BufferedReader in = null;
    try {
      in = new BomStrippingInputStreamReader(elementMapURL.openStream());
      String symbol = in.readLine();
      while(symbol != null) {
        String name = in.readLine();
        if(name == null) {
          // odd number of lines: the last symbol has no name
          throw new ResourceInstantiationException(
                  "Malformed element map file: symbol \"" + symbol.trim()
                          + "\" is not followed by a name");
        }
        symbols.add(symbol.trim());
        // fixed locale so lookups do not depend on the platform default
        names.add(name.trim().toLowerCase(Locale.ENGLISH));
        symbol = in.readLine();
      }
    }
    catch(IOException e) {
      throw new ResourceInstantiationException("Malformed element map file",
              e);
    }
    finally {
      if(in != null) {
        try {
          in.close();
        }
        catch(IOException ignored) {
          // the data has already been read (or a more useful exception
          // is propagating); a failure to close is not actionable
        }
      }
    }
    elementSymbol = symbols;
    elementName = names;
  }

  /**
   * Creates a hidden sub-resource of the given class, or, when one
   * already exists, applies the parameters to it and re-initialises it.
   *
   * @return the resource to use from now on
   */
  private static LanguageAnalyser createOrReInit(LanguageAnalyser existing,
          String className, FeatureMap params, FeatureMap hidden)
          throws ResourceInstantiationException {
    if(existing == null) {
      return (LanguageAnalyser)Factory.createResource(className, params,
              hidden);
    }
    existing.setParameterValues(params);
    existing.reInit();
    return existing;
  }

  /**
   * Deletes the sub-resources created by {@link #init()}. Safe to call
   * when initialisation failed part-way or never happened; references
   * are cleared so that a later {@link #init()} creates fresh resources.
   */
  @Override
  public void cleanup() {
    if(gazc != null) {
      Factory.deleteResource(gazc);
      gazc = null;
    }
    if(gazo != null) {
      Factory.deleteResource(gazo);
      gazo = null;
    }
    if(net != null) {
      Factory.deleteResource(net);
      net = null;
    }
  }

  /**
   * Runs both gazetteers and the transducer over the current document,
   * then tidies up and enriches the resulting annotations.
   *
   * @throws ExecutionException if no document has been set, a
   *           sub-resource rejects its runtime parameters, or a
   *           sub-resource fails while executing
   */
  @Override
  public void execute() throws ExecutionException {

    Document doc = getDocument();
    if(doc == null) {
      throw new ExecutionException("No document to process");
    }

    try {
      try {
        gazc.setDocument(doc);
        gazc.setParameterValue("annotationSetName", annotationSetName);

        gazo.setDocument(doc);
        gazo.setParameterValue("annotationSetName", annotationSetName);

        net.setDocument(doc);
        net.setParameterValue("inputASName", annotationSetName);
        net.setParameterValue("outputASName", annotationSetName);
      }
      catch(ResourceInstantiationException rie) {
        throw new ExecutionException(rie);
      }

      gazc.execute();
      gazo.execute();
      net.execute();

      // This lot used to be in the clean.jape file but it was slowing
      // things down a lot as what I really wanted would have required
      // the brill style to do what it is meant to do.
      AnnotationSet docAS = doc.getAnnotations(annotationSetName);
      removeTemporaryAnnotations(docAS);
      if(removeElements != null && removeElements.booleanValue()) {
        removeElementsInsideCompounds(docAS);
      }
      addElementFeatures(doc, docAS);
    }
    finally {
      // make sure document references are released after use, including
      // when setting the runtime parameters failed
      gazc.setDocument(null);
      gazo.setDocument(null);
      net.setDocument(null);
    }
  }

  /**
   * Removes all <code>NotACompound</code> annotations and the
   * <code>Lookup</code> annotations whose <code>majorType</code> is
   * <code>CTelement</code> or <code>chemTaggerSymbols</code>.
   */
  private void removeTemporaryAnnotations(AnnotationSet docAS) {
    FeatureMap params = Factory.newFeatureMap();
    AnnotationSet temp = docAS.get("NotACompound", params);
    if(temp != null) {
      docAS.removeAll(temp);
    }

    params.put("majorType", "CTelement");
    temp = docAS.get("Lookup", params);
    if(temp != null) {
      docAS.removeAll(temp);
    }

    params.put("majorType", "chemTaggerSymbols");
    temp = docAS.get("Lookup", params);
    if(temp != null) {
      docAS.removeAll(temp);
    }
  }

  /**
   * For every <code>ChemicalCompound</code> annotation, removes the
   * <code>ChemicalElement</code> annotations selected by the compound's
   * start and end offsets.
   */
  private void removeElementsInsideCompounds(AnnotationSet docAS) {
    AnnotationSet compounds =
            docAS.get("ChemicalCompound", Factory.newFeatureMap());
    if(compounds == null) {
      return;
    }
    for(Annotation compound : compounds) {
      AnnotationSet elements = docAS.get("ChemicalElement", compound
              .getStartNode().getOffset(), compound.getEndNode().getOffset());
      if(elements != null) {
        docAS.removeAll(elements);
      }
    }
  }

  /**
   * Adds <code>symbol</code>, <code>name</code> and <code>uri</code>
   * features to each <code>ChemicalElement</code> annotation, based on
   * its <code>kind</code> feature and the text it covers. Annotations
   * without a <code>kind</code> feature are left unchanged.
   */
  private void addElementFeatures(Document doc, AnnotationSet docAS) {
    AnnotationSet elements =
            docAS.get("ChemicalElement", Factory.newFeatureMap());
    if(elements == null) {
      return;
    }
    for(Annotation element : elements) {
      String span;
      try {
        span = doc
                .getContent()
                .getContent(element.getStartNode().getOffset(),
                        element.getEndNode().getOffset()).toString();
      }
      catch(InvalidOffsetException ignored) {
        // The offsets are taken from the annotation itself; if they are
        // nevertheless rejected there is no text to work with, so this
        // annotation is skipped and the rest are still processed.
        continue;
      }

      FeatureMap feats = element.getFeatures();
      Object kind = feats.get("kind");
      if(kind == null) {
        continue;
      }
      String type = kind.toString();

      if(type.equalsIgnoreCase("symbol")) {
        feats.put("symbol", span);
        int index = elementSymbol.indexOf(span);
        if(index != -1) {
          feats.put("name", elementName.get(index));
        }
        feats.put("uri", ELEMENT_URI_PREFIX + span);
      }
      else if(type.equalsIgnoreCase("name")) {
        feats.put("name", span);
        // same fixed locale as used when the map was loaded
        int index = elementName.indexOf(span.toLowerCase(Locale.ENGLISH));
        if(index != -1) {
          String symbol = elementSymbol.get(index);
          feats.put("symbol", symbol);
          feats.put("uri", ELEMENT_URI_PREFIX + symbol);
        }
      }
    }
  }
}