// GATE.ac.uk - releases/gate-5.1-beta2-build3402-ALL/plugins/Gazetteer_Ontology

/*
 * OntoRootGaz.java
 * 
 * Copyright (c) 1998-2008, The University of Sheffield.
 * 
 * This file is part of GATE (see http://gate.ac.uk/), and is free software,
 * licenced under the GNU Library General Public License, Version 2, June 1991
 * (in the distribution as file licence.html, and also available at
 * http://gate.ac.uk/gate/licence.html).
 */
package gate.clone.ql;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.ConsoleHandler;
import java.util.logging.FileHandler;
import java.util.logging.Formatter;
import java.util.logging.Level;
import java.util.logging.LogRecord;
import java.util.logging.Logger;
import gate.Annotation;
import gate.Corpus;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.Resource;
import gate.clone.ql.CATConstants;
import gate.clone.ql.FakeSentenceSplitter;
import gate.clone.ql.Ontology2MapManager;
import gate.clone.ql.regex.ExpressionFinder;
import gate.creole.ANNIEConstants;
import gate.creole.ExecutionException;
import gate.creole.POSTagger;
import gate.creole.ResourceInstantiationException;
import gate.creole.SerialAnalyserController;
import gate.creole.gazetteer.DefaultGazetteer;
import gate.creole.gazetteer.FSMState;
import gate.creole.gazetteer.LinearDefinition;
import gate.creole.gazetteer.Lookup;
import gate.creole.morph.Morph;
import gate.creole.ontology.*;
import gate.creole.tokeniser.DefaultTokeniser;
import gate.util.OffsetComparator;

/**
 * 
 * @author Valentin Tablan, Danica Damljanovic
 * 
 */
public class OntoRootGaz extends DefaultGazetteer {
  private static final long serialVersionUID = 0L;

  /** Part-of-speech tagger; init() adds it to the root finder application. */
  protected POSTagger posTagger;

  /** Tokeniser; init() adds it to the root finder application. */
  protected DefaultTokeniser tokeniser;

  /**
   * Sentence splitter used by the root finder application. init() creates one
   * when this field is null and deletes it again before returning.
   */
  protected FakeSentenceSplitter sentenceSplitter;

  /** Morphological analyser; init() adds it to the root finder application. */
  protected Morph morpher;

  /**
   * Pipeline (tokeniser, sentence splitter, POS tagger, morpher) that is run
   * over each gazetteer entry in order to find its root. Created in init() and
   * deleted again before init() returns.
   */
  protected SerialAnalyserController rootFinderApplication;

  /** Comparator used to sort the token annotations of an entry by offset. */
  protected OffsetComparator offsetComparator;

  /** The ontology whose resources are turned into gazetteer entries. */
  protected Ontology ontology;

  /**
   * Should camelCased words be separated, so that projectName becomes project
   * Name. init() defaults this to true when it is not set.
   */
  protected Boolean separateCamelCasedWords;

  /**
   * Should the resource name taken from the URI (usually called a fragment
   * identifier - the characters after / or #) be considered; for example, if
   * there is a resource with URI
   * http://gate.ac.uk/ns/gate-ontology#POSTagger, should POSTagger be
   * considered or not. init() defaults this to true when it is not set.
   */
  protected Boolean useResourceUri;

  /**
   * Should properties be considered or not; NOTE: if this parameter is set to
   * false, then propertiesToInclude and propertiesToExclude will be ignored.
   * init() defaults this to true when it is not set.
   */
  protected Boolean considerProperties;

  /**
   * A list of lookups that will be created after processing of all relevant
   * data.
   */
  protected List<Lookup> allLookups = new ArrayList<Lookup>();

  /**
   * Hidden corpus attached to the root finder application; it holds the
   * document of the entry that is currently being processed.
   */
  protected Corpus applicationCorpus;

  /**
   * A map of roots: a key is a lookup.list value, e.g. 'projects', and the
   * value is a root of that key, in this case that would be 'project'.
   */
  Map<String, String> listRoots = new HashMap<String, String>();

  /**
   * Should the heuristic rules be followed or not: if true, a few heuristic
   * rules apply to entries containing spaces. For example, when 'pos tagger
   * for spanish' is analysed, 'for' is considered a stop word, so the
   * heuristically derived value is 'pos tagger'; 'pos tagger' is then added to
   * the gazetteer list with heuristic level 0 and 'tagger' with heuristic
   * level 1. At runtime the lower heuristic level should be preferred.
   * init() defaults this to false when it is not set.
   */
  protected Boolean considerHeuristicRules;

  /**
   * Comma separated values of property names that will be considered when
   * initializing the gazetteer.
   */
  protected String propertiesToInclude;

  /**
   * Comma separated values of property names that will be excluded when
   * initializing the gazetteer. NOTE: setting propertiesToInclude to be
   * different from "" automatically means that all properties not in the list
   * will be excluded (in other words, if propertiesToInclude is set, it is not
   * necessary to set propertiesToExclude as all properties not listed in
   * propertiesToInclude will be excluded).
   */
  protected String propertiesToExclude;

  /*****************************************************************************
   * Logger used to record the entries that are added to the gazetteer list.
   * 
   * The logger does not forward to its parent handlers. Handlers are only
   * installed when this class was loaded from a jar file: a file handler
   * writing to a "logs" directory next to that jar, and a console handler
   * whose level comes from CATConstants.LOGGER_OUPUT_LEVEL. Both handlers emit
   * the bare message text of each record.
   * 
   * Every failure while locating or creating the log directory leaves the
   * logger without handlers instead of aborting class initialisation.
   ****************************************************************************/
  private static final Logger logger = Logger.getLogger("OntoRootGaz");
  static {
    logger.setUseParentHandlers(false);
    File logDir = locateLogDirectory();
    if(logDir != null) {
      if(!logDir.exists()) logDir.mkdirs();
      try {
        /* stateless formatter shared by both handlers: message text only */
        Formatter messageOnly = new Formatter() {
          public String format(LogRecord record) {
            return record.getMessage();
          }
        };
        FileHandler logHandler =
                new FileHandler(logDir.getCanonicalPath() + "/"
                        + OntoRootGaz.class.getSimpleName() + "-%u.log", false);
        logHandler.setFormatter(messageOnly);
        logHandler.setLevel(Level.ALL);
        // add the new file handler for everything
        logger.addHandler(logHandler);
        // add the handler for Output messages
        ConsoleHandler outHandler = new ConsoleHandler();
        outHandler.setLevel(Level.parse(CATConstants.LOGGER_OUPUT_LEVEL));
        outHandler.setFormatter(messageOnly);
        logger.addHandler(outHandler);
      } catch(SecurityException e) {
        e.printStackTrace();
      } catch(IOException e) {
        e.printStackTrace();
      }
    }
  }

  /**
   * Finds the directory the log files should be written to.
   * 
   * @return the "logs" directory next to the jar file this class was loaded
   *         from, or null when the class was not loaded from a jar or the jar
   *         location cannot be determined
   */
  private static File locateLogDirectory() {
    String classFileName =
            "/" + OntoRootGaz.class.getCanonicalName().replace('.', '/')
                    + ".class";
    URL classUrl = OntoRootGaz.class.getResource(classFileName);
    if(classUrl == null || !classUrl.getProtocol().equalsIgnoreCase("jar")) {
      return null;
    }
    /* a jar URL path looks like <url of the jar>!<entry inside the jar> */
    String pathStr = classUrl.getPath();
    int separator = pathStr.indexOf('!');
    if(separator < 0) return null;
    try {
      File jarFile =
              new File(new URL(pathStr.substring(0, separator)).getPath());
      File jarDir = jarFile.getParentFile();
      return jarDir == null ? null : new File(jarDir, "logs");
    } catch(MalformedURLException e) {
      e.printStackTrace();
      return null;
    }
  }

  /**
   * Re-initialises this gazetteer by running the complete initialisation
   * again.
   * 
   * @throws ResourceInstantiationException
   *           if init() fails
   */
  public void reInit() throws ResourceInstantiationException {
    init();
  }

  /**
   * Builds the gazetteer from the configured ontology.
   * 
   * The mandatory parameters are validated first. Then a hidden root finder
   * application (tokeniser, sentence splitter, POS tagger, morpher) and a
   * hidden corpus are created, lookups are collected from the property values
   * (when considerProperties is true) and from the resource names (when
   * useResourceUri is true) of the instances, classes and properties of the
   * ontology, and finally all collected lookups are passed to
   * {@link #addLookups(List)}. The temporary GATE resources are released even
   * when the initialisation fails.
   * 
   * @return this resource
   * @throws ResourceInstantiationException
   *           if a mandatory parameter is missing, if the same property is
   *           listed in both propertiesToInclude and propertiesToExclude, or if
   *           the root finder application fails
   */
  public Resource init() throws ResourceInstantiationException {
    /* validate the mandatory parameters before anything is used or created */
    if(tokeniser == null)
      throw new ResourceInstantiationException("No tokeniser provided!");
    if(posTagger == null)
      throw new ResourceInstantiationException(
              "No Part-of-speach Tagger provided!");
    if(morpher == null)
      throw new ResourceInstantiationException(
              "No Morphological Analyzer provided!");
    if(ontology == null)
      throw new ResourceInstantiationException("No ontology provided!");
    if(ontology.getURL() == null)
      throw new ResourceInstantiationException(
              "The provided ontology has no URL!");
    final String ontologyUrl = ontology.getURL().toString();

    logger.info("--------------------------------------\n");
    logger.info(" Initializing gazetteer for ontology from location:\n");
    logger.info(ontologyUrl);
    logger.info("--------------------------------------\n");
    long startedInit = System.currentTimeMillis();

    /* set default values if they are not set already */
    if(useResourceUri == null) useResourceUri = true;
    if(considerProperties == null) considerProperties = true;
    if(separateCamelCasedWords == null) separateCamelCasedWords = true;
    if(considerHeuristicRules == null) considerHeuristicRules = false;

    /* list of namespaces to be ignored when creating the gazetteer list */
    List<String> nsToIgnore = new ArrayList<String>();
    nsToIgnore.add("http://www.w3.org/2002/07/owl#");
    nsToIgnore.add("http://www.w3.org/2000/01/rdf-schema#");
    nsToIgnore.add("http://www.w3.org/1999/02/22-rdf-syntax-ns#");

    try {
      if(sentenceSplitter == null) {
        sentenceSplitter =
                (FakeSentenceSplitter)Factory
                        .createResource("gate.clone.ql.FakeSentenceSplitter");
      }
      Ontology2MapManager.getInstance().addOntologyToIndex(ontology);
      fsmStates = new HashSet();
      initialState = new FSMState(this);
      createRootFinderApplication();
      offsetComparator = new OffsetComparator();

      /*
       * move properties to include and exclude from the CSV strings to actual
       * lists; each parameter is honoured on its own, so setting only one of
       * them is enough
       */
      List<String> propertiesToIncludeList = new ArrayList<String>();
      List<String> propertiesToExcludeList = new ArrayList<String>();
      if(considerProperties) {
        propertiesToIncludeList = parseCsv(propertiesToInclude);
        propertiesToExcludeList = parseCsv(propertiesToExclude);
      }
      /*
       * check validity: if a property is in both the 'to be excluded' and the
       * 'to be included' list throw an exception
       */
      for(String propertyUri : propertiesToExcludeList) {
        if(propertiesToIncludeList.contains(propertyUri))
          throw new ResourceInstantiationException(
                  "You specified that the same property should be both included and "
                          + "excluded!");
      }

      if(considerProperties) {
        collectPropertyLookups(ontologyUrl, propertiesToIncludeList,
                propertiesToExcludeList, nsToIgnore);
      }
      if(useResourceUri) {
        collectUriLookups(ontologyUrl, nsToIgnore);
      }
      addLookups(allLookups);
    } finally {
      releaseInitResources();
    }
    long currentTime = System.currentTimeMillis();
    logger.info("OntoRootGaz initialized for:" + (currentTime - startedInit)
            + " ms");
    return this;
  }

  /**
   * Creates the hidden root finder application together with its hidden corpus
   * and stores them in rootFinderApplication and applicationCorpus.
   */
  private void createRootFinderApplication()
          throws ResourceInstantiationException {
    /* set the hidden feature to true */
    FeatureMap features = Factory.newFeatureMap();
    FeatureMap parameters = Factory.newFeatureMap();
    Gate.setHiddenAttribute(features, true);
    rootFinderApplication =
            (SerialAnalyserController)Factory.createResource(
                    "gate.creole.SerialAnalyserController", parameters,
                    features);
    rootFinderApplication.add(tokeniser);
    rootFinderApplication.add(sentenceSplitter);
    rootFinderApplication.add(posTagger);
    rootFinderApplication.add(morpher);
    /* create a corpus and hide it inside the GATE GUI */
    FeatureMap corpusParams = Factory.newFeatureMap();
    corpusParams.put("name", this.getClass().getCanonicalName());
    FeatureMap corpusFeatures = Factory.newFeatureMap();
    Gate.setHiddenAttribute(corpusFeatures, true);
    applicationCorpus =
            (Corpus)Factory.createResource("gate.corpora.CorpusImpl",
                    corpusParams, corpusFeatures);
    rootFinderApplication.setCorpus(applicationCorpus);
  }

  /**
   * Releases everything init() created temporarily; safe to call when only a
   * part of it has been created.
   */
  private void releaseInitResources() {
    allLookups = new ArrayList<Lookup>();
    if(applicationCorpus != null) {
      Factory.deleteResource(applicationCorpus);
      applicationCorpus = null;
    }
    if(rootFinderApplication != null) {
      /*
       * the processing resources are taken out of the controller before it is
       * deleted - presumably so that deleting the controller does not also
       * dispose of the resources supplied by the caller; confirm against the
       * GATE controller cleanup code
       */
      rootFinderApplication.remove(morpher);
      rootFinderApplication.remove(posTagger);
      if(sentenceSplitter != null) {
        rootFinderApplication.remove(sentenceSplitter);
      }
      rootFinderApplication.remove(tokeniser);
      Factory.deleteResource(rootFinderApplication);
      rootFinderApplication = null;
    }
    if(sentenceSplitter != null) {
      Factory.deleteResource(sentenceSplitter);
      sentenceSplitter = null;
    }
  }

  /**
   * Splits a comma separated string into its trimmed, non-empty items; null
   * gives an empty list.
   */
  private static List<String> parseCsv(String csv) {
    List<String> items = new ArrayList<String>();
    if(csv == null) return items;
    for(String item : csv.split(",")) {
      String trimmed = item.trim();
      if(trimmed.length() > 0) items.add(trimmed);
    }
    return items;
  }

  /**
   * Tells whether a property passes the include filter (empty means
   * everything) and is not rejected by the exclude filter.
   */
  private static boolean isPropertySelected(String propUri,
          List<String> include, List<String> exclude) {
    return (include.size() == 0 || include.contains(propUri))
            && (exclude.size() == 0 || !exclude.contains(propUri));
  }

  /**
   * Creates the feature map shared by all lookups: the type of the ontology
   * resource and its URI.
   */
  private static Map<String, Object> newFeatures(Object resourceType,
          String uri) {
    Map<String, Object> lookupFeatures = new HashMap<String, Object>();
    lookupFeatures.put(CATConstants.ONTORES_TYPE, resourceType);
    lookupFeatures.put(CATConstants.FEATURE_URI, uri);
    return lookupFeatures;
  }

  /**
   * Adds the classes of an instance (the complete set and its first element)
   * to the given features; nothing is added when no class is known.
   */
  private void putInstanceTypes(Map<String, Object> lookupFeatures,
          String ontologyUrl, String instanceUri)
          throws ResourceInstantiationException {
    Set<String> types =
            Ontology2MapManager.getInstance().getOntology2Map(ontologyUrl)
                    .getInstanceTypes().get(instanceUri);
    if(types == null || types.isEmpty()) {
      logger.info("No class found for instance:" + instanceUri + "\n");
      return;
    }
    lookupFeatures.put(CATConstants.CLASS_URI_LIST, types);
    lookupFeatures.put(CATConstants.CLASS_URI, new ArrayList<String>(types)
            .get(0));
  }

  /** Queues a lookup with the given entry text and features in allLookups. */
  private void queueLookup(String entry, Map<String, Object> lookupFeatures) {
    Lookup aLookup = new Lookup(entry, "", null, null);
    aLookup.features = lookupFeatures;
    allLookups.add(aLookup);
  }

  /**
   * Queues one lookup per selected property value of the instances, classes
   * and properties of the ontology.
   */
  private void collectPropertyLookups(String ontologyUrl,
          List<String> include, List<String> exclude, List<String> nsToIgnore)
          throws ResourceInstantiationException {
    /*
     * instances with all set properties, one row per value with 3 columns:
     * instanceUri | propertyUri | propertyValue
     */
    String[] rows =
            Ontology2MapManager.getInstance().getOntology2Map(ontologyUrl)
                    .getListOfInstances().split(CATConstants.NEW_LINE);
    for(String eachRow : rows) {
      String[] columns = eachRow.split("\\|");
      if(columns.length != 3) continue;
      String uri = columns[0].trim();
      try {
        /* the URI object is created for validation purposes */
        URI uriUri = new URI(uri, false);
        String propUri = columns[1].trim();
        if(isPropertySelected(propUri, include, exclude)
                && !nsToIgnore.contains(uriUri.getNameSpace())) {
          String propValue = columns[2].trim();
          Map<String, Object> lookupFeatures =
                  newFeatures(CATConstants.TYPE_INSTANCE, uri);
          lookupFeatures.put(CATConstants.FEATURE_PROPERTY_URI, propUri);
          lookupFeatures.put(CATConstants.FEATURE_PROPERTY_VALUE, propValue);
          putInstanceTypes(lookupFeatures, ontologyUrl, uri);
          queueLookup(propValue, lookupFeatures);
        }
      } catch(InvalidURIException e) {
        logger.info("URI:'" + uri + "' is not valid. Skipping...\n");
      }
    }
    /*
     * classes with all set properties, 3 columns: classUri | propertyUri |
     * propertyValue; the property value is only used as the entry text here
     */
    rows =
            Ontology2MapManager.getInstance().getOntology2Map(ontologyUrl)
                    .getListOfClasses().split(CATConstants.NEW_LINE);
    for(String eachRow : rows) {
      String[] columns = eachRow.split("\\|");
      if(columns.length != 3) continue;
      String uri = columns[0].trim();
      try {
        URI uriUri = new URI(uri, false);
        String propUri = columns[1].trim();
        if(isPropertySelected(propUri, include, exclude)
                && !nsToIgnore.contains(uriUri.getNameSpace())) {
          String propValue = columns[2].trim();
          Map<String, Object> lookupFeatures =
                  newFeatures(CATConstants.TYPE_CLASS, uri);
          lookupFeatures.put(CATConstants.FEATURE_PROPERTY_URI, propUri);
          queueLookup(propValue, lookupFeatures);
        }
      } catch(InvalidURIException e) {
        logger.info("URI:'" + uri + "' is not valid.\n");
      }
    }
    /*
     * properties with all set properties, 3 columns: propertyUri |
     * setPropertyUri | propertyValue
     */
    rows =
            Ontology2MapManager.getInstance().getOntology2Map(ontologyUrl)
                    .getListOfProperties().split(CATConstants.NEW_LINE);
    for(String eachRow : rows) {
      String[] columns = eachRow.split("\\|");
      if(columns.length != 3) continue;
      String uri = columns[0].trim();
      try {
        URI uriUri = new URI(uri, false);
        String propUri = columns[1].trim();
        if(isPropertySelected(propUri, include, exclude)
                && !nsToIgnore.contains(uriUri.getNameSpace())) {
          String propValue = columns[2].trim();
          Map<String, Object> lookupFeatures =
                  newFeatures(CATConstants.TYPE_PROPERTY, uri);
          lookupFeatures.put(CATConstants.FEATURE_PROPERTY_URI, propUri);
          lookupFeatures.put(CATConstants.FEATURE_PROPERTY_VALUE, propValue);
          queueLookup(propValue, lookupFeatures);
        }
      } catch(InvalidURIException e) {
        logger.info("URI:'" + uri + "' is not valid.\n");
      }
    }
  }

  /**
   * Queues one lookup per class, instance and property of the ontology, using
   * the resource name of its URI as the entry text.
   */
  private void collectUriLookups(String ontologyUrl, List<String> nsToIgnore)
          throws ResourceInstantiationException {
    /* class uris */
    String[] rows =
            Ontology2MapManager.getInstance().getOntology2Map(ontologyUrl)
                    .getClassURIs().split(CATConstants.NEW_LINE);
    for(String eachRow : rows) {
      String uri = eachRow.trim();
      try {
        URI uriUri = new URI(uri, false);
        if(!nsToIgnore.contains(uriUri.getNameSpace())) {
          queueLookup(uriUri.getResourceName(), newFeatures(
                  CATConstants.TYPE_CLASS, uri));
        }
      } catch(InvalidURIException e) {
        logger.info("URI:" + uri + " is not valid.\n");
      }
    }
    /* instance uris */
    Set<String> setOfInstanceTypes =
            Ontology2MapManager.getInstance().getOntology2Map(ontologyUrl)
                    .getInstanceTypes().keySet();
    for(String uri : setOfInstanceTypes) {
      try {
        URI uriUri = new URI(uri, false);
        if(!nsToIgnore.contains(uriUri.getNameSpace())) {
          Map<String, Object> lookupFeatures =
                  newFeatures(CATConstants.TYPE_INSTANCE, uri);
          putInstanceTypes(lookupFeatures, ontologyUrl, uri);
          queueLookup(uriUri.getResourceName(), lookupFeatures);
        }
      } catch(InvalidURIException e) {
        logger.info("URI:" + uri + " is not valid.\n");
      }
    }
    /* property uris */
    rows =
            Ontology2MapManager.getInstance().getOntology2Map(ontologyUrl)
                    .getPropertyURIs().split(CATConstants.NEW_LINE);
    for(String eachRow : rows) {
      String uri = eachRow.trim();
      try {
        URI uriUri = new URI(uri, false);
        if(!nsToIgnore.contains(uriUri.getNameSpace())) {
          queueLookup(uriUri.getResourceName(), newFeatures(
                  CATConstants.TYPE_PROPERTY, uri));
        }
      } catch(InvalidURIException e) {
        logger.info("URI:" + uri + " is not valid.\n");
      }
    }
  }

  /**
   * Finds the root of every given lookup and adds the lookups to the gazetteer
   * under their roots.
   * 
   * While the roots are being found, alternative spellings of the entries are
   * queued in additionalList: entries containing "_" are queued with the
   * underscores replaced by spaces, and, if separateCamelCasedWords is true,
   * camelCased entries are queued with their words separated by a space. The
   * queue is processed until it is empty, so spellings that are derived from
   * other derived spellings are added as well.
   * 
   * A lookup whose root contains no space, and every lookup when
   * considerHeuristicRules is not true, is added under its root with
   * heuristic level 0. Otherwise a copy of the lookup is added under the whole
   * root with level 0, and one further copy is added for each suffix obtained
   * by dropping the leading word, with the level increased by one each time;
   * the copies also carry the text they were added under as the heuristic
   * value.
   * 
   * @param lookups
   *          the lookups to be processed and added to the gazetteer
   * @throws ResourceInstantiationException
   *           if the root finder application fails
   */
  protected void addLookups(List<Lookup> lookups)
          throws ResourceInstantiationException {
    List<Lookup> all =
            new ArrayList<Lookup>(runRootFinderApplication(lookups));
    /*
     * a derived spelling contains either no underscore or at least one space,
     * so at most two rounds of derivation are possible and the loop ends
     */
    while(!additionalList.isEmpty()) {
      List<Lookup> pending = additionalList;
      additionalList = new ArrayList<Lookup>();
      all.addAll(runRootFinderApplication(pending));
    }
    boolean useHeuristics = Boolean.TRUE.equals(considerHeuristicRules);
    for(Lookup aLookup : all) {
      String root = listRoots.get(aLookup.list);
      if(root == null) {
        logger.info("root is null for lookup:" + aLookup);
        continue;
      }
      if(!useHeuristics || !root.contains(" ")) {
        aLookup.features.put(CATConstants.FEATURE_HEURISTIC_LEVEL, 0);
        addLookup(root.trim(), aLookup);
        logger.info("NEW ENTRY: " + root + "\n");
        continue;
      }
      /* the whole root gets heuristic level 0 ... */
      int hLevel = 0;
      String remainder = root.trim();
      addLookup(remainder, heuristicCopy(aLookup, hLevel, root));
      logger.info("NEW ENTRY: " + root + "\n");
      /* ... and every shorter suffix gets a level that is one higher */
      int firstIndex = remainder.indexOf(" ");
      while(firstIndex >= 0) {
        /* trim so that the next index refers to the string that is cut */
        remainder = remainder.substring(firstIndex + 1).trim();
        hLevel++;
        addLookup(remainder, heuristicCopy(aLookup, hLevel, remainder));
        logger.info("NEW ENTRY: " + remainder + "\n");
        firstIndex = remainder.indexOf(" ");
      }
    }
  }

  /**
   * Creates a lookup with the same list and a copy of the features of the
   * given one, plus the given heuristic level and heuristic value.
   */
  private Lookup heuristicCopy(Lookup source, int hLevel,
          String heuristicValue) {
    Lookup copy = new Lookup(source.list, "", null, null);
    Map<String, Object> copiedFeatures = new HashMap<String, Object>();
    for(Object key : source.features.keySet()) {
      copiedFeatures.put((String)key, source.features.get(key));
    }
    copy.features = copiedFeatures;
    copy.features.put(CATConstants.FEATURE_HEURISTIC_LEVEL, hLevel);
    copy.features.put(CATConstants.FEATURE_HEURISTIC_VALUE, heuristicValue);
    return copy;
  }

  /**
   * This list is populated during the processing of all lookups, when some
   * entries have multiple interpretations; for example, when processing
   * Project_Name, 'Project_Name' would be handled in the first iteration,
   * while 'Project Name' would be added to the additionalList for later
   * processing.
   */
  List<Lookup> additionalList = new ArrayList<Lookup>();

  /**
   * Finds the root of the entry of every given lookup and records it in
   * listRoots, keyed by the content of the document created for the entry.
   * 
   * Each entry is put into a hidden document which is run through the root
   * finder application; the root of the entry is the concatenation of the
   * 'root' features of its tokens, with every run of space tokens reduced to a
   * single space. If considerHeuristicRules is true, the first token whose
   * category is "IN" (a preposition, treated as a stop word) and everything
   * after it are left out.
   * 
   * Alternative spellings of the entries (underscores replaced by spaces,
   * camelCased words separated) are added to the additionalList as new
   * lookups sharing the features of the original; the caller is expected to
   * process them with this method as well.
   * 
   * @param lookups
   *          the lookups to be processed
   * @return the given lookups without those whose list is null or blank
   * @throws ResourceInstantiationException
   *           if the root finder application fails, if a token has no root or
   *           if an unexpected annotation is found
   */
  private List<Lookup> runRootFinderApplication(List<Lookup> lookups)
          throws ResourceInstantiationException {
    List<Lookup> lookupsToBeReturned = new ArrayList<Lookup>();
    Set<String> tokenTypes = new HashSet<String>();
    tokenTypes.add(ANNIEConstants.TOKEN_ANNOTATION_TYPE);
    tokenTypes.add(ANNIEConstants.SPACE_TOKEN_ANNOTATION_TYPE);
    for(Lookup lookup : lookups) {
      String list = lookup.list;
      if(list == null || list.trim().length() == 0) continue;
      queueDerivedSpellings(lookup);
      lookupsToBeReturned.add(lookup);
      /* set new documents to be hidden inside the GATE GUI */
      FeatureMap docParams = Factory.newFeatureMap();
      docParams.put("stringContent", list);
      FeatureMap docFeatures = Factory.newFeatureMap();
      Gate.setHiddenAttribute(docFeatures, true);
      Document aDocument = null;
      try {
        aDocument =
                (Document)Factory.createResource("gate.corpora.DocumentImpl",
                        docParams, docFeatures);
        applicationCorpus.add(aDocument);
        rootFinderApplication.execute();
        listRoots.put(aDocument.getContent().toString(), buildRoot(aDocument,
                tokenTypes));
      } catch(ExecutionException ee) {
        throw new ResourceInstantiationException(ee);
      } finally {
        /* leave the corpus empty for the next entry, also after a failure */
        applicationCorpus.clear();
        if(aDocument != null) Factory.deleteResource(aDocument);
      }
    }
    return lookupsToBeReturned;
  }

  /**
   * Adds the alternative spellings of the entry of the given lookup to the
   * additionalList.
   */
  private void queueDerivedSpellings(Lookup lookup) {
    String list = lookup.list;
    if(list.contains("_")) {
      Lookup aLookup = new Lookup(list.replace('_', ' '), "", null, null);
      aLookup.features = lookup.features;
      additionalList.add(aLookup);
    }
    // if text is camel cased add space between words
    if(Boolean.TRUE.equals(separateCamelCasedWords) && list.indexOf(" ") < 0) {
      String separatedCamelCase =
              ExpressionFinder.findAndSeparateCamelCases(list,
                      CATConstants.REGEX_CAMEL_CASE, " ");
      if(!list.equals(separatedCamelCase)) {
        Lookup aLookup = new Lookup(separatedCamelCase, "", null, null);
        aLookup.features = lookup.features;
        additionalList.add(aLookup);
      }
    }
  }

  /**
   * Concatenates the roots of the tokens of an already processed document, in
   * offset order.
   */
  private String buildRoot(Document document, Set<String> tokenTypes)
          throws ResourceInstantiationException {
    List<Annotation> tokenList =
            new ArrayList<Annotation>(document.getAnnotations().get(
                    tokenTypes));
    Collections.sort(tokenList, offsetComparator);
    StringBuilder rootForText = new StringBuilder();
    boolean lastAnnWasSpace = false;
    for(Annotation ann : tokenList) {
      String type = ann.getType();
      if(ANNIEConstants.TOKEN_ANNOTATION_TYPE.equals(type)) {
        lastAnnWasSpace = false;
        String category =
                (String)ann.getFeatures().get(
                        ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME);
        /*
         * category "IN" means it is a preposition, and these are used as stop
         * words, so crop everything afterwards, but ONLY if parameter
         * considerHeuristicRules is set to true; a token without a category
         * is not a stop word
         */
        if(Boolean.TRUE.equals(considerHeuristicRules)
                && "IN".equals(category)) {
          break;
        }
        String root = (String)ann.getFeatures().get("root");
        if(root == null) {
          throw new ResourceInstantiationException(
                  "No root found for annotation " + ann.toString());
        }
        rootForText.append(root);
      } else if(ANNIEConstants.SPACE_TOKEN_ANNOTATION_TYPE.equals(type)) {
        if(!lastAnnWasSpace) {
          rootForText.append(' ');
        }
        lastAnnWasSpace = true;
      } else {
        // malfunction
        throw new ResourceInstantiationException("Invalid annotation type: "
                + ann);
      }
    }
    return rootForText.toString();
  }

  /**
   * @return the morphological analyser used by the root finder application
   */
  public Morph getMorpher() {
    return morpher;
  }

  /**
   * @param morpher
   *          the morphological analyser to set; init() fails without one
   */
  public void setMorpher(Morph morpher) {
    this.morpher = morpher;
  }

  /**
   * @return the part-of-speech tagger used by the root finder application
   */
  public POSTagger getPosTagger() {
    return posTagger;
  }

  /**
   * @param posTagger
   *          the part-of-speech tagger to set; init() fails without one
   */
  public void setPosTagger(POSTagger posTagger) {
    this.posTagger = posTagger;
  }

  /**
   * @return the tokeniser used by the root finder application
   */
  public DefaultTokeniser getTokeniser() {
    return tokeniser;
  }

  /**
   * @param tokeniser
   *          the tokeniser to set; init() fails without one
   */
  public void setTokeniser(DefaultTokeniser tokeniser) {
    this.tokeniser = tokeniser;
  }

  /**
   * @return the ontology the gazetteer is built from
   */
  public Ontology getOntology() {
    return ontology;
  }

  /**
   * @param ontology
   *          the ontology to set; init() fails without one
   */
  public void setOntology(Ontology ontology) {
    this.ontology = ontology;
  }

  /**
   * @return the considerProperties
   */
  public Boolean getConsiderProperties() {
    return considerProperties;
  }

  /**
   * @param considerProperties
   *          the considerProperties to set
   */
  public void setConsiderProperties(Boolean considerProperties) {
    this.considerProperties = considerProperties;
  }

  /**
   * @return the useResourceUri
   */
  public Boolean getUseResourceUri() {
    return useResourceUri;
  }

  /**
   * @param useResourceUri
   *          the useResourceUri to set
   */
  public void setUseResourceUri(Boolean useResourceUri) {
    this.useResourceUri = useResourceUri;
  }

  /**
   * @return the separateCamelCasedWords
   */
  public Boolean getSeparateCamelCasedWords() {
    return separateCamelCasedWords;
  }

  /**
   * @param separateCamelCasedWords
   *          the separateCamelCasedWords to set
   */
  public void setSeparateCamelCasedWords(Boolean separateCamelCasedWords) {
    this.separateCamelCasedWords = separateCamelCasedWords;
  }

  /**
   * @return the propertiesToExclude
   */
  public String getPropertiesToExclude() {
    return propertiesToExclude;
  }

  /**
   * @param propertiesToExclude
   *          the propertiesToExclude to set
   */
  public void setPropertiesToExclude(String propertiesToExclude) {
    this.propertiesToExclude = propertiesToExclude;
  }

  /**
   * @return the propertiesToInclude
   */
  public String getPropertiesToInclude() {
    return propertiesToInclude;
  }

  /**
   * @param propertiesToInclude
   *          the propertiesToInclude to set
   */
  public void setPropertiesToInclude(String propertiesToInclude) {
    this.propertiesToInclude = propertiesToInclude;
  }

  /**
   * @return the considerHeuristicRules
   */
  public Boolean getConsiderHeuristicRules() {
    return considerHeuristicRules;
  }

  /**
   * @param considerHeuristicRules
   *          the considerHeuristicRules to set
   */
  public void setConsiderHeuristicRules(Boolean considerHeuristicRules) {
    this.considerHeuristicRules = considerHeuristicRules;
  }

  /**
   * Returns a fresh, empty linear definition on every call. The method only
   * exists so that Gaze does not complain when it renders views and shows the
   * initialisation parameters of this gazetteer.
   * 
   * @return a newly created linear definition
   */
  public LinearDefinition getLinearDefinition() {
    LinearDefinition emptyDefinition = new LinearDefinition();
    return emptyDefinition;
  }
}