// GATE.ac.uk - gate/plugins/Lang_Welsh/src/wnlt/morph/WelshMorph.java

/*
 *  WelshMorph.java
 *  This file is part of Welsh Natural Language Toolkit (WNLT)
 *  (see http://gate.ac.uk/), and is free software, licenced under 
 *  the GNU Library General Public License, Version 2, June 1991
 *  
 */

package wnlt.morph;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Factory;
import gate.Factory.DuplicationContext;
import gate.FeatureMap;
import gate.Gate;
import gate.ProcessingResource;
import gate.Resource;
import gate.Utils;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.CustomDuplication;
import gate.creole.ExecutionException;
import gate.creole.ExecutionInterruptedException;
import gate.creole.ResourceInstantiationException;
import gate.creole.Transducer;
import gate.creole.gazetteer.DefaultGazetteer;
import gate.creole.gazetteer.FlexibleGazetteer;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.event.ProgressListener;
import gate.event.StatusListener;
import gate.util.Benchmark;
import gate.util.Benchmarkable;
import gate.util.GateRuntimeException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import wnlt.LexiconCY;



/**
 * Description: This class is a wrapper for {@link wnlt.morph.Interpret},
 * the Morphological Analyzer. It is based on the Morph class of the GATE
 * Morphological Analyser, modified for the purposes of the Welsh Natural
 * Language Toolkit.
 * 
 * @author Andreas Vlachidis 20/03/2016
 * 
 */

@CreoleResource(name = "Welsh Morphological Analyser",
        comment = "Morphological Analyzer of the Welsh Natural Language Toolkit", icon="welsh_lemmatiser.png")
public class WelshMorph
    extends AbstractLanguageAnalyser
    implements ProcessingResource, CustomDuplication, Benchmarkable {

  // note that this package could probably be simplified as the only modified
  // class from the original is Interpret so we may be able to just use the
  // existing classes, but for safety we currently use an entire copy, although
  // this may make bug fixing harder in the future
  
  
  private static final long serialVersionUID = 6964689654685956128L;

  /**
   * URL of the file containing the morphological rules. init() passes it to
   * the Interpret instance unless an existing Interpret is being shared.
   */
  protected URL rulesFile;

  /** The Interpret instance that performs the morphological analysis */
  protected Interpret interpret;

  /** Name of the Token feature that receives the root word */
  protected String rootFeatureName;

  /** Name of the Token feature that receives the affix */
  protected String affixFeatureName;

  /**
   * Name of the annotation set the Token annotations are read from;
   * execute() treats null or an empty string as the default annotation set
   */
  protected String annotationSetName;

  /**
   * Case-sensitivity switch: when it is false, execute() lower-cases each
   * token string before it is handed to the morpher
   */
  protected Boolean caseSensitive;
  
  /**
   * Whether the Part of Speech category of each token is passed to the
   * morpher; when it is false or null, execute() uses the wildcard "*" instead
   */
  protected Boolean considerPOSTag;
  
  /**
   * If this Morph PR is a duplicate of an existing PR, this property
   * will hold a reference to the original PR's Interpret instance.
   */
  protected Interpret existingInterpret;
  
  /**
   * Lexicon of lemmas. No method visible in this file reads or assigns this
   * field; init() passes the lexicon URL directly to the Interpret instance.
   */
  protected LexiconCY lexicon;
  
  /** Parameter name constant: URL of the lexicon containing word lemmas (matches setLexiconURL) */
  public static final String
  TAG_LEXICON_URL_PARAMETER_NAME = "lexiconURL";
  
  /** Parameter name constant: encoding of the lexicon and gazetteer lists (matches setEncoding) */
  public static final String
  TAG_ENCODING_PARAMETER_NAME = "encoding";
  
  /** Parameter name constant: the document to be processed */
  public static final String
  SPLIT_DOCUMENT_PARAMETER_NAME = "document";
  
  /**
   * Parameter name constant: input annotation set.
   * NOTE(review): no setInputASName method is visible in this file - confirm
   * whether this constant is still needed.
   */
  public static final String
  SPLIT_INPUT_AS_PARAMETER_NAME = "inputASName";
  
  /**
   * Parameter name constant: output annotation set.
   * NOTE(review): no setOutputASName method is visible in this file - confirm
   * whether this constant is still needed.
   */
  public static final String
  SPLIT_OUTPUT_AS_PARAMETER_NAME = "outputASName";
  
  /** Parameter name constant: URL of the gazetteer lists definition file (matches setGazetteerListsURL) */
  public static final String
  SPLIT_GAZ_URL_PARAMETER_NAME = "gazetteerListsURL";
  
  /** Parameter name constant: URL of the post-processing JAPE transducer for mutation behaviour (matches setTransducerURL) */
  public static final String
  SPLIT_TRANSD_URL_PARAMETER_NAME = "transducerURL";
  
  /** Parameter name constant: URL of the JAPE transducer that validates proposed mutations (matches setValidationTransducerURL) */
  public static final String
  MORPH_VALIDTRANSD_URL_PARAMETER_NAME = "validationTransducerURL";
	
  /**
   * Sets how {@link #execute()} reacts when the required input annotations
   * are missing: when true an ExecutionException is thrown, otherwise the
   * problem is logged and the document is left unprocessed.
   * @param fail true to fail with an exception, false to log and return
   */
  @RunTime
  @Optional
  @CreoleParameter(
    comment = "Throw an exception when there are none of the required input annotations",
    defaultValue = "true")  
  public void setFailOnMissingInputAnnotations(Boolean fail) {
    failOnMissingInputAnnotations = fail;
  }

  /**
   * Returns the current reaction to missing input annotations.
   * @return true if execute() throws an exception, false if it only logs
   */
  public Boolean getFailOnMissingInputAnnotations() {
    return failOnMissingInputAnnotations;
  }

  /**
   * Backing field of the parameter. The initial value here is false, whereas
   * the default declared on the parameter annotation is "true".
   */
  protected Boolean failOnMissingInputAnnotations = false;
  
  /** Logger named after the runtime class of this instance. */
  protected Logger logger = Logger.getLogger(getClass().getName());

  /** No-argument constructor; it performs no set-up of its own. */
  public WelshMorph() {
    super();
  }

  /**
   * Initialises the analyser and the hidden helper resources it drives.
   * <p>
   * The {@link Interpret} instance is either initialised from
   * {@link #existingInterpret} (set when this PR is a duplicate) or from
   * {@link #rulesFile}, the lexicon URL and the encoding. The method then
   * creates - or, when they already exist, re-initialises - the gazetteer,
   * the flexible gazetteer that wraps it, the post-processing transducer and
   * the mutation validation transducer.
   *
   * @return this resource
   * @throws ResourceInstantiationException if no rule file was provided, or if
   *         a helper resource cannot be created or re-initialised
   */
  @Override
  public Resource init() throws ResourceInstantiationException {
    interpret = new Interpret();
    if(existingInterpret != null) {
      // duplicate: reuse the state of the original PR's Interpret
      interpret.init(existingInterpret);
    }
    else {
      if (rulesFile == null) {
        throw new ResourceInstantiationException("\n\n No Rule File Provided");
      }

      fireStatusChanged("Reading Rule File...");
      // compile the rules
      interpret.init(rulesFile, getLexiconURL(), getEncoding());
      fireStatusChanged("Morpher created!");
      fireProcessFinished();
    }

    FeatureMap params;
    FeatureMap features;

    // gazetteer holding the lists used to validate proposed mutations
    params = Factory.newFeatureMap();
    if(gazetteerListsURL != null) {
      params.put(DefaultGazetteer.DEF_GAZ_LISTS_URL_PARAMETER_NAME,
              gazetteerListsURL);
    }
    params.put(DefaultGazetteer.DEF_GAZ_ENCODING_PARAMETER_NAME, encoding);
    params.put(DefaultGazetteer.DEF_GAZ_CASE_SENSITIVE_PARAMETER_NAME, "false");

    if (gazetteer == null) {
      fireStatusChanged("Creating the gazetteer");
      features = Factory.newFeatureMap();
      Gate.setHiddenAttribute(features, true);

      gazetteer = (DefaultGazetteer)Factory.createResource(
              "gate.creole.gazetteer.DefaultGazetteer",
              params, features);
      gazetteer.setName("Gazetteer " + System.currentTimeMillis());
    }
    else {
      gazetteer.setParameterValues(params);
      gazetteer.reInit();
    }

    // flexible gazetteer: looks up the root feature (and the alternative
    // lemma) of each Token instead of the document text
    params = Factory.newFeatureMap();
    List<String> inputFeatures = new ArrayList<String>();
    inputFeatures.add("Token." + this.getRootFeatureName());
    //Token.altLemma is used for additional cases of mutation e.g (f) from (m or b)
    inputFeatures.add("Token.altLemma");
    params.put("inputFeatureNames", inputFeatures);
    params.put("gazetteerInst", gazetteer);
    if (flexigazetteer == null) {
      fireStatusChanged("Creating the Flexible Gazetteer");
      features = Factory.newFeatureMap();
      Gate.setHiddenAttribute(features, true);
      flexigazetteer = (FlexibleGazetteer) Factory.createResource(
              "gate.creole.gazetteer.FlexibleGazetteer", params, features);
      flexigazetteer.setName("FlexibleGazetteer " + System.currentTimeMillis());
    }
    else {
      flexigazetteer.setParameterValues(params);
      flexigazetteer.reInit();
    }

    fireProgressChanged(10);

    // post-processing grammar
    transducer = createOrReInitTransducer(transducer, transducerURL,
            "Creating the JAPE transducer", "Transducer ");

    // validation grammar: guarded by its OWN URL (the previous code tested
    // transducerURL here, so the two URLs could be confused)
    validator = createOrReInitTransducer(validator, validationTransducerURL,
            "Creating the JAPE validator transducer", "Transducer validator ");

    fireProgressChanged(100);
    fireProcessFinished();

    return this;
  }

  /**
   * Creates a hidden JAPE transducer for the given grammar, or re-initialises
   * the supplied one with that grammar.
   *
   * @param existing the transducer to re-initialise, or null to create one
   * @param grammarURL URL of the JAPE grammar; not set as a parameter when null
   * @param statusMessage status text fired when a new transducer is created
   * @param namePrefix prefix of the generated resource name
   * @return the created or re-initialised transducer
   * @throws ResourceInstantiationException if creation or re-initialisation fails
   */
  private Transducer createOrReInitTransducer(Transducer existing,
          URL grammarURL, String statusMessage, String namePrefix)
          throws ResourceInstantiationException {
    FeatureMap params = Factory.newFeatureMap();
    if(grammarURL != null) {
      params.put(Transducer.TRANSD_GRAMMAR_URL_PARAMETER_NAME, grammarURL);
    }
    params.put(Transducer.TRANSD_ENCODING_PARAMETER_NAME, encoding);

    if (existing != null) {
      existing.setParameterValues(params);
      existing.reInit();
      return existing;
    }

    fireStatusChanged(statusMessage);
    FeatureMap features = Factory.newFeatureMap();
    Gate.setHiddenAttribute(features, true);
    Transducer created = (Transducer)Factory.createResource(
            "gate.creole.Transducer",
            params, features);
    created.setName(namePrefix + System.currentTimeMillis());
    return created;
  }

  /**
   * Releases the hidden helper resources held by this PR: the flexible
   * gazetteer, the gazetteer it wraps, the post-processing transducer and the
   * validation transducer.
   * <p>
   * Helpers that are null (never created) are skipped, and each field is
   * cleared after deletion so that a later init() creates a fresh helper
   * instead of re-initialising a deleted one.
   */
  @Override
  public void cleanup() {
    // the wrapper is released before the gazetteer it delegates to
    if(flexigazetteer != null) {
      Factory.deleteResource(flexigazetteer);
      flexigazetteer = null;
    }
    if(gazetteer != null) {
      Factory.deleteResource(gazetteer);
      gazetteer = null;
    }
    if(transducer != null) {
      Factory.deleteResource(transducer);
      transducer = null;
    }
    if(validator != null) {
      Factory.deleteResource(validator);
      validator = null;
    }
  }
  
  /**
   * Lemmatises the Token annotations of the current document and then runs
   * the post-processing resources created by init().
   * <BR>The method does the following operations:
   * <OL type="1">
   * <LI> resolves the input annotation set (null or empty name = default set)
   * <LI> fetches the Token annotations from it
   * <LI> runs the morpher on each token string, lower-cased first unless
   *      caseSensitive is true
   * <LI> stores the root word, and the affix when there is one, as features
   *      of the token
   * <LI> runs the post-processing transducer (twice, as before), the flexible
   *      gazetteer and the validation transducer on the same annotation set
   * </OL>
   * NOTE(review): the post-processing transducer is executed two times in a
   * row under different benchmark ids. This is kept unchanged because it
   * cannot be determined from this file whether the second pass is required -
   * confirm against the grammar.
   *
   * @throws ExecutionException if required input annotations are missing and
   *         failOnMissingInputAnnotations is true, if the helper resources
   *         cannot be configured or fail, or (as ExecutionInterruptedException)
   *         if the execution was interrupted
   * @throws GateRuntimeException if no document has been set
   */
  @Override
  public void execute() throws ExecutionException {
    // reset the flag up front so that an interrupt() issued while the tokens
    // are being analysed is still seen by the checks further down
    interrupted = false;
    fireProgressChanged(0);

    // If no document provided to process throw an exception
    if (document == null) {
      fireProcessFinished();
      throw new GateRuntimeException("No document to process!");
    }

    // null-safe views of the Boolean parameters
    boolean failOnMissing = Boolean.TRUE.equals(failOnMissingInputAnnotations);
    boolean usePosTag = Boolean.TRUE.equals(considerPOSTag);
    boolean keepCase = Boolean.TRUE.equals(caseSensitive);

    // an empty set name means the default annotation set
    String tokenSetName = (annotationSetName == null ||
        annotationSetName.length() == 0) ? null : annotationSetName;
    AnnotationSet inputAs = (tokenSetName == null) ?
        document.getAnnotations() :
        document.getAnnotations(tokenSetName);

    // Morpher requires tokenizer to be run before running the Morpher
    AnnotationSet tokens = inputAs.get(TOKEN_ANNOTATION_TYPE);
    if (tokens == null || tokens.isEmpty()) {
      fireProcessFinished();
      if(failOnMissing) {
        throw new ExecutionException("Either "+document.getName()+" does not have any contents or \n run the POS Tagger first and then Morpher");
      }
      Utils.logOnce(logger,Level.INFO,"Morphological analyser: either a document does not have any contents or run the POS Tagger first - see debug log for details.");
      logger.debug("No input annotations in document "+document.getName());
      return;
    }

    // variables used to keep track on progress
    int tokenSize = tokens.size();
    int tokensProcessed = 0;
    int lastReport = 0;

    //lets process each token one at a time
    for (Annotation currentToken : tokens) {
      FeatureMap tokenFeatures = currentToken.getFeatures();

      if(usePosTag && !tokenFeatures.containsKey(TOKEN_CATEGORY_FEATURE_NAME)) {
        fireProcessFinished();
        if(failOnMissing) {
          throw new ExecutionException("please run the POS Tagger first and then Morpher");
        }
        Utils.logOnce(logger,Level.INFO,"Morphological analyser: no input annotations, run the POS Tagger first - see debug log for details.");
        logger.debug("No input annotations in document "+document.getName());
        return;
      }

      String tokenValue = (String) tokenFeatures.get(TOKEN_STRING_FEATURE_NAME);
      // a token without a string feature cannot be analysed; it is left
      // untouched instead of failing with a NullPointerException
      if(tokenValue != null) {
        // the wildcard is used when POS tags are ignored or the tag is null
        String posCategory = usePosTag ?
            (String) tokenFeatures.get(TOKEN_CATEGORY_FEATURE_NAME) : null;
        if(posCategory == null) {
          posCategory = "*";
        }

        if(!keepCase) {
          tokenValue = tokenValue.toLowerCase();
        }

        // run the Morpher
        String baseWord = interpret.runMorpher(tokenValue, posCategory);
        String affixWord = interpret.getAffix();

        // no need to add affix feature if it is null
        if (affixWord != null) {
          tokenFeatures.put(affixFeatureName, affixWord);
        }
        // add the root word as a feature
        tokenFeatures.put(rootFeatureName, baseWord);
      }

      // measure the progress and update every after 100 tokens
      tokensProcessed++;
      if(tokensProcessed - lastReport > 100){
        lastReport = tokensProcessed;
        fireProgressChanged(tokensProcessed * 100 /tokenSize);
      }
    }

    // The post-processing resources must work on the annotation set the
    // tokens were read from. inputASName/outputASName take precedence when
    // they are set, otherwise the token set is used.
    if(inputASName != null && inputASName.equals("")) inputASName = null;
    if(outputASName != null && outputASName.equals("")) outputASName = null;
    String postInputSet = (inputASName != null) ? inputASName : tokenSetName;
    String postOutputSet = (outputASName != null) ? outputASName : postInputSet;

    //set the runtime parameters
    try{
      fireProgressChanged(0);
      FeatureMap params = Factory.newFeatureMap();
      params.put(DefaultGazetteer.DEF_GAZ_DOCUMENT_PARAMETER_NAME, document);
      params.put(DefaultGazetteer.DEF_GAZ_ANNOT_SET_PARAMETER_NAME, postInputSet);
      gazetteer.setParameterValues(params);

      params = Factory.newFeatureMap();
      params.put(Transducer.TRANSD_DOCUMENT_PARAMETER_NAME, document);
      params.put(Transducer.TRANSD_INPUT_AS_PARAMETER_NAME, postInputSet);
      params.put(Transducer.TRANSD_OUTPUT_AS_PARAMETER_NAME, postOutputSet);
      transducer.setParameterValues(params);

      params = Factory.newFeatureMap();
      params.put(Transducer.TRANSD_DOCUMENT_PARAMETER_NAME, document);
      params.put(Transducer.TRANSD_INPUT_AS_PARAMETER_NAME, postInputSet);
      params.put(Transducer.TRANSD_OUTPUT_AS_PARAMETER_NAME, postOutputSet);
      validator.setParameterValues(params);
    }catch(Exception e){
      throw new ExecutionException(e);
    }
    fireProgressChanged(5);

    // relays the status messages of the helper resources to our listeners
    StatusListener statusRelay = new StatusListener(){
      @Override
      public void statusChanged(String text){
        fireStatusChanged(text);
      }
    };

    // first transducer pass
    throwIfMorphInterrupted();
    runPostProcessingPass("MorphTransducer",
            new IntervalProgressListener(11, 90), statusRelay);

    // second transducer pass (see NOTE(review) in the method comment)
    throwIfMorphInterrupted();
    runPostProcessingPass("transducer",
            new IntervalProgressListener(50, 100), statusRelay);

    // Execute Flexi Gazetteer
    throwIfMorphInterrupted();
    ProgressListener gazetteerProgress = new IntervalProgressListener(50, 100);
    flexigazetteer.addProgressListener(gazetteerProgress);
    flexigazetteer.addStatusListener(statusRelay);
    try {
      flexigazetteer.setDocument(document);
      flexigazetteer.setInputASName(postInputSet);
      flexigazetteer.setOutputASName(postOutputSet);
      flexigazetteer.execute();
    } finally {
      // always detach, so a failed run does not leave listeners behind
      flexigazetteer.removeProgressListener(gazetteerProgress);
      flexigazetteer.removeStatusListener(statusRelay);
    }

    // validate the proposed mutations
    throwIfMorphInterrupted();
    validator.execute();

    // process finished, acknowledge user about this.
    fireProcessFinished();
  }

  /**
   * Runs the post-processing transducer once under the given benchmark name,
   * attaching the listeners for the duration of the run only.
   *
   * @param benchmarkName name used to build the benchmark id of this pass
   * @param progress progress listener attached while the pass runs
   * @param status status listener attached while the pass runs
   * @throws ExecutionException if the transducer fails
   */
  private void runPostProcessingPass(String benchmarkName,
          ProgressListener progress, StatusListener status)
          throws ExecutionException {
    transducer.addProgressListener(progress);
    transducer.addStatusListener(status);
    try {
      Benchmark.executeWithBenchmarking(transducer,
              Benchmark.createBenchmarkId(benchmarkName,
                      getBenchmarkId()), this, null);
    } finally {
      // always detach, so a failed run does not leave listeners behind
      transducer.removeProgressListener(progress);
      transducer.removeStatusListener(status);
    }
  }

  /**
   * Throws if an interruption of this PR has been requested.
   *
   * @throws ExecutionInterruptedException if isInterrupted() returns true
   */
  private void throwIfMorphInterrupted() throws ExecutionInterruptedException {
    if(isInterrupted()) throw new ExecutionInterruptedException(
        "The execution of the \"" + getName() +
        "\" morphological analyser has been abruptly interrupted!");
  }

  /**
   * Runs the morpher on a single word. Requires init() to have been called,
   * because it uses the Interpret instance created there.
   * @param word the word to analyse
   * @param cat the category passed to the morpher together with the word
   * @return the value returned by the morpher, i.e. the root word
   */
  public String findBaseWord(String word, String cat) {
    final String rootWord = interpret.runMorpher(word, cat);
    return rootWord;
  }

  /**
   * Runs the morpher on a single word and reports the affix it recorded.
   * Requires init() to have been called, because it uses the Interpret
   * instance created there.
   * @param word the word to analyse
   * @param cat the category passed to the morpher together with the word
   * @return the affix reported by the morpher after analysing the word
   */
  public String findAffix(String word, String cat) {
    final Interpret morpher = interpret;
    // the root itself is not needed here; the call only updates the affix
    morpher.runMorpher(word, cat);
    return morpher.getAffix();
  }

  /**
   * Specifies the file whose rules init() hands to the morpher.
   * @param rulesFile URL of the rule file
   */
  @CreoleParameter(comment = "File which defines rules for the morphological analysis", defaultValue = "resources/morph/default.rul")
  public void setRulesFile(final URL rulesFile) {
    this.rulesFile = rulesFile;
  }

  /**
   * Returns the URL of the rule file.
   * @return the value last given to setRulesFile, or null if none was set
   */
  public URL getRulesFile() {
    return rulesFile;
  }

  /**
   * Returns the name of the Token feature in which the root word is stored.
   * @return the root feature name
   */
  public String getRootFeatureName() {
    return this.rootFeatureName;
  }

  /**
   * Chooses the Token feature in which execute() stores the root word.
   * init() also reads this name when it builds the input feature list of the
   * flexible gazetteer.
   * @param rootFeatureName name of the feature that receives the root word
   */
  @RunTime
  @CreoleParameter(comment="Name of the variable which shows the root word",defaultValue="lemma")
  public void setRootFeatureName(final String rootFeatureName) {
    this.rootFeatureName = rootFeatureName;
  }

  /**
   * Returns the name of the Token feature in which the affix is stored.
   * @return the affix feature name
   */
  public String getAffixFeatureName() {
    return this.affixFeatureName;
  }

  /**
   * Chooses the Token feature in which execute() stores the affix, when the
   * morpher reports one.
   * @param affixFeatureName name of the feature that receives the affix
   */
  @RunTime
  @CreoleParameter(comment="Name of the affix variable", defaultValue="affix")
  public void setAffixFeatureName(final String affixFeatureName) {
    this.affixFeatureName = affixFeatureName;
  }

  /**
   * Returns the name of the annotation set from which the Token annotations
   * are read.
   * @return the set name; null or empty stands for the default annotation set
   */
  public String getAnnotationSetName() {
    return this.annotationSetName;
  }

  /**
   * Selects the annotation set from which execute() reads the Token
   * annotations.
   * @param annotationSetName the set name; null or empty selects the default
   *        annotation set
   */
  @RunTime
  @Optional
  @CreoleParameter(comment="The name of the annotation set used for input")
  public void setAnnotationSetName(final String annotationSetName) {
    this.annotationSetName = annotationSetName;
  }

  /**
   * Reports whether the analyser works in case-sensitive mode.
   * @return the configured {@link Boolean} value
   */
  public Boolean getCaseSensitive() {
    return caseSensitive;
  }

  /**
   * Switches case-sensitive mode on or off. When it is off, execute()
   * lower-cases every token string before it is analysed.
   * @param value true to analyse token strings as they are
   */
  @CreoleParameter(comment="If parser should be converted to lowercase first", defaultValue="false")
  public void setCaseSensitive(final Boolean value) {
    caseSensitive = value;
  }
  
  /**
   * Reports whether the Part of Speech category of each token is taken into
   * account.
   * @return the configured {@link Boolean} value
   */
  public Boolean getConsiderPOSTag() {
    return considerPOSTag;
  }

  /**
   * Decides whether execute() requires a Part of Speech category on every
   * token and passes it to the morpher; otherwise the wildcard "*" is used.
   * @param value true to require and use the category feature
   */
  @RunTime
  @CreoleParameter(comment="If parser should consider POS Tag prior to running Morph", defaultValue="true")
  public void setConsiderPOSTag(final Boolean value) {
    considerPOSTag = value;
  }
  
  /**
   * Hands over the Interpret instance of the PR this one is duplicated from;
   * init() then initialises its own Interpret from it instead of reading the
   * rule file. Intended for the duplication mechanism only.
   * @param existingInterpret the Interpret instance of the original PR
   */
  public void setExistingInterpret(final Interpret existingInterpret) {
    this.existingInterpret = existingInterpret;
  }

  /**
   * Creates a copy of this morpher that shares the already initialised
   * Interpret (compiled patterns and finite state machine) with the original.
   * The copy is created with this resource's init parameters, features and
   * name, and then receives its runtime parameter values.
   * @param ctx the duplication context (not used by this implementation)
   * @return the newly created duplicate
   * @throws ResourceInstantiationException if the duplicate cannot be created
   *         or configured
   */
  @Override
  public Resource duplicate(DuplicationContext ctx)
          throws ResourceInstantiationException {
    FeatureMap creationParams = getInitParameterValues();
    // lets the duplicate's init() reuse our Interpret
    creationParams.put("existingInterpret", interpret);

    Resource copy = Factory.createResource(
            getClass().getName(), creationParams, getFeatures(), getName());
    copy.setParameterValues(getRuntimeParameterValues());
    return copy;
  }
  /** URL of the lexicon file; init() passes it to the Interpret instance. */
  protected URL lexiconURL;

  /**
   * Sets the location of the lexicon that provides the word lemmas.
   * @param lexiconURL URL of the lexicon file
   */
  @Optional
  @CreoleParameter(comment="The URL to the lexicon file", defaultValue="resources/morph/lexicon")
  public void setLexiconURL(final URL lexiconURL) {
    this.lexiconURL = lexiconURL;
  }

  /**
   * Returns the location of the lexicon that provides the word lemmas.
   * @return URL of the lexicon file
   */
  public URL getLexiconURL() {
    return lexiconURL;
  }
  
  /**
   * Character encoding; init() uses it for the lexicon, the gazetteer and
   * both transducers.
   */
  protected String encoding;

  /**
   * Sets the character encoding used when the resources are loaded.
   * @param encoding name of the encoding
   */
  @Optional
  @CreoleParameter(comment="The encoding used for lexicon", defaultValue="UTF-8")
  public void setEncoding(final String encoding) {
    this.encoding = encoding;
  }

  /**
   * Returns the character encoding used when the resources are loaded.
   * @return name of the encoding
   */
  public String getEncoding() {
    return encoding;
  }
  
  /**
   * Requests that this PR and every helper resource it drives stop their
   * execution as soon as possible.
   * <p>
   * The flag of this PR is set first, then the interruption is forwarded to
   * the gazetteer, the flexible gazetteer, the post-processing transducer and
   * the validation transducer. Helpers that are still null are skipped, so
   * the method is safe to call before init().
   */
  @Override
  public synchronized void interrupt(){
    interrupted = true;
    if(gazetteer != null) gazetteer.interrupt();
    if(flexigazetteer != null) flexigazetteer.interrupt();
    if(transducer != null) transducer.interrupt();
    if(validator != null) validator.interrupt();
  }
  
  /**
   * Sets the location of the JAPE grammar of the post-processing transducer.
   * @param newTransducerURL URL of the grammar file
   */
  @CreoleParameter(defaultValue="resources/morph/grammar/postprocess.jape", comment="The URL to the custom Jape grammar file", suffixes="jape")
  public void setTransducerURL(final URL newTransducerURL) {
    this.transducerURL = newTransducerURL;
  }

  /**
   * Returns the location of the JAPE grammar of the post-processing
   * transducer.
   * @return URL of the grammar file
   */
  public URL getTransducerURL() {
    return this.transducerURL;
  }
  
  /**
   * Sets the location of the JAPE grammar of the mutation validation
   * transducer.
   * @param newValidationTransducerURL URL of the grammar file
   */
  @CreoleParameter(defaultValue="resources/morph/grammar/validation-main.jape", comment="The URL to the custom Jape grammar file", suffixes="jape")
  public void setValidationTransducerURL(final URL newValidationTransducerURL) {
    this.validationTransducerURL = newValidationTransducerURL;
  }

  /**
   * Returns the location of the JAPE grammar of the mutation validation
   * transducer.
   * @return URL of the grammar file
   */
  public URL getValidationTransducerURL() {
    return this.validationTransducerURL;
  }
  
  /** Gazetteer created (or re-initialised) by init() and handed to the flexible gazetteer */
  DefaultGazetteer gazetteer;
  /** Post-processing JAPE transducer created (or re-initialised) by init() */
  Transducer transducer;
  /** JAPE transducer that validates proposed mutations; created (or re-initialised) by init() */
  Transducer validator;
  /** Flexible gazetteer wrapping {@link #gazetteer}; created (or re-initialised) by init() */
  FlexibleGazetteer flexigazetteer;
  /** URL of the post-processing JAPE grammar */
  private java.net.URL transducerURL;
  /** URL of the validation JAPE grammar */
  private java.net.URL validationTransducerURL;
  /** URL of the gazetteer lists definition file */
  private java.net.URL gazetteerListsURL;
  
  /**
   * Sets the location of the gazetteer lists definition used to validate the
   * mutations suggested by post-processing.
   * @param newGazetteerListsURL URL of the lists definition file
   */
  @Optional
  @CreoleParameter(defaultValue="resources/morph/gazetteer/lists.def", comment="The URL to the custom list lookup definition file", suffixes="def")
  public void setGazetteerListsURL(final URL newGazetteerListsURL) {
    this.gazetteerListsURL = newGazetteerListsURL;
  }

  /**
   * Returns the location of the gazetteer lists definition.
   * @return URL of the lists definition file
   */
  public URL getGazetteerListsURL() {
    return this.gazetteerListsURL;
  }
  //end transducer and gazetteer 
  
  /** Explicitly assigned benchmark id; null until setBenchmarkId is called. */
  private String benchmarkId;

  /**
   * Stores the benchmark id of this PR.
   * @param benchmarkId the id to use
   * @see gate.util.Benchmarkable#setBenchmarkId(java.lang.String)
   */
  @Override
  public void setBenchmarkId(final String benchmarkId) {
    this.benchmarkId = benchmarkId;
  }
  
  /**
   * Returns the benchmark id of this PR.
   * @return the id given to setBenchmarkId, or the resource name when no id
   *         has been assigned
   * @see gate.util.Benchmarkable#getBenchmarkId()
   */
  @Override
  public String getBenchmarkId() {
    return (benchmarkId != null) ? benchmarkId : getName();
  }
  /**
   * Annotation set names consulted by execute() when it configures the
   * post-processing resources. No setter for either field is visible in this
   * file. NOTE(review): confirm whether they are meant to be configurable.
   */
  private String inputASName;
  private String outputASName;
}