/* * WelshMorph.java * This file is part of Welsh Natural Language Toolkit (WNLT) * (see http://gate.ac.uk/), and is free software, licenced under * the GNU Library General Public License, Version 2, June 1991 * */ package wnlt.morph; import gate.Annotation; import gate.AnnotationSet; import gate.Factory; import gate.Factory.DuplicationContext; import gate.FeatureMap; import gate.Gate; import gate.ProcessingResource; import gate.Resource; import gate.Utils; import gate.creole.AbstractLanguageAnalyser; import gate.creole.CustomDuplication; import gate.creole.ExecutionException; import gate.creole.ExecutionInterruptedException; import gate.creole.ResourceInstantiationException; import gate.creole.Transducer; import gate.creole.gazetteer.DefaultGazetteer; import gate.creole.gazetteer.FlexibleGazetteer; import gate.creole.metadata.CreoleParameter; import gate.creole.metadata.CreoleResource; import gate.creole.metadata.Optional; import gate.creole.metadata.RunTime; import gate.event.ProgressListener; import gate.event.StatusListener; import gate.util.Benchmark; import gate.util.Benchmarkable; import gate.util.GateRuntimeException; import java.net.URL; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.apache.log4j.Level; import org.apache.log4j.Logger; import wnlt.LexiconCY; /** * Description: This class is a wrapper for {@link wnlt.morph.Interpret}, * the Morphological Analyzer. The class is based on and modifies for the * purposes of the Welsh Natural Language Toolkit the Morph class * of the GATE Morphological analyser. * * @author Andreas Vlachidis 20/03/2016 * */ @CreoleResource(name = "Welsh Morphological Analyser", comment = "Morphological Analyzer of the Welsh Natural Language Toolkit", icon="welsh_lemmatiser.png") public class WelshMorph extends AbstractLanguageAnalyser implements ProcessingResource, CustomDuplication, Benchmarkable { // note that this package could probably be simplified as the only modified // class from the original is Interpret so we may be able to just use the // existing classes, but for safety we currently use an entire copy, although // this may make bug fixing harder in the future private static final long serialVersionUID = 6964689654685956128L; /** File which contains rules to be processed */ protected URL rulesFile; /** Instance of BaseWord class - Welsh Morpher */ protected Interpret interpret; /** Feature Name that should be displayed for the root word */ protected String rootFeatureName; /** Feature Name that should be displayed for the affix */ protected String affixFeatureName; /** The name of the annotation set used for input */ protected String annotationSetName; /** Boolean value that tells if parser should behave in caseSensitive mode */ protected Boolean caseSensitive; /** Boolean value that checks if the required Part of Speech input is available */ protected Boolean considerPOSTag; /** * If this Morph PR is a duplicate of an existing PR, this property * will hold a reference to the original PR's Interpret instance. */ protected Interpret existingInterpret; /** Lexicon of lemmas : read from an external file */ protected LexiconCY lexicon; /** Path to the lexicon containing word lemmas*/ public static final String TAG_LEXICON_URL_PARAMETER_NAME = "lexiconURL"; /** Encoding of lexicon and gazetteer lists*/ public static final String TAG_ENCODING_PARAMETER_NAME = "encoding"; /** Post process transducer and gazeteer*/ public static final String SPLIT_DOCUMENT_PARAMETER_NAME = "document"; /** Name of input Annotation Set*/ public static final String SPLIT_INPUT_AS_PARAMETER_NAME = "inputASName"; /** Name of output Annotation Set*/ public static final String SPLIT_OUTPUT_AS_PARAMETER_NAME = "outputASName"; /** Path to gazetteer lists file*/ public static final String SPLIT_GAZ_URL_PARAMETER_NAME = "gazetteerListsURL"; /** Path to Post-processing JAPE transducer for mutation behaviour */ public static final String SPLIT_TRANSD_URL_PARAMETER_NAME = "transducerURL"; /** Path to validation JAPE transducer of proposed mutations*/ public static final String MORPH_VALIDTRANSD_URL_PARAMETER_NAME = "validationTransducerURL"; @RunTime @Optional @CreoleParameter( comment = "Throw and exception when there are none of the required input annotations", defaultValue = "true") public void setFailOnMissingInputAnnotations(Boolean fail) { failOnMissingInputAnnotations = fail; } public Boolean getFailOnMissingInputAnnotations() { return failOnMissingInputAnnotations; } protected Boolean failOnMissingInputAnnotations = false; protected Logger logger = Logger.getLogger(this.getClass().getName()); /** Default Constructor */ public WelshMorph() { } /** * This method creates the instance of the BaseWord - Welsh Morpher and * returns the instance of current class with different attributes and * the instance of BaseWord class wrapped into it. * The method also instantiates the post-processing transducer, * and mutation validation gazetteer and transducer * @return Resource * @throws ResourceInstantiationException */ @Override public Resource init() throws ResourceInstantiationException { interpret = new Interpret(); if(existingInterpret != null) { interpret.init(existingInterpret); } else { if (rulesFile == null) { // no rule file is there, simply run the interpret to interpret it and throw new ResourceInstantiationException("\n\n No Rule File Provided"); } fireStatusChanged("Reading Rule File..."); // compile the rules interpret.init(rulesFile, getLexiconURL(), getEncoding()); fireStatusChanged("Morpher created!"); fireProcessFinished(); } //create Transducer and Gazetteer FeatureMap params; FeatureMap features; params = Factory.newFeatureMap(); if(gazetteerListsURL != null) params.put(DefaultGazetteer.DEF_GAZ_LISTS_URL_PARAMETER_NAME, gazetteerListsURL); params.put(DefaultGazetteer.DEF_GAZ_ENCODING_PARAMETER_NAME, encoding); params.put(DefaultGazetteer.DEF_GAZ_CASE_SENSITIVE_PARAMETER_NAME, "false"); if (gazetteer == null) { //gazetteer fireStatusChanged("Creating the gazetteer"); features = Factory.newFeatureMap(); Gate.setHiddenAttribute(features, true); gazetteer = (DefaultGazetteer)Factory.createResource( "gate.creole.gazetteer.DefaultGazetteer", params, features); gazetteer.setName("Gazetteer " + System.currentTimeMillis()); } else { gazetteer.setParameterValues(params); gazetteer.reInit(); } //create a flexible gazetteer params = Factory.newFeatureMap(); List<String> inputFeatures = new ArrayList<String>(); String rootFeature = "Token." + this.getRootFeatureName(); inputFeatures.add(rootFeature); //Token.altLemma is used for additional cases of mutation e.g (f) from (m or b) inputFeatures.add("Token.altLemma"); params.put("inputFeatureNames", inputFeatures); params.put("gazetteerInst",gazetteer); if (flexigazetteer == null) { fireStatusChanged("Creating the Flexible Gazetteer"); features = Factory.newFeatureMap(); Gate.setHiddenAttribute(features, true); flexigazetteer = (FlexibleGazetteer) Factory.createResource( "gate.creole.gazetteer.FlexibleGazetteer", params, features); flexigazetteer.setName("FlexibleGazetteer " + System.currentTimeMillis()); } else { flexigazetteer.setParameterValues(params); flexigazetteer.reInit(); } fireProgressChanged(10); params = Factory.newFeatureMap(); if(transducerURL != null) params.put(Transducer.TRANSD_GRAMMAR_URL_PARAMETER_NAME, transducerURL); params.put(Transducer.TRANSD_ENCODING_PARAMETER_NAME, encoding); if (transducer == null) { //transducer fireStatusChanged("Creating the JAPE transducer"); features = Factory.newFeatureMap(); Gate.setHiddenAttribute(features, true); transducer = (Transducer)Factory.createResource( "gate.creole.Transducer", params, features); transducer.setName("Transducer " + System.currentTimeMillis()); } else { transducer.setParameterValues(params); transducer.reInit(); } params = Factory.newFeatureMap(); if(transducerURL != null) params.put(Transducer.TRANSD_GRAMMAR_URL_PARAMETER_NAME, validationTransducerURL); params.put(Transducer.TRANSD_ENCODING_PARAMETER_NAME, encoding); if (validator == null) { //transducer fireStatusChanged("Creating the JAPE validator transducer"); features = Factory.newFeatureMap(); Gate.setHiddenAttribute(features, true); validator = (Transducer)Factory.createResource( "gate.creole.Transducer", params, features); validator.setName("Transducer validator " + System.currentTimeMillis()); } else { validator.setParameterValues(params); validator.reInit(); } fireProgressChanged(100); fireProcessFinished(); return this; } /** * Method is executed after the init() method has finished its execution. * <BR>Method does the following operations: * <OL type="1"> * <LI> creates the annotationSet * <LI> fetches word tokens from the document, one at a time * <LI> runs the morpher on each individual word token * <LI> finds the root and the affix for that word * <LI> adds them as features to the current token * @throws ExecutionException */ @Override public void cleanup() { Factory.deleteResource(transducer); } @Override public void execute() throws ExecutionException { // lets start the progress and initialize the progress counter fireProgressChanged(0); // If no document provided to process throw an exception if (document == null) { fireProcessFinished(); throw new GateRuntimeException("No document to process!"); } // get the annotationSet name provided by the user, or otherwise use the // default method AnnotationSet inputAs = (annotationSetName == null || annotationSetName.length() == 0) ? document.getAnnotations() : document.getAnnotations(annotationSetName); // Morpher requires tokenizer to be run before running the Morpher // Fetch tokens from the document AnnotationSet tokens = inputAs.get(TOKEN_ANNOTATION_TYPE); if (tokens == null || tokens.isEmpty()) { fireProcessFinished(); if(failOnMissingInputAnnotations) { throw new ExecutionException("Either "+document.getName()+" does not have any contents or \n run the POS Tagger first and then Morpher"); } else { Utils.logOnce(logger,Level.INFO,"Morphological analyser: either a document does not have any contents or run the POS Tagger first - see debug log for details."); logger.debug("No input annotations in document "+document.getName()); return; } //javax.swing.JOptionPane.showMessageDialog(MainFrame.getInstance(), "Either "+document.getName()+" does not have any contents or \n run the POS Tagger first and then Morpher"); ; //return; } // create iterator to get access to each and every individual token Iterator<Annotation> tokensIter = tokens.iterator(); // variables used to keep track on progress int tokenSize = tokens.size(); int tokensProcessed = 0; int lastReport = 0; //lets process each token one at a time while (tokensIter != null && tokensIter.hasNext()) { Annotation currentToken = tokensIter.next(); String tokenValue = (String) (currentToken.getFeatures(). get(TOKEN_STRING_FEATURE_NAME)); if(considerPOSTag != null && considerPOSTag.booleanValue() && !currentToken.getFeatures().containsKey(TOKEN_CATEGORY_FEATURE_NAME)) { fireProcessFinished(); if(failOnMissingInputAnnotations) { throw new ExecutionException("please run the POS Tagger first and then Morpher"); } else { Utils.logOnce(logger,Level.INFO,"Morphological analyser: no input annotations, run the POS Tagger first - see debug log for details."); logger.debug("No input annotations in document "+document.getName()); return; } //javax.swing.JOptionPane.showMessageDialog(MainFrame.getInstance(), "please run the POS Tagger first and then Morpher"); ; //return; } String posCategory = (String) (currentToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME)); if(posCategory == null) { posCategory = "*"; } if(considerPOSTag == null || !considerPOSTag.booleanValue()) { posCategory = "*"; } // run the Morpher if(!caseSensitive.booleanValue()) { tokenValue = tokenValue.toLowerCase(); } String baseWord = interpret.runMorpher(tokenValue, posCategory); String affixWord = interpret.getAffix(); // no need to add affix feature if it is null if (affixWord != null) { currentToken.getFeatures().put(affixFeatureName, affixWord); } // add the root word as a feature currentToken.getFeatures().put(rootFeatureName, baseWord); // measure the progress and update every after 100 tokens tokensProcessed++; if(tokensProcessed - lastReport > 100){ lastReport = tokensProcessed; fireProgressChanged(tokensProcessed * 100 /tokenSize); } } //execute Transducer and Gazetteer interrupted = false; //set the runtime parameters FeatureMap params; if(inputASName != null && inputASName.equals("")) inputASName = null; if(outputASName != null && outputASName.equals("")) outputASName = null; try{ fireProgressChanged(0); params = Factory.newFeatureMap(); params.put(DefaultGazetteer.DEF_GAZ_DOCUMENT_PARAMETER_NAME, document); params.put(DefaultGazetteer.DEF_GAZ_ANNOT_SET_PARAMETER_NAME, inputASName); gazetteer.setParameterValues(params); params = Factory.newFeatureMap(); params.put(Transducer.TRANSD_DOCUMENT_PARAMETER_NAME, document); params.put(Transducer.TRANSD_INPUT_AS_PARAMETER_NAME, inputASName); params.put(Transducer.TRANSD_OUTPUT_AS_PARAMETER_NAME, inputASName); transducer.setParameterValues(params); }catch(Exception e){ throw new ExecutionException(e); } ProgressListener pListener = null; StatusListener sListener = null; fireProgressChanged(5); //run the gazetteer if(isInterrupted()) throw new ExecutionInterruptedException( "The execution of the \"" + getName() + "\" morphological analyser has been abruptly interrupted!"); pListener = new IntervalProgressListener(5, 10); sListener = new StatusListener(){ @Override public void statusChanged(String text){ fireStatusChanged(text); } }; //run the transducer if(isInterrupted()) throw new ExecutionInterruptedException( "The execution of the \"" + getName() + "\" morphological analyser has been abruptly interrupted!"); pListener = new IntervalProgressListener(11, 90); transducer.addProgressListener(pListener); transducer.addStatusListener(sListener); Benchmark.executeWithBenchmarking(transducer, Benchmark.createBenchmarkId("MorphTransducer", getBenchmarkId()), this, null); transducer.removeProgressListener(pListener); transducer.removeStatusListener(sListener); // end execute Transducer and Gazetteer if(isInterrupted()) throw new ExecutionInterruptedException( "The execution of the \"" + getName() + "\" morphological analyser has been abruptly interrupted!"); pListener = new IntervalProgressListener(50, 100); transducer.addProgressListener(pListener); transducer.addStatusListener(sListener); Benchmark.executeWithBenchmarking(transducer, Benchmark.createBenchmarkId("transducer", getBenchmarkId()), this, null); transducer.removeProgressListener(pListener); transducer.removeStatusListener(sListener); //End Transducer Execute // Execute Flexi Gazetteer flexigazetteer.addProgressListener(pListener); flexigazetteer.addStatusListener(sListener); flexigazetteer.setDocument(document); flexigazetteer.execute(); flexigazetteer.removeProgressListener(pListener); flexigazetteer.removeStatusListener(sListener); validator.setDocument(document); validator.execute(); // process finished, acknowledge user about this. fireProcessFinished(); } /** * This method should only be called after init() * @param word * @return the rootWord */ public String findBaseWord(String word, String cat) { return interpret.runMorpher(word, cat); } /** * This method should only be called after init() * @param word * @return the afix of the rootWord */ public String findAffix(String word, String cat) { interpret.runMorpher(word, cat); return interpret.getAffix(); } /** * Sets the rule file to be processed * @param rulesFile - rule File name to be processed */ @CreoleParameter(comment = "File which defines rules for the morphological analysis", defaultValue = "resources/morph/default.rul") public void setRulesFile(URL rulesFile) { this.rulesFile = rulesFile; } /** * Returns the document under process */ public URL getRulesFile() { return this.rulesFile; } /** * Returns the feature name that has been currently set to display the root * word */ public String getRootFeatureName() { return rootFeatureName; } /** * Sets the feature name that should be displayed for the root word * @param rootFeatureName */ @RunTime @CreoleParameter(comment="Name of the variable which shows the root word",defaultValue="lemma") public void setRootFeatureName(String rootFeatureName) { this.rootFeatureName = rootFeatureName; } /** * Returns the feature name that has been currently set to display the affix * word */ public String getAffixFeatureName() { return affixFeatureName; } /** * Sets the feature name that should be displayed for the affix * @param affixFeatureName */ @RunTime @CreoleParameter(comment="Name of the affix variable", defaultValue="affix") public void setAffixFeatureName(String affixFeatureName) { this.affixFeatureName = affixFeatureName; } /** * Returns the name of the AnnotationSet that has been provided to create * the AnnotationSet */ public String getAnnotationSetName() { return annotationSetName; } /** * Sets the AnnonationSet name, that is used to create the AnnotationSet * @param annotationSetName */ @RunTime @Optional @CreoleParameter(comment="The name of the annotation set used for input") public void setAnnotationSetName(String annotationSetName) { this.annotationSetName = annotationSetName; } /** * A method which returns if the parser is in caseSenstive mode * @return a {@link Boolean} value. */ public Boolean getCaseSensitive() { return this.caseSensitive; } /** * Sets the caseSensitive value, that is used to tell parser if it should * convert document to lowercase before parsing */ @CreoleParameter(comment="If parser should be converted to lowercase first", defaultValue="false") public void setCaseSensitive(java.lang.Boolean value) { this.caseSensitive = value; } /** * A method which returns if Part of Speech input is present * @return a {@link Boolean} value. */ public Boolean getConsiderPOSTag() { return this.considerPOSTag; } /** * Sets the result of checking for Part of Speech input availability */ @RunTime @CreoleParameter(comment="If parser should consider POS Tag prior to running Morph", defaultValue="true") public void setConsiderPOSTag(Boolean value) { this.considerPOSTag = value; } /** * Only for use by the duplication mechanism. */ public void setExistingInterpret(Interpret existingInterpret) { this.existingInterpret = existingInterpret; } /** * Duplicate this morpher, sharing the compiled regular expression * patterns and finite state machine with the duplicate. */ @Override public Resource duplicate(DuplicationContext ctx) throws ResourceInstantiationException { String className = this.getClass().getName(); String resName = this.getName(); FeatureMap initParams = getInitParameterValues(); initParams.put("existingInterpret", interpret); Resource res = Factory.createResource(className, initParams, this.getFeatures(), resName); res.setParameterValues(getRuntimeParameterValues()); return res; } /** * Sets the location of the lexicon responsible for providing word lemmas */ @Optional @CreoleParameter(comment="The URL to the lexicon file", defaultValue="resources/morph/lexicon") public void setLexiconURL(java.net.URL lexiconURL) { this.lexiconURL = lexiconURL; } public java.net.URL getLexiconURL() { return this.lexiconURL; } protected URL lexiconURL; /** * Sets the encoding of the lexicon responsible for providing word lemmas */ @Optional @CreoleParameter(comment="The encoding used for lexicon", defaultValue="UTF-8") public void setEncoding(String encoding) { this.encoding = encoding; } public String getEncoding() { return this.encoding; } protected String encoding; /** * Notifies all the PRs in this controller that they should stop their * execution as soon as possible. */ @Override public synchronized void interrupt(){ interrupted = true; gazetteer.interrupt(); transducer.interrupt(); } /** * Sets the location of post-processing transducer */ @CreoleParameter(defaultValue="resources/morph/grammar/postprocess.jape", comment="The URL to the custom Jape grammar file", suffixes="jape") public void setTransducerURL(java.net.URL newTransducerURL) { transducerURL = newTransducerURL; } /** * Returns the location of post-processing transducer */ public java.net.URL getTransducerURL() { return transducerURL; } /** * Sets the location of mutations validation transducer */ @CreoleParameter(defaultValue="resources/morph/grammar/validation-main.jape", comment="The URL to the custom Jape grammar file", suffixes="jape") public void setValidationTransducerURL(java.net.URL newValidationTransducerURL) { validationTransducerURL = newValidationTransducerURL; } /** * Returns the location of mutations validation transducer */ public java.net.URL getValidationTransducerURL() { return validationTransducerURL; } DefaultGazetteer gazetteer; Transducer transducer; Transducer validator; FlexibleGazetteer flexigazetteer; private java.net.URL transducerURL; private java.net.URL validationTransducerURL; private java.net.URL gazetteerListsURL; /** * Sets the location of gazetteer list used for validating mutations suggested by post-processing */ @Optional @CreoleParameter(defaultValue="resources/morph/gazetteer/lists.def", comment="The URL to the custom list lookup definition file", suffixes="def") public void setGazetteerListsURL(java.net.URL newGazetteerListsURL) { gazetteerListsURL = newGazetteerListsURL; } public java.net.URL getGazetteerListsURL() { return gazetteerListsURL; } //end transducer and gazetteer private String benchmarkId; /* (non-Javadoc) * @see gate.util.Benchmarkable#setBenchmarkId(java.lang.String) */ @Override public void setBenchmarkId(String benchmarkId) { this.benchmarkId = benchmarkId; } /* (non-Javadoc) * @see gate.util.Benchmarkable#getBenchmarkId() */ @Override public String getBenchmarkId() { if(benchmarkId == null) { return getName(); } else { return benchmarkId; } } private String inputASName; private String outputASName; }