/* * Interpret.java * This file is part of Welsh Natural Language Toolkit (WNLT) * (see http://gate.ac.uk/), and is free software, licenced under * the GNU Library General Public License, Version 2, June 1991 * */ package wnlt.morph; import gate.creole.ResourceInstantiationException; import java.lang.reflect.Method; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.Comparator; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; import java.util.regex.Pattern; import wnlt.LexiconCY; /** * <p> * Title: Interpret.java * </p> * <p> * Description: This is the main class which which should be invoked to load the * rule file in the system and then to execute the program to find the root word * and the affix to it. The class has been modified for the purposes of the * Welsh Natural Language ToolkitS * </p> */ public class Interpret { /** * instance of the ReadFile class which reads the file and stores each line * of the given program in the arraylist which can be read using different * methods of the ReadFile class */ private ReadFile file; /** Boolean variables to keep track on which section is being read */ private boolean isDefineVarSession, isDefineRulesSession; /** Instance of Storage class, which is used store all the variables details */ private Storage variables; /** This variables keeps the record of available methods for the morphing */ private Method[] methods; /** This variables holds the affix */ private String affix; private Pattern vPat = Pattern.compile("((VB)[DGNPZ]?)|(MD)"); private Pattern nPat = Pattern.compile("(NN)[PMFS]?"); MorphFunctions morphInst; List<Pattern> patterns = new ArrayList<Pattern>(); List<List<CharClass>> fsms = new ArrayList<List<CharClass>>(); /** * The initial state of the FSM that backs this morpher */ protected FSMState initialState; //protected Set lastStates; /** Lexicon of lemma input read from an external file */ LexiconCY lexicon; /** * It starts the actual program */ public void init(URL ruleFileURL, URL lexiconURL, String encoding) throws ResourceInstantiationException { variables = new Storage(); prepareListOfMorphMethods(); file = new ReadFile(ruleFileURL); affix = null; isDefineRulesSession = false; isDefineVarSession = false; morphInst = new MorphFunctions(); readProgram(); initialState = new FSMState(-1); //lastStates = new HashSet(); interpretProgram(); variables = null; file = null; //lastStates = null; //Instantiate lexicon if(lexiconURL == null){ throw new ResourceInstantiationException( "NoURL provided for the lexicon!"); } try{ this.lexicon = new LexiconCY(lexiconURL, encoding); }catch(Exception e){ throw new ResourceInstantiationException(e); } } /** * Initialize this Interpret by copying pointers to the sharable state * of an existing Interpret instance. */ public void init(Interpret existingInterpret) { affix = null; isDefineRulesSession = false; isDefineVarSession = false; morphInst = new MorphFunctions(); // copy shared state fsms = existingInterpret.fsms; patterns = existingInterpret.patterns; initialState = existingInterpret.initialState; lexicon = existingInterpret.lexicon; } class CharClass { char ch; FSMState st; } public void addState(char ch, FSMState fsm, int index) { if(index == fsms.size()) { fsms.add(new ArrayList<CharClass>()); } List<CharClass> fs = fsms.get(index); for(int i=0;i<fs.size();i++) { CharClass cc = fs.get(i); if(cc.ch == ch) return; } CharClass cc = new CharClass(); cc.ch = ch; cc.st = fsm; fs.add(cc); } public FSMState getState(char ch, int index) { if(index >= fsms.size()) return null; List<CharClass> fs = fsms.get(index); for(int i=0;i<fs.size();i++) { CharClass cc = fs.get(i); if(cc.ch == ch) return cc.st; } return null; } private Set<FSMState> getStates(char ch, Set<FSMState> states) { Set<FSMState> newStates = new HashSet<FSMState>(); Iterator<FSMState> iter = states.iterator(); while (iter.hasNext()) { FSMState st = iter.next(); FSMState chState = st.next(ch, FSMState.CHILD_STATE); if (chState != null) { newStates.add(chState); } FSMState adState = st.next(ch, FSMState.ADJ_STATE); if (adState != null) { newStates.add(adState); } } return newStates; } private boolean validCategory(String category) { if (category.equals("*")) { return true; } else if (vPat.matcher(category).matches()) { return true; } else if (nPat.matcher(category).matches()) { return true; } return false; } /** * @return set of the Lookups associated with the parameter */ public String runMorpher(String word, String category) { String[] result = null; String lexiconResult =""; List<String> lemmas = lexicon.get(word.toLowerCase()); if(lemmas != null){ result = new String[lemmas.size()]; for(int i = 0; i < result.length; i++){ result[i] = lemmas.get(i); lexiconResult += lemmas.get(i); } } // Condition - if there is no match in lexicon then check the rules file. if (lexiconResult.isEmpty()) { affix = null; if(!validCategory(category)) { return word; } foundRule = false; Set<FSMState> states = new HashSet<FSMState>(); states.add(initialState); for (int i = 0; i < word.length(); i++) { char ch = word.charAt(i); states = getStates(ch, states); if (states.isEmpty()) { return word; } } // we have all states here // we obtain all RHSes SortedSet<RHS> rhses = new TreeSet<RHS>(new Comparator<RHS>() { @Override public int compare(RHS r1, RHS r2) { return r1.getPatternIndex() - r2.getPatternIndex(); } }); Iterator<FSMState> iter = states.iterator(); while (iter.hasNext()) { FSMState st = iter.next(); rhses.addAll(st.getRHSes()); } if (rhses.isEmpty()) { return word; } return executeRHSes(rhses, word, category); } else { return lexiconResult; } } protected int patternIndex = -1; public int getPatternIndex() { return patternIndex; } protected String executeRHSes(SortedSet<RHS> rhses, String word, String category) { foundRule = false; // rhses are in sorted order // we need to check if the word is compatible with pattern Iterator<RHS> rhsiter = rhses.iterator(); while (rhsiter.hasNext()){ RHS r1 = rhsiter.next(); String answer = executeRHS(word, category, r1); if (foundRule) { patternIndex = r1.getPatternIndex(); return answer; } } return word; } protected boolean foundRule = false; protected String executeRHS(String word, String category, RHS rhs) { if (category.equals("*")) { return executeRule(word, rhs); } else if (rhs.isVerb() && vPat.matcher(category).matches()) { return executeRule(word, rhs); } else if (rhs.isNoun() && nPat.matcher(category).matches()) { return executeRule(word, rhs); } return word; } private String executeRule(String word, RHS rhs) { Pattern p = patterns.get(rhs.getPatternIndex()); short methodIndex = rhs.getMethodIndex(); if (!p.matcher(word).matches()) { foundRule = false; return word; } // call the appropriate function String[] parameters = rhs.getParameters(); // set the given word in that morph program morphInst.setInput(word); String answer = null; switch (methodIndex) { case ParsingFunctions.IRREG_STEM: answer = morphInst.irreg_stem(parameters[0], parameters[1]); break; case ParsingFunctions.NULL_STEM: answer = morphInst.null_stem(); break; case ParsingFunctions.SEMIREG_STEM: answer = morphInst.semi_reg_stem(Integer.parseInt(parameters[0]), parameters[1]); break; case ParsingFunctions.STEM: answer = morphInst.stem(Integer.parseInt(parameters[0]), parameters[1], parameters[2]); break; default: answer = null; break; } if(answer != null) { this.affix = morphInst.getAffix(); foundRule = true; return answer; } else { foundRule = false; return word; } } /** * This method prepares the list of available methods in the MorphFunctions * class */ private void prepareListOfMorphMethods() throws ResourceInstantiationException { methods = MorphFunctions.class.getDeclaredMethods(); } /** * read the program file */ private void readProgram() throws ResourceInstantiationException { // read the program file boolean readStatus = file.read(); // check if read was success if (!readStatus) { // not it wasn't so simply display the message and ask user to check // it generateError("Some errors reading program file.. please check the" + "program and try again"); } } /** * This method reads each line of the program and interpret them */ private void interpretProgram() throws ResourceInstantiationException { // read each line and parse it while (file.hasNext()) { String currentLine = file.getNext(); if (currentLine == null || currentLine.trim().length() == 0) { continue; } // remove all the leading spaces currentLine = currentLine.trim(); /* * if commandType is 0 ==> defineVars command if commandType is 1 * ==> defineRules command if commandType is 2 ==> variable * declaration if commandType is 3 ==> rule declaration otherwise // * unknown generate error */ int commandType = findCommandType(currentLine); switch (commandType) { case -1: // comment command continue; case 0: // defineVars command defineVarsCommand(); break; case 1: // defineRules command defineRulesCommand(); break; case 2: // variable declaration variableDeclarationCommand(currentLine); break; case 3: // rule declaration ruleDeclarationCommand(currentLine); break; default: generateError("Syntax Error at line " + file.getPointer() + " : " + currentLine); break; } } // end while } /** * This method interprets the line and finds out the type of command and * returns the integer indicating the type of the command * * @param line * The program command to be interpreted * @return and <tt>int</tt> value */ private int findCommandType(String line) { // check for the comment command if (line.substring(0, 2).equals("//") || line.charAt(0) == '#') { return -1; } else if (line.equals("defineVars")) { return 0; } else if (line.equals("defineRules")) { return 1; } else if (isDefineVarSession && line.split("==>").length == 2) { return 2; } else if (isDefineRulesSession && /* * (line.charAt(0) == '{' || line.charAt(0) == '[' || line.charAt(0) == * '(' || line.charAt(0) == '\"') */(line.charAt(0) == '<') && line.split("==>").length == 2) { return 3; } else { return Codes.ERROR_CODE; } } /** * This method processes the command to define the variable section */ private void defineVarsCommand() throws ResourceInstantiationException { // variable section can only be defined once if (isDefineVarSession) { generateError("Variable Section already defined - " + "see line " + file.getPointer()); } else if (isDefineRulesSession) { generateError("Variable section must be declared before the Rule " + "Section - see line " + file.getPointer()); } else { isDefineVarSession = true; } } /** * This method processes the command to define the rule section */ private void defineRulesCommand() throws ResourceInstantiationException { if (isDefineRulesSession) { generateError("Rule Section already defined - see " + "line " + file.getPointer()); } else { isDefineVarSession = false; isDefineRulesSession = true; } } /** * This method processes the command to declare the variable * * @param line */ private void variableDeclarationCommand(String line) throws ResourceInstantiationException { // ok so first find the variable name and the value for it String varName = (line.split("==>"))[0].trim(); String varValue = (line.split("==>"))[1].trim(); // find the type of variable it is int valueType = ParsingFunctions.findVariableType(varValue .trim()); if (valueType == Codes.ERROR_CODE) { generateError(varName + " - Variable Syntax Error - see " + "line" + file.getPointer() + " : " + line); } // based on the variable type create the instance Variable varInst = null; switch (valueType) { case Codes.CHARACTER_RANGE_CODE: varInst = new CharacterRange(); break; case Codes.CHARACTER_SET_CODE: varInst = new CharacterSet(); break; case Codes.STRING_SET_CODE: varInst = new StringSet(); break; } // set the values in the variable if (!varInst.set(varName, varValue)) { generateError(varName + " - Syntax Error while assigning value to the " + "variable - see line" + file.getPointer() + " : " + line); } // and finally add the variable in if (!variables.add(varName, varInst.getPattern())) { generateError(varName.trim() + " - Variable already defined - see " + "line " + file.getPointer() + " : " + line); } varInst.resetPointer(); } /** * This method processes the command to declare the rule * * @param line */ private void ruleDeclarationCommand(String line) throws ResourceInstantiationException { // lets divide the rule into two parts // LHS and RHS. // LHS is a part which requires to be parsed and // RHS should be checked for the legal function name and valid arguments // we process RHS first and then the LHS String[] ruleParts = line.split("==>"); if (ruleParts.length != 2) { generateError("Error in declaring rule at line : " + file.getPointer() + " : " + line); } // now check if the method which has been called in this rule actually // available in the MorphFunction Class String methodCalled = ruleParts[1].trim(); if (!isMethodAvailable(methodCalled)) { // no method is not available so print the syntax error generateError("Syntax error - method does not exists - see " + "line " + file.getPointer() + " : " + line); } // so RHS part is Ok // now we need to check if LHS is written properly // and convert it to the pattern that is recognized by the java String category = ""; // we need to find out the category int i = 1; for (; i < ruleParts[0].length(); i++) { if (ruleParts[0].charAt(i) == '>') break; category = category + ruleParts[0].charAt(i); } if (i >= ruleParts[0].length()) { generateError("Syntax error - pattern not written properly - see " + "line " + file.getPointer() + " : " + line); } RHS rhs = new RHS(ruleParts[1], category, (short)patterns.size()); ruleParts[0] = ruleParts[0].substring(i + 1, ruleParts[0].length()) .trim(); String regExp = ParsingFunctions.convertToRegExp( ruleParts[0], variables); patterns.add(Pattern.compile(regExp)); String[] rules = ParsingFunctions.normlizePattern(regExp); for (int m = 0; m < rules.length; m++) { Set<Set<FSMState>> lss = new HashSet<Set<FSMState>>(); lss.clear(); Set<FSMState> newSet = new HashSet<FSMState>(); newSet.add(initialState); lss.add(newSet); PatternPart parts[] = ParsingFunctions .getPatternParts(rules[m].trim()); for (int j = 0; j < parts.length; j++) { lss = ParsingFunctions.createFSMs(parts[j].getPartString(), parts[j].getType(), lss, this); } Iterator<Set<FSMState>> iter = lss.iterator(); while (iter.hasNext()) { Set<FSMState> set = iter.next(); Iterator<FSMState> subIter = set.iterator(); while (subIter.hasNext()) { FSMState st = subIter.next(); st.addRHS(rhs); } } } //drawFSM(); } @SuppressWarnings("unused") private Set<FSMState> intersect(Set<FSMState> a, Set<FSMState> b) { Set<FSMState> result = new HashSet<FSMState>(); Iterator<FSMState> iter = a.iterator(); while (iter.hasNext()) { FSMState st = iter.next(); if (b.contains(st)) { result.add(st); } } return result; } @SuppressWarnings("unused") private void drawFSM() { // we start with initialState System.out.println("Initial:"); String space = ""; drawFSM(initialState, space); } private void drawFSM(FSMState st, String space) { CharMap map = st.getTransitionFunction(); char[] keys = map.getItemsKeys(); if (keys != null) { System.out.println(space + "Child:"); for (int i = 0; i < keys.length; i++) { System.out.println(space + "'" + keys[i] + "':"); drawFSM(map.get(keys[i], FSMState.CHILD_STATE), space + " "); } } keys = map.getAdjitemsKeys(); if (keys != null) { System.out.println("ADJ:"); for (int i = 0; i < keys.length; i++) { System.out.println(space + "'" + keys[i] + "' :"); // drawFSM(map.get(keys[i], FSMState.ADJ_STATE), space+" "); } } } /** * This method takes a method signature and searches if the method * * @param method * @return a <tt>boolean</tt> value. */ private boolean isMethodAvailable(String method) { // now first find the name of the method // their parameters and their types int index = method.indexOf("("); if (index == -1 || index == 0 || method.charAt(method.length() - 1) != ')') { return false; } String methodName = method.substring(0, index); // now get the parameters String[] parameters; int[] userMethodParams; String arguments = method.substring(index + 1, method.length() - 1); if (arguments == null || arguments.trim().length() == 0) { parameters = null; userMethodParams = null; } else { parameters = method.substring(index + 1, method.length() - 1) .split(","); userMethodParams = new int[parameters.length]; } // find the parameter types // here we define only three types of arguments // String, boolean and int if (parameters != null) { for (int i = 0; i < parameters.length; i++) { if (parameters[i].startsWith("\"") && parameters[i].endsWith("\"")) { userMethodParams[i] = 7; parameters[i] = "java.lang.String"; continue; } else if (ParsingFunctions.isBoolean(parameters[i])) { userMethodParams[i] = 6; parameters[i] = "boolean"; } else if (ParsingFunctions.isInteger(parameters[i])) { userMethodParams[i] = 2; parameters[i] = "int"; } else { // type cannot be recognized so generate error return false; } } } // now parameters have been found, so check them with the available // methods // in the morph function for (int i = 0; i < methods.length; i++) { if (methods[i].getName().equals(methodName)) { // yes method has found now check for the parameters // compatibility Class<?>[] methodParams = methods[i].getParameterTypes(); // first check for the number of parameters if (methods[i].getName().equals("null_stem")) { return true; } if (methodParams.length == parameters.length) { // yes arity has matched // now set the precedence int[] paramPrecedence = new int[methodParams.length]; // assign precedence for (int j = 0; j < methodParams.length; j++) { if (methodParams[j].getName() .equals("java.lang.String")) paramPrecedence[j] = 7; else if (methodParams[j].getName().equals("boolean")) paramPrecedence[j] = 6; else if (methodParams[j].getName().equals("int")) paramPrecedence[j] = 2; else return false; } // if we are here that means all the type matched // so valid method declaration return true; } } } // if we are here that means method doesnot found return false; } /** * Generates the error and stop the execution * * @param mess - * message to be displayed as an error on the standard output */ private void generateError(String mess) throws ResourceInstantiationException { System.out.println("\n\n" + mess); System.out.println("Program terminated..."); throw new ResourceInstantiationException("\n\n" + mess); } /** * Main method * * @param args */ public static void main(String[] args) throws ResourceInstantiationException { if (args == null || args.length < 3) { System.out .println("Usage : Interpret <Rules fileName> <word> <POS>"); System.exit(-1); } Interpret interpret = new Interpret(); try { interpret.init(new URL(args[0]), new URL(args[0]), new String(args[0])); } catch (MalformedURLException mue) { throw new RuntimeException(mue); } String rootWord = interpret.runMorpher(args[1], args[2]); String affix = interpret.getAffix(); System.out.println("Root : " + rootWord); System.out.println("affix : " + affix); } /** * This method tells what was the affix to the provided word * * @return affix */ public String getAffix() { return this.affix; } public FSMState getInitialState() { return initialState; } }