package wnlt.morph; import gate.creole.ResourceInstantiationException; import java.io.BufferedReader; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.List; public class PatternParser { public static void main(String[] args) { try { BufferedReader in = new BufferedReader(new InputStreamReader( System.in)); variableDeclarationCommand("A ==> [abcdefghijklmnopqrstuvwxyz0123456789-]"); variableDeclarationCommand("V ==> [aeiou]"); variableDeclarationCommand("VI ==> [aeiouy]"); variableDeclarationCommand("C ==> [bcdfghjklmnpqrstvwxyz]"); variableDeclarationCommand("CX ==> [bcdfghjklmnpqrstvwxz]"); variableDeclarationCommand("CX2 ==> \"bb\" OR \"cc\" OR \"dd\" OR \"ff\" OR \"gg\" OR \"hh\" OR \"jj\" OR \"kk\" OR \"ll\" OR \"mm\" OR \"nn\" OR \"pp\" OR \"qq\" OR \"rr\" OR \"ss\" OR \"tt\" OR \"vv\" OR \"ww\" OR \"xx\" OR \"zz\""); variableDeclarationCommand("CX2S ==> \"ff\" OR \"ss\" OR \"zz\""); variableDeclarationCommand("S ==> \"s\" OR \"x\" OR \"ch\" OR \"sh\""); variableDeclarationCommand("PRE ==> \"be\" OR \"ex\" OR \"in\" OR \"mis\" OR \"pre\" OR \"pro\" OR \"re\""); variableDeclarationCommand("EDING ==> \"ed\" OR \"ing\""); variableDeclarationCommand("ESEDING ==> \"es\" OR \"ed\" OR \"ing\""); while (true) { System.out.print("Query: "); String line = in.readLine(); if (line == null || line.length() < 1) break; getPattern(line); } } catch (Exception e) { e.printStackTrace(); } } public static void getPattern(String line) { String[] ruleParts = line.split("==>"); // now check if the method which has been called in this rule actually // available in the MorphFunction Class //String methodCalled = ruleParts[1].trim(); // so RHS part is Ok // now we need to check if LHS is written properly // and convert it to the pattern that is recognized by the java String category = ""; // we need to find out the category int i = 1; for (; i < ruleParts[0].length(); i++) { if (ruleParts[0].charAt(i) == '>') break; category = category + ruleParts[0].charAt(i); } ruleParts[0] = ruleParts[0].substring(i + 1, ruleParts[0].length()).trim(); String regExp = ParsingFunctions.convertToRegExp(ruleParts[0], variables); String[] rules = ParsingFunctions.normlizePattern(regExp); for (int m = 0; m < rules.length; m++) { PatternPart parts[] = ParsingFunctions.getPatternParts(rules[m].trim()); // each part has a type associated with it for (int j = 0; j < parts.length; j++) { System.out.println(parts[j].getPartString() + "=>" + parts[j].getType()); } } } public final static Storage variables = new Storage(); private static void variableDeclarationCommand(String line) throws ResourceInstantiationException { // ok so first find the variable name and the value for it String varName = (line.split("==>"))[0].trim(); String varValue = (line.split("==>"))[1].trim(); // find the type of variable it is int valueType = ParsingFunctions.findVariableType(varValue.trim()); // based on the variable type create the instance Variable varInst = null; switch (valueType) { case Codes.CHARACTER_RANGE_CODE: varInst = new CharacterRange(); break; case Codes.CHARACTER_SET_CODE: varInst = new CharacterSet(); break; case Codes.STRING_SET_CODE: varInst = new StringSet(); break; } // set the values in the variable if (!varInst.set(varName, varValue)) { } // and finally add the variable in if (!variables.add(varName, varInst.getPattern())) { } varInst.resetPointer(); } public static List<String> parsePattern(String q1) { // arraylist to return - will contain all the OR normalized queries List<String> patterns = new ArrayList<String>(); // remove all extra spaces from the query q1 = q1.trim(); // we add opening and closing brackets explicitly q1 = "( " + q1 + " )"; // add the main Query in the arraylist patterns.add(q1); for (int index = 0; index < patterns.size(); index++) { // get the query to be parsed String query = patterns.get(index); // current character and the previous character char ch = ' ', pre = ' '; // if query is ORed // we need duplication // for example: {A}((B)|(C)) // the normalized form will be // (A)(B) // (A)(C) // here we need (A) to be duplicated two times boolean duplicated = false; int dupliSize = 0; String data = ""; // we need to look into one query at a time and parse it for (int i = 0; i < query.length(); i++) { pre = ch; ch = query.charAt(i); // check if it is an open bracket // it is if it doesn't follow the '\' escape sequence if (isOpenBracket(ch, pre)) { // so find out where it gets closed int brClPos = findBracketClosingPosition(i + 1, query); // see if there are any OR operators in it List<String> orTokens = findOrTokens(query.substring(i + 1, brClPos)); // orTokens will have // for eg. {A} | ({B}{C}) // then {A} // and ({B}{C}) // so basically findOrTokens find out all the tokens around // | operator if (orTokens.size() > 1) { String text = ""; // data contains all the buffered character before the // current positions // for example "ABC" ({B} | {C}) // here "ABC" will be in data // and {B} and {C} in orTokens if (!duplicated && data.length() > 0) { text = data; data = ""; } else { if (index == patterns.size() - 1) { // this is the case where we would select the // text as "" text = ""; } else { text = patterns .get(patterns.size() - 1); } } // so we need to duplicate the text orTokens.size() // times // for example "ABC" ({B} | {C}) // text = "ABC" // orTokens {B} {C} // so two queries will be added // 1. "ABC" // 2. "ABC" patterns = duplicate(patterns, text, dupliSize, orTokens.size()); // and tokens will be added // 1. "ABC" {B} // 2. "ABC" {C} patterns = writeTokens(orTokens, patterns, dupliSize); // text is duplicated so make it true duplicated = true; // and how many times it was duplicated if (dupliSize == 0) dupliSize = 1; dupliSize *= orTokens.size(); } else { // what if the there is only one element between ( and ) // it is not an 'OR' query // check how many times we have duplicated the text if (dupliSize == 0) { // if zero and the text buffered is "" // we simply add "" as a separate Query // otherwise add the buffered data as a separate // Query if (data.length() == 0) patterns.add(""); else patterns.add(data); // because we simply needs to add it only once // but still we have copied it as a separate query // so say duplicated = true duplicated = true; data = ""; // and ofcourse the size of the duplication will be // only 1 dupliSize = 1; } // and we need to add all the contents between two // brackets in the last duplicated // queries patterns = writeStringInAll("<" + query.substring(i + 1, brClPos) + ">", dupliSize, patterns); } i = brClPos; } else { if (duplicated) { patterns = writeCharInAll(ch, dupliSize, patterns); } else { data += "" + ch; } } } boolean scan = scanQueryForOrOrBracket(query); if (scan) { patterns.remove(index); index--; } } List<String> queriesToReturn = new ArrayList<String>(); for (int i = 0; i < patterns.size(); i++) { String q = patterns.get(i); if (q.trim().length() == 0) { continue; } else if (queriesToReturn.contains(q.trim())) { continue; } else { queriesToReturn.add(q.trim()); } } for (int i = 0; i < queriesToReturn.size(); i++) { String s = queriesToReturn.get(i); s = s.replaceAll("<", "("); s = s.replaceAll(">", ")"); s = s.substring(1, s.length() - 1); queriesToReturn.set(i, s.trim()); } return queriesToReturn; } public static boolean scanQueryForOrOrBracket(String query) { int index = 0; int index1 = 0; do { index = query.indexOf('|', index); if (index == 0) { return true; } else if (index > 0) { // we have found it but we need to check if it is an escape // sequence if (query.charAt(index - 1) == '\\') { // yes it is an escape sequence // lets search for the next one } else { return true; } } // if we are here that means it was not found index1 = query.indexOf('(', index1); if (index1 == 0) { return true; } else if (index1 > 0) { // we have found it if (query.charAt(index1 - 1) == '\\') { // yes it is an escape sequence continue; } else { return true; } } } while (index >= 0 && index1 >= 0); return false; } public static List<String> writeTokens(List<String> tokens, List<String> queries, int dupliSize) { if (dupliSize == 0) dupliSize = 1; List<String> qToRemove = new ArrayList<String>(); for (int j = 0; j < dupliSize; j++) { for (int i = 1; i <= tokens.size(); i++) { String token = tokens.get(i - 1); if (token.trim().equals("{__o__}")) { token = " "; } String s = queries.get(queries.size() - (j * tokens.size() + i)); qToRemove.add(s); s += token; queries.set(queries.size() - (j * tokens.size() + i), s); } } // and now remove for (int i = 0; i < qToRemove.size(); i++) { queries.remove(qToRemove.get(i)); } return queries; } public static List<String> duplicate(List<String> queries, String s, int dupliSize, int no) { if (s == null) s = ""; List<String> strings = new ArrayList<String>(); if (dupliSize == 0) { strings.add(s); } else { for (int i = 0; i < dupliSize; i++) { strings.add(queries.get(queries.size() - (i + 1))); } } for (int i = 0; i < strings.size(); i++) { for (int j = 0; j < no; j++) { queries.add(strings.get(i)); } } return queries; } public static List<String> findOrTokens(String query) { int balance = 0; char pre = ' '; char ch = ' '; List<String> ors = new ArrayList<String>(); String s = ""; for (int i = 0; i < query.length(); i++) { pre = ch; ch = query.charAt(i); if (isOpenBracket(ch, pre)) { balance++; s += "" + ch; continue; } if (isClosingBracket(ch, pre) && balance > 0) { balance--; s += "" + ch; continue; } if (isOrSym(ch, pre)) { if (balance > 0) { s += "" + ch; continue; } else { ors.add(s); s = ""; continue; } } s += "" + ch; } if (s.length() > 0) ors.add(s); return ors; } public static int findBracketClosingPosition(int startFrom, String query) { int balance = 0; char pre = ' '; char ch = ' '; for (int i = startFrom; i < query.length(); i++) { pre = ch; ch = query.charAt(i); if (isOpenBracket(ch, pre)) { balance++; continue; } if (isClosingBracket(ch, pre)) { if (balance > 0) { balance--; } else { return i; } } } return -1; } public static List<String> writeCharInAll(char c, int no, List<String> queries) { for (int i = 0; i < no; i++) { String s = queries.get(queries.size() - (i + 1)); s += "" + c; queries.set(queries.size() - (i + 1), s); } return queries; } public static List<String> writeStringInAll(String c, int no, List<String> queries) { for (int i = 0; i < no; i++) { String s = queries.get(queries.size() - (i + 1)); s += "" + c; queries.set(queries.size() - (i + 1), s); } return queries; } public static boolean isOpenBracket(char ch, char pre) { if (ch == '(' && pre != '\\') return true; else return false; } public static boolean isClosingBracket(char ch, char pre) { if (ch == ')' && pre != '\\') return true; else return false; } public static boolean isOrSym(char ch, char pre) { if (ch == '|' && pre != '\\') return true; else return false; } }