/*
 * HeppleCY.java
 * This file is part of Welsh Natural Language Toolkit (WNLT)
 * (see http://gate.ac.uk/), and is free software, licenced under
 * the GNU Library General Public License, Version 2, June 1991
 */
package wnlt;

import java.io.IOException;
import java.net.URL;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import hepple.postag.InvalidRuleException;
import hepple.postag.POSTagger;

/**
 * Extends Mark Hepple's POS tagger ({@link POSTagger}) for the purposes of the
 * Welsh Natural Language Toolkit (WNLT). Words are first looked up in a
 * {@link LexiconCY}; words without a lexical entry are classified by a series
 * of hand-written rules based on capitalisation, digits and word endings.
 *
 * @author Andreas Vlachidis 20/03/2016
 */
public class HeppleCY extends POSTagger {

  /** Lexicon consulted by {@link #classifyWord(String)} before any guessing. */
  LexiconCY lexicon;

  /** Marker token that is always tagged with itself. */
  static final String staart = "STAART";

  // Single-tag results returned by classifyWord. The same array instance is
  // handed out on every call, so callers must treat the result as read-only.
  private final String[] staartLex = { staart };
  private final String[] deflex_NNM = { "NNM" };
  private final String[] deflex_NNF = { "NNF" };
  private final String[] deflex_JJ = { "JJ" };
  private final String[] deflex_VB = { "VB" };
  private final String[] deflex_VBI = { "VBI" };
  private final String[] deflex_NNP = { "NNP" };
  private final String[] deflex_CD = { "CD" };
  private final String[] deflex_NNS = { "NNS" };
  private final String[] deflex_NN = { "NN" };
  private final String[] deflex_PN = { "PN" };
  private final String[] deflex_SC = { "SC" };

  /**
   * Matches a single character that is not an ASCII letter, digit, whitespace
   * or one of the listed punctuation marks. Compiled once instead of on every call.
   */
  private static final Pattern SPECIAL_CHARACTER = Pattern.compile(
      "[^A-Za-z0-9\\s’'\\[\\]\\(\\)\\{\\}⟨⟩:,،、‒…\\-\\!\\.?‘’“”'\\\";/⁄]");

  /** Matches a single character that is one of the listed punctuation marks. */
  private static final Pattern PUNCTUATION = Pattern.compile(
      "[’'\\[\\]\\(\\)\\{\\}⟨⟩:,،、‒…\\-\\!\\.?‘’“”'\\\";/⁄]");

  /**
   * Construct a POS tagger using the platform's native encoding to read the
   * lexicon and rules files.
   *
   * @param lexiconURL location of the lexicon file
   * @param rulesURL location of the rules file
   * @throws InvalidRuleException propagated from the superclass constructor
   * @throws IOException propagated from the superclass or lexicon constructor
   */
  public HeppleCY(URL lexiconURL, URL rulesURL)
      throws InvalidRuleException, IOException {
    // Delegate so that the Welsh lexicon is initialised as well; previously it
    // was left null and classifyWord failed with a NullPointerException.
    // NOTE(review): assumes LexiconCY accepts a null encoding as "platform
    // default", as the superclass is given here - confirm against LexiconCY.
    this(lexiconURL, rulesURL, null);
  }

  /**
   * Construct a POS tagger using the specified encoding to read the lexicon
   * and rules files.
   *
   * @param lexiconURL location of the lexicon file
   * @param rulesURL location of the rules file
   * @param encoding character encoding passed to the superclass and the lexicon
   * @throws InvalidRuleException propagated from the superclass constructor
   * @throws IOException propagated from the superclass or lexicon constructor
   */
  public HeppleCY(URL lexiconURL, URL rulesURL, String encoding)
      throws InvalidRuleException, IOException {
    super(lexiconURL, rulesURL, encoding);
    this.lexicon = new LexiconCY(lexiconURL, encoding);
  }

  /**
   * Attempts to classify a word. The lexicon is consulted first (using the
   * lower-cased word); if it has no entry, the tag is guessed from the shape
   * of the word. Rules are evaluated in order and the first match wins.
   *
   * @param wd the word to be classified
   * @return the candidate tags for the word; never empty when guessed
   */
  protected String[] classifyWord(String wd) {
    if (staart.equals(wd)) {
      return staartLex;
    }

    // Locale.ROOT keeps the lookup key independent of the JVM default locale
    // (e.g. the Turkish dotless-i mapping).
    List<String> categories = lexicon.get(wd.toLowerCase(Locale.ROOT));
    if (categories != null) {
      return categories.toArray(new String[0]);
    }

    // No lexical entry for the word. Try to guess.

    // Leading ASCII capital -> proper noun. The emptiness guard avoids a
    // StringIndexOutOfBoundsException on an empty token.
    if (!wd.isEmpty() && 'A' <= wd.charAt(0) && wd.charAt(0) <= 'Z') {
      return deflex_NNP;
    }

    // Any ASCII digit anywhere in the word -> cardinal number.
    for (int i = 0; i < wd.length(); i++) {
      char c = wd.charAt(i);
      if ('0' <= c && c <= '9') {
        return deflex_CD;
      }
    }

    if (wd.endsWith("d")) {
      if (wd.endsWith("id") || wd.endsWith("od") || wd.endsWith("awd")) {
        return deflex_NNM;
      }
      if (wd.endsWith("ydd")) {
        return wd.endsWith("feydd") ? deflex_NNS : deflex_NNM;
      }
      if (wd.endsWith("edd")) {
        return wd.endsWith("oedd") ? deflex_NNF : deflex_NNM;
      }
      if (wd.endsWith("yd")) {
        return wd.endsWith("lyd") ? deflex_JJ : deflex_NNM;
      }
      // Remaining "-d" words are verbs unless they end in "ad", "aid" or "ed".
      // The original joined the three negations with ||, which is always true
      // and made the later "red" and "ed" rules unreachable.
      // NOTE(review): "aidd" still lands here as VB, so the later "aidd" -> JJ
      // rule stays shadowed; confirm the intended tag.
      if (!(wd.endsWith("ad") || wd.endsWith("aid") || wd.endsWith("ed"))) {
        return deflex_VB;
      }
    } // End of ends with d

    if (wd.endsWith("eb")) {
      return (wd.endsWith("deb") || wd.endsWith("ineb")) ? deflex_NNM : deflex_NNF;
    } // End of ends with eb

    if (wd.endsWith("el")) {
      return wd.endsWith("fel") ? deflex_NNM : deflex_VB;
    } // End of ends with el

    if (wd.endsWith("ur")) {
      return wd.endsWith("adur") ? deflex_NNM : deflex_VB;
    } // End of ends with ur

    if (wd.endsWith("u")) {
      return wd.endsWith("au") ? deflex_NNS : deflex_VBI;
    } // End of ends with u

    if (wd.endsWith("a")) {
      if (wd.endsWith("dra")) {
        return deflex_NNM;
      }
      if (wd.endsWith("fa")) {
        return deflex_NNF;
      }
      return deflex_VB;
    } // End of ends with a

    if (wd.endsWith("ig")) {
      return wd.endsWith("wraig") ? deflex_NNF : deflex_JJ;
    } // End of ends with ig

    // NNM - noun, masculine
    if (wd.endsWith("aint") || wd.endsWith("cyn") || wd.endsWith("der")
        || wd.endsWith("iant") || wd.endsWith("mon") || wd.endsWith("wr")
        || wd.endsWith("yr")) {
      return deflex_NNM;
    }

    // NNF - noun, feminine
    if (wd.endsWith("ell") || wd.endsWith("en") || wd.endsWith("es")
        || wd.endsWith("red")) {
      return deflex_NNF;
    }

    // VB - verb
    if (wd.endsWith("fan") || wd.endsWith("ain") || wd.endsWith("o")) {
      return deflex_VB;
    }

    // JJ - adjective
    if (wd.endsWith("adwy") || wd.endsWith("aidd") || wd.endsWith("ar")
        || wd.endsWith("ed") || wd.endsWith("gar") || wd.endsWith("lon")
        || wd.endsWith("us")) {
      return deflex_JJ;
    }

    // NNS - noun, plural
    if (wd.endsWith("os")) {
      return deflex_NNS;
    }

    // SC - single special character
    if (SPECIAL_CHARACTER.matcher(wd).matches()) {
      return deflex_SC;
    }

    // PN - single punctuation character
    if (PUNCTUATION.matcher(wd).matches()) {
      return deflex_PN;
    }

    // NN - default when none of the above conditions hold
    return deflex_NN;
  } // End classifyWord(String wd)
}