package shef.nlp.supple.category; import gate.FeatureMap; import gate.creole.ExecutionException; import gate.util.Err; import gate.util.SimpleFeatureMapImpl; import java.util.Hashtable; import java.util.Iterator; import java.util.TreeSet; import shef.nlp.supple.utils.EmptyFeatureMap; import shef.nlp.supple.utils.IllegalCategoryName; public class Category { static TreeSet ALL_CAT = new TreeSet(); static TreeSet NE_CAT = new TreeSet(); static TreeSet POS_CAT = new TreeSet(); static Hashtable DEF_CAT = new Hashtable(); /* DEFAULT FEATURE MAPS FOR POS AND SYNTACTIC CATEGORIES */ static FeatureMap DEF_POS = new SimpleFeatureMapImpl(); static FeatureMap N_POS = new SimpleFeatureMapImpl(); static FeatureMap A_POS = new SimpleFeatureMapImpl(); static FeatureMap V_POS = new SimpleFeatureMapImpl(); static FeatureMap TOP_POS = new SimpleFeatureMapImpl(); static FeatureMap BOTTOM_POS = new SimpleFeatureMapImpl(); /* CONSTANT FEATURE NAMES VALID FOR BUCHART PARSING */ public static String S_FORM = "s_form"; public static String M_ROOT = "m_root"; public static String M_AFFIX = "m_affix"; public static String TEXT = "text"; public static String EDGE = "edge"; public static String SEM = "sem"; public static String HEAD = "head"; public static String SOURCE = "source"; public static String PERSON = "person"; public static String NUMBER = "number"; public static String GENDER = "gender"; public static String TENSE = "tense"; public static String ASPECT = "aspect"; public static String VOICE = "voice"; public static String VFORM = "vform"; public static String DEGREE = "degree"; public static String NE_TAG = "ne_tag"; public static String NE_TYPE = "ne_type"; /* ANY VALUE */ static String ANY = "_"; static String EMPTY = ""; static String BODY = "body"; String name; public String getCategory() { return name; } public void setCategory(String Name) { name = Name; } FeatureMap features = new SimpleFeatureMapImpl(); public FeatureMap getFeatures() { return features; } public void setFeatures(FeatureMap Features) { features = Features; } static boolean is_ne(String type) { return NE_CAT.contains(type); } public boolean equals(Object o) { if (!(o instanceof Category)) return false; Category c = (Category)o; return name.equals(c.name) && features.equals(c.features); } static { /* FEATURES FOR DEFAULT POS CATEGORIES */ DEF_POS.put(S_FORM,ANY); DEF_POS.put(M_ROOT,EMPTY); DEF_POS.put(M_AFFIX,EMPTY); DEF_POS.put(TEXT,"body"); /* FEATURES FOR NOUN POS CATEGORIES */ N_POS.put(S_FORM,ANY); N_POS.put(M_ROOT,EMPTY); N_POS.put(M_AFFIX,EMPTY); N_POS.put(TEXT,"body"); N_POS.put(PERSON,ANY); N_POS.put(NUMBER,ANY); /* FEATURES FOR ADJ AND ADV CATEGORIES */ A_POS.put(S_FORM,ANY); A_POS.put(M_ROOT,EMPTY); A_POS.put(M_AFFIX,EMPTY); A_POS.put(TEXT,"body"); A_POS.put(DEGREE,ANY); /* FEATURES FOR VERB CATEGORIES */ V_POS.put(S_FORM,ANY); V_POS.put(M_ROOT,EMPTY); V_POS.put(M_AFFIX,EMPTY); V_POS.put(TEXT,"body"); V_POS.put(PERSON,ANY); V_POS.put(NUMBER,ANY); V_POS.put(TENSE,ANY); V_POS.put(VFORM,ANY); /* TOP */ TOP_POS.put(S_FORM,"top"); TOP_POS.put(M_ROOT,"top"); TOP_POS.put(M_AFFIX,EMPTY); TOP_POS.put(TEXT,"body"); /* BOTTOM */ BOTTOM_POS.put(S_FORM,"bottom"); BOTTOM_POS.put(M_ROOT,"bottom"); BOTTOM_POS.put(M_AFFIX,EMPTY); BOTTOM_POS.put(TEXT,"body"); /* SET OF POS CATS FOR BUCHART */ POS_CAT.add("n"); POS_CAT.add("pn"); POS_CAT.add("v"); POS_CAT.add("jj"); POS_CAT.add("rb"); POS_CAT.add("fw"); POS_CAT.add("cd"); POS_CAT.add("cc"); POS_CAT.add("dt"); POS_CAT.add("ex"); POS_CAT.add("in"); POS_CAT.add("ls"); POS_CAT.add("md"); POS_CAT.add("pdt"); POS_CAT.add("pos"); POS_CAT.add("pp"); POS_CAT.add("pps"); POS_CAT.add("rp"); POS_CAT.add("to"); POS_CAT.add("uh"); POS_CAT.add("wdt"); POS_CAT.add("wp"); POS_CAT.add("wrb"); POS_CAT.add("sym"); POS_CAT.add("period"); POS_CAT.add("comma"); POS_CAT.add("top"); POS_CAT.add("bottom"); // NON POS CATS NE_CAT.add("list_np"); /* ALL VALID CATEGORIES */ ALL_CAT.addAll(NE_CAT); ALL_CAT.addAll(POS_CAT); Iterator ite_pos = POS_CAT.iterator(); while(ite_pos.hasNext()) { DEF_CAT.put(ite_pos.next(),DEF_POS); } /* DEFAULT SET OF CATEGORIES AND THEIR FEATURES */ DEF_CAT.put("n",N_POS); DEF_CAT.put("v",V_POS); DEF_CAT.put("jj",A_POS); DEF_CAT.put("pn",N_POS); DEF_CAT.put("rb",A_POS); DEF_CAT.put("top",TOP_POS); DEF_CAT.put("bottom",BOTTOM_POS); DEF_CAT.put("list_np",DEF_POS); } public Category() {} public Category(String Name, FeatureMap Features) { name = Name; features = new SimpleFeatureMapImpl(); features.putAll(Features); } public Category(String Name) throws IllegalCategoryName { if(!ALL_CAT.contains(Name)) { throw new IllegalCategoryName(Name+" is invalid"); } name = Name; } public static Category getDefaultCategory(String Name) { return new Category(Name,(FeatureMap) DEF_CAT.get(Name)); } public static String quoteValue(String value) { String output = ""; output = "\'"; int len = value.length(); for(int i=0; i<len; i++) { if(value.charAt(i)=='\'') { output += "''"; } else if(value.charAt(i)=='\n') { output += " "; } else output += value.charAt(i); } output += '\''; return output; } public String toSUPPLEFormat() throws EmptyFeatureMap, ExecutionException { String cat; FeatureMap features; String output= ""; cat = this.getCategory(); features = this.getFeatures(); /* FOR POS CATEGORIES */ if (POS_CAT.contains(cat) ) { output += cat; output += "("; output += S_FORM+":"+quoteValue((String) features.get(S_FORM))+","; output += M_ROOT+":"+quoteValue((String) features.get(M_ROOT))+","; output += M_AFFIX+":"+quoteValue((String) features.get(M_AFFIX))+","; output += TEXT+":"+quoteValue((String) features.get(TEXT)); if (cat == "n" || cat == "pn") { output += ","; output += PERSON+":"+features.get(PERSON)+","; output += NUMBER+":"+features.get(NUMBER); } else if (cat == "v") { output += ","; output += PERSON+":"+features.get(PERSON)+","; output += NUMBER+":"+features.get(NUMBER); output += ","; output += TENSE+":"+features.get(TENSE)+","; output += VFORM+":"+features.get(VFORM); } else if (cat == "jj" | cat == "rb" ) { output += ","; output += DEGREE+":"+features.get(DEGREE); } else { /* nothing else */ } } else if (NE_CAT.contains(cat)) { if (cat=="list_np") { output += cat; output += "("; output += S_FORM+":"+quoteValue((String) features.get(S_FORM))+","; output += M_ROOT+":"+quoteValue((String) features.get(M_ROOT))+","; output += M_AFFIX+":"+quoteValue((String) features.get(M_AFFIX))+","; output += TEXT+":"+quoteValue((String) features.get(TEXT))+","; output += NE_TAG+":"+quoteValue((String) features.get("ne_tag"))+","; if (features.get("ne_type") != null) { output += NE_TYPE+":"+quoteValue((String) features.get("ne_type"))+","; } output += GENDER+":"+quoteValue((String) features.get("gender")); } else { Err.println("There is no category "+cat); } } output += ")"; return output; } public static FeatureMap getDefaultFeatureMap(String Name) { return (FeatureMap) DEF_CAT.get(Name); } public void emptyCategory(String Name) throws IllegalCategoryName { if (Name=="n") { this.setCategory("n"); this.setFeatures(N_POS); return; } if (Name=="v") { this.setCategory("v"); this.setFeatures(V_POS); return; } throw new IllegalCategoryName(Name+" is invalid"); } public void setFeatureValue(FeatureValue fv) { this.features.put(fv.feature,fv.value); } public void show() { System.out.println("CAT: "+name); System.out.println("FEATURES: "+features); } public void showFeatureMap() { System.out.println("Features "+features); } }