package com.ontotext.russie; import java.util.Arrays; import java.util.HashSet; import java.util.Set; /** * RussIEConstants.java * This interface contains a collection of RussIE constants. * <p>Title: RussIE</p> * <p>Description: Russian Information Extraction based on GATE</p> * <p>Copyright: Copyright (c) 2003</p> * <p>Company: Ontotext Lab.</p> * @author borislav popov * @version 1.0 */ public interface RussIEConstants { public final static String RUSSIE_VERSION = "0.1.07.26"; /**Morphology file extension*/ final static String EXT_MORPH_FILE = ".pl"; /**Inflectional Gazetteer Resources extension*/ final static String EXT_INFL_GAZ_FILE = ".infl"; final static String TYPE_LOCATION = "Location"; final static String TYPE_MSD = "MSD"; final static String TYPE_TOKEN = "Token"; final static String TYPE_LOOKUP = "Lookup"; final static String TYPE_PERSON = "Person"; final static String TYPE_DATE = "Date"; final static String TYPE_NUMBER = "Number"; final static String TYPE_ORGANIZATION = "Organization"; /*FEATURES*/ final static String MAJOR_TYPE = "majorType"; final static String FEATURE_OCCURANCE = "occurance"; /**MSD annotation type feature*/ final static String FEATURE_MSD_TYPE = "type"; /** POS annotation type feature in Token */ final static String FEATURE_POS_TYPE = "category"; /**MSD annotation lemma-main form feature*/ final static String FEATURE_LEMMA = "lemma"; /*MAjor Types*/ final static String MAJOR_TYPE_PERSON_FIRST = "person_first"; final static String MAJOR_TYPE_PERSON_FULL = "person_full"; final static String MAJOR_TYPE_PERSON_SURNAME = "surname"; final static String MAJOR_TYPE_LOC = "location"; /*Syllables */ /* Vowels */ final static String [] arrVowels = { "\u0430","\u0410", "\u0435","\u0415", "\u0438","\u0418", "\u0439","\u0419", "\u043E","\u041E", "\u0443","\u0423", "\u044A","\u042A", "\u044C","\u042C", "\u044B","\u042B", "\u0451","\u0401", "\u044D","\u042D", "\u044E","\u042E", "\u044F","\u042F" }; static final Set<String> SET_OF_VOWELS = new HashSet<String>(Arrays.asList(arrVowels)); /*Consonant Suffixes*/ final static String [] arrConsonantSuffixes = { // om "\u043E\u043C","\u041E\u041C", // em "\u0435\u043C","\u0415\u041C", // e with 2 points- m "\u0451\u043C","\u0401\u041C", // ~ih "\u044B\u0445","\u042B\u0425", // ~im "\u044B\u043C","\u042B\u041C", // ov "\u043E\u0432","\u041E\u0412", // am "\u0430\u043C","\u0410\u041C", // ah "\u0430\u0445","\u0410\u0425", // iam "\u044F\u043C","\u042F\u041C", // iah "\u044F\u0445","\u042F\u0425", }; static Set<String> SET_OF_CONSONANT_SUFFIXES = new HashSet<String>(Arrays.asList(arrConsonantSuffixes)); /*Stemming Limitations*/ static int minWordLength = 2; static int maxTruncatedVowels = 2; } // class RussIEConstants