/** * POSTagger.java * * @todo: all-upper-handling * @todo: ch and e-oborotnoe handling or file conversion * @todo: list of all types and of all nests, toString of lemma and nest. This * one is based on the RussGazetteer and performs POS Tagging without * disambiguation. * <p> * Title: RussIE * </p> * <p> * Description: Russian Information Extraction based on GATE * </p> * <p> * Copyright: Copyright (c) 2003 * </p> * <p> * Company: Ontotext Lab. * </p> * @author borislav popov * @version 1.0 */ package com.ontotext.russie.morph; import gate.AnnotationSet; import gate.Factory; import gate.FeatureMap; import gate.Resource; import gate.creole.ExecutionException; import gate.creole.ResourceInstantiationException; import gate.creole.metadata.CreoleParameter; import gate.creole.metadata.CreoleResource; import gate.creole.metadata.Optional; import gate.creole.metadata.RunTime; import gate.creole.metadata.Sharable; import gate.util.GateRuntimeException; import gate.util.InvalidOffsetException; import gate.util.LuckyException; import gate.util.profile.Profiler; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.URL; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import com.ontotext.russie.RussIEConstants; @CreoleResource(name = "Russian POS Tagger", icon = "pos-tagger", comment = "Part-of-speech tagger for Russian", helpURL = "http://gate.ac.uk/userguide/sec:misc-creole:language-plugins:russian") @SuppressWarnings({"rawtypes","unchecked"}) public class POSTagger extends gate.creole.AbstractLanguageAnalyser implements RussIEConstants { private static final long serialVersionUID = -1392655786706987563L; /** * Debug flag */ protected static final boolean DEBUG = false; /** * Detailed Debug flag */ protected static final boolean DETAILED_DEBUG = false; public Boolean caseSensitive; /** * Used to store the annotation set currently being used for the newly * generated annotations */ protected String annotationSetName; /** the encoding of the gazetteer */ protected String encoding; /** * fireStatusDoingLookupStr */ protected static final String DOING_LOOKUP_IN = "Doing Morpho-Syntactic Type Lookup"; /** * "Reading " string */ protected static final String READING = "Reading morphology file :"; /** * a list of all the maps representing the first words, first_secind phrases, * etc. each map's value might be an ArrayList of TYpe objects specifying * categories tied to this word/phrase. */ protected List<Map> mapsList; /** * size of the mapsList */ protected int mapsListSize = 0; /** * a list of distinct references to morpho syntactic set. */ protected Set<SuffixNest> msTypeSet = null; /** The location of the main configuration file */ private URL config; public POSTagger() { } /** * Gets the morph reader. * * @param file * the morhology file * @return the morh reader */ private MorphologyReader getMorphReader(URL url) { MorphologyReader mReader = new MorphologyReader(caseSensitive.booleanValue()); mReader.setEncoding(encoding); try { mReader.load(url); } catch(IOException x) { x.printStackTrace(); throw new GateRuntimeException("Loading of the morphology failed."); } return mReader; } // getMorphReader(url) /** * Initializes the POS Tagger using the morphology. * * @throws ResourceInstantiationException * @return returns this resource */ public Resource init() throws ResourceInstantiationException { if(mapsList != null) { // this is a duplicate mapsListSize = mapsList.size(); } else { Profiler profiler = null; if(DEBUG) { profiler = new Profiler(); profiler.enableGCCalling(false); profiler.initRun("POS Tagger init()"); profiler.checkPoint("reset"); } // check if there's a list URL if(config == null) { throw new ResourceInstantiationException( "No config provided!"); } // if mapsList = new ArrayList<Map>(10); try { BufferedReader configReader = new BufferedReader(new InputStreamReader(config.openStream(), encoding)); int lemmaIdx = 0; try { // each line in config is a relative path to a morph file String configLine = null; while((configLine = configReader.readLine()) != null) { configLine = configLine.trim(); // ignore blank lines and comments if("".equals(configLine) || configLine.startsWith("#")) continue; URL morphURL = new URL(config, configLine); MorphologyReader morphRdr = getMorphReader(morphURL); Set<Lemma> lemmaSet = morphRdr.getLemmas(); Iterator<Lemma> lemmaIter = lemmaSet.iterator(); int linesCnt = lemmaSet.size(); // allocate the hashmap for the first words from the phrases mapsList.add(new HashMap(linesCnt)); mapsListSize = mapsList.size(); // allocate the category Map with optimal initial capacity & load // factor // IR - no idea what this is for, it gets overwritten every iteration // and is written to but not read within add()... msTypeSet = new HashSet<SuffixNest>(); Lemma lemma; fireStatusChanged(READING + configLine); while(lemmaIter.hasNext()) { lemma = lemmaIter.next(); fireProgressChanged(++lemmaIdx * 100 / linesCnt); this.add(lemma); } // while } } finally { configReader.close(); } fireProcessFinished(); if(DEBUG) { System.out.println("POS Tagger Is GC Enabled ? = " + profiler.isGCCallingEnabled()); profiler.checkPoint("POS Tagger init completed."); } // DEBUG if(DEBUG) { System.out.println("Starting Garbage Collection ..."); System.gc(); profiler.checkPoint("Garbage Collection finished"); } if(DEBUG) { System.out.println("\nLEMMAs COUNT -> "); System.out.println(lemmaIdx); System.out.println("\nTYPE POOL SIZE -> "); System.out.println(TypePool.size()); System.out.println("\nSUFFIX POOL SIZE -> "); System.out.println(SuffixPool.size()); } if(DETAILED_DEBUG) { // dump distinct types System.out.println("\nTYPE POOL ->"); System.out.println(TypePool.getString()); System.out.println("\nSUFFIX POOL ->"); System.out.println(SuffixPool.getString()); } } catch(Exception x) { throw new ResourceInstantiationException(x); } // catch } return this; } // Resource init() throws ResourceInstantiationException /** * Re-initialize this POS tagger by re-loading the configuration. */ public void reInit() throws ResourceInstantiationException { mapsList = null; msTypeSet = null; init(); } /** * This method runs the gazetteer. It parses the document and looks-up the * parsed phrases from the maps, in which the phrases vs. annotations are set, * in order to generate an annotation set. It assumes that all the needed * parameters are set. If they are not, an exception will be fired. */ public void execute() throws ExecutionException { Profiler profiler = null; if(DEBUG) { profiler = new Profiler(); profiler.enableGCCalling(false); profiler .initRun("POS Tagger execute over document " + document.getName()); profiler.checkPoint("reset"); } AnnotationSet annotationSet; // check the input if(document == null) { throw new ExecutionException( "No document to process!"); } // if document is null if(annotationSetName == null || annotationSetName.equals("")) annotationSet = document.getAnnotations(); else annotationSet = document.getAnnotations(annotationSetName); fireStatusChanged(DOING_LOOKUP_IN); // get the content of the document and its length String content = document.getContent().toString(); // init some params int length = content.length(); int matchedRegionEnd = 0; int matchedRegionStart = 0; // word start int iwordStart = 0; int iend = 0; int secondWordStart = 0; String phrase = ""; int mapIndex = 0; FeatureMap fm; Map currentMap = new HashMap(); // whether the current word is the first in the phrase boolean firstWord = true; boolean punctuationZone = false; // letter to number or number to letter transition zone boolean l2nORn2lZone = false; char currentChar = 0; int typeWeight = 0; // note that the code within the next cycle is overwhelmed by complexity // this said status was reached because what actually this cycle does // is ad hoc tokenization of ... guess what ... phrases // if you're still not sure this is a complex task-here are some examples : // Cable & Wireless // Moody's // C.V. // C.de R.L. // C.por A. // AG & Co KG // G.S.C. // S.A.R.L. // etc... // so in this cycle we monitor several flags and several indexes // they all should speak for themsleves e.g. boolean firstWord // boolean punctuationZone tells us whether we're within the context of a // punctuation sign (e.g. '.','&',';') // length is the length of the doc. // iwordStart is the starting index within the doc of the current 'word' // being processed // iend is the end of the current word/phrase // content is the content of the doc // mapIndex is the index of the current map, this is also the index of the // current 'part' of the phrase(parts are groups of characters separated by // whitespaces,and in some cases by punctuation) // matchedRegion(Start/End) denote the start/emd of an already looked-up // phrase( annotated ) // secondWordStart is the index of the second word in the currently // processed phrase // l2nORn2lZone indicates whether we're in a zone where // transition from letter to number or number to letter is present // e.g. GBP100 the "P1" is such a zone // typeWeight is the summed weight of the types of the last two chars while(iwordStart < length) { if(firstWord) // ultimately : starts the new lookup operation from // the second word of the previous phrase iwordStart = secondWordStart; else iwordStart = iend; // additional check if(iwordStart >= length) break; // get the beginning of the word while(iwordStart < length && (Character.isWhitespace(content.charAt(iwordStart)) || isWhiteSpacePunctuation(content .charAt(iwordStart)))) { iwordStart++; } // while find start of word // get the end of the word iend = iwordStart + 1; // tired of all these checks ? // bro why you bother to look @ this code @ all ??? if(iend >= length) break; // do while punctuation int currCharInt; do { if(punctuationZone) { currentChar = content.charAt(iend); } else while(iend < length && (Character.isLetterOrDigit(currentChar = content.charAt(iend)) || // handling for ch and etc. cyrillic letters that fail the above check ((215 == (currCharInt = currentChar)) || (currCharInt == 168) || (currCharInt == 247) || (currCharInt == 184))) || ((isDashOrQuotePunctuation(currentChar)) && (Character .isWhitespace(content.charAt(iend - 1)) || isWhiteSpacePunctuation(content .charAt(iend - 1))))) { // check whether the neighbouring chars are letter number // or number letter typeWeight = Character.getType(currentChar) + Character.getType(content.charAt(iend - 1)); if(l2nORn2lZone = (typeWeight == 10) || (typeWeight == 11)) { break; } // if l2nORn2lZone iend++; } // while find end of word // build phrase if(firstWord) { phrase = content.substring(iwordStart, iend); // maintain the case of a token without whitespaces // but beginning with punctuation e.g. : "The or &BT // in this case we should set the start of the next word to // iwordStart + 1; // it is the same when when l2nORn2lZone = true if((isDashOrQuotePunctuation(content.charAt(iwordStart))) || l2nORn2lZone) { secondWordStart = iwordStart + 1; } else { secondWordStart = iend; } // else matchedRegionStart = iwordStart; mapIndex = 0; firstWord = false; } else { if(punctuationZone || l2nORn2lZone) { // close a punctuation zone or a l2nORn2lZone ? if(Character.isWhitespace(currentChar) || isWhiteSpacePunctuation(currentChar)) { punctuationZone = false; l2nORn2lZone = false; mapIndex++; break; } // if it's whitespace else { phrase = phrase + currentChar; iend++; } // still in punctuation zone } else { phrase = phrase + ' ' + content.substring(iwordStart, iend); } // not a punctuation zone neither l2nORn2lZone } // not a first word // determine punctuatuion zone if(isDashOrQuotePunctuation(currentChar) && !Character.isWhitespace(content.charAt(iend - 1))) { punctuationZone = true; firstWord = false; } // if punctuation // determine l2nORn2lZone typeWeight = Character.getType(currentChar) + Character.getType(content.charAt(iend - 1)); if((typeWeight == 10) || (typeWeight == 11)) { l2nORn2lZone = true; firstWord = false; } // if typeWeight ... // check mapindex's validity if(mapIndex >= mapsListSize) { firstWord = true; punctuationZone = false; l2nORn2lZone = false; continue; } // if mapindex out of bounds // try to find it in the dark cave ... currentMap = mapsList.get(mapIndex); Map<String, Set> lemmaVsTypes = this.lookup(phrase, currentMap); if(lemmaVsTypes != null && lemmaVsTypes.size() > 0) { matchedRegionEnd = iend; // generate lookups for the phrase so far { Iterator<String> lemmaIter = lemmaVsTypes.keySet().iterator(); String lemma; Set typeSet; String type; Iterator typeIter; while(lemmaIter.hasNext()) { lemma = lemmaIter.next(); typeSet = lemmaVsTypes.get(lemma); typeIter = typeSet.iterator(); while(typeIter.hasNext()) { type = typeIter.next().toString(); fm = Factory.newFeatureMap(); fm.put(FEATURE_MSD_TYPE, type); fm.put(FEATURE_LEMMA, lemma); try { annotationSet.add(new Long(matchedRegionStart), new Long( matchedRegionEnd), TYPE_MSD, fm); } catch(InvalidOffsetException ioe) { throw new LuckyException(ioe.toString()); } // catchx } // while types }// while(lemmaIter.hasNext()) } // generate MSD for the phraseso far } else { if(!punctuationZone && !l2nORn2lZone) { // the map doesn't contain the key iend = secondWordStart; firstWord = true; continue; } // if critical iteration } // else // jump to the next map only if not within a punctuation context // neither in a l2nORn2l Zone if(!punctuationZone && !l2nORn2lZone) ++mapIndex; // if the current map index reached the size of the map list if(mapIndex >= mapsListSize) { firstWord = true; punctuationZone = false; continue; } // if mapIndex out of bounds } while((punctuationZone || l2nORn2lZone) && iend < length); // if end of boundaries for iend reached then set exclusively firstWord // this will cause the program to start from secondWordStart or // matchedRegionEnd indexes. if(iend >= length || iwordStart >= length) { iend = secondWordStart; firstWord = true; // last change punctuationZone = false; } // if iend out of boundaries } // while within content if(DEBUG) { profiler.checkPoint("POS Tagger init completed."); } // DEBUG fireProcessFinished(); fireStatusChanged("POS Tagger processing finished!"); } // execute () public Set lookup(String wordForm) { Set result = null; for(int li = 0; li < mapsListSize; li++) { Map list = mapsList.get(li); if(list.containsKey(wordForm)) { ArrayList lookupList = (ArrayList)list.get(wordForm); if(lookupList != null && lookupList.size() > 0) { result = new HashSet(lookupList); break; } } } // for lists return result; } public boolean remove(String wordForm) { boolean isRemoved = false; for(int i = 0; i < mapsListSize; i++) { Map map = mapsList.get(i); if(map.containsKey(wordForm)) { map.remove(wordForm); isRemoved = true; break; } } // for lists return isRemoved; } /** * Adds to the POS Tagger model another wordForm with its type. * * @param wordForm * the word-form * @param lemma * the lemma to be added * @return true if successful */ public boolean add(Lemma lemma) { boolean isAdded = false; SuffixNest nest = lemma.getSuffixNest(); String root = lemma.getRoot(); @SuppressWarnings("unused") String mainFormSuffix = lemma.getSuffix(lemma.getMainFormType()); if(!caseSensitive.booleanValue()) { root = root.toLowerCase(); } // ALL-UPPER-CASE SUPPORT // String upper = root.toUpperCase(); // if (!upper.equals(root)) { // this.add(upper,type); // } // avoid endless recursion // category key Set<SuffixNest> key = new HashSet<SuffixNest>(1); // add the lookup to the current key key.add(nest); // add the lookup to the category list msTypeSet.add(nest); // init some params String line = root; int mapIndex = -1; String word = null; Set<SuffixNest> oldKey = null; Map<String, Set> currentMap = null; int length = 0; mapIndex = -1; line = line.trim(); length = line.length(); for(int lineIndex = 0; lineIndex < length; lineIndex++) { if((lineIndex + 1 == length) || Character.isWhitespace(line.charAt(lineIndex)) || isWhiteSpacePunctuation(line.charAt(lineIndex))) { // if not whitespace but end of line then the index should equal the // length if(lineIndex + 1 == length) lineIndex = length; // get the word word = line.substring(0, lineIndex).trim(); // if the map doesn't exist : create it ++mapIndex; if(mapsListSize <= mapIndex) { mapsList.add(new HashMap()); mapsListSize++; } // if the map doesn't exist // get the map and add the word to the map currentMap = (mapsList.get(mapIndex)); // try to get the current word // if there isn't such a word : add it with null key. if(!currentMap.containsKey(word)) { currentMap.put(word, null); } // add the word } // if whitespace } // for line iterate if(word == null) { word = root; // if the map doesn't exist : create it ++mapIndex; if(mapsListSize <= mapIndex) { mapsList.add(new HashMap()); mapsListSize++; } // if the map doesn't exist currentMap = (mapsList.get(mapIndex)); } try { // !!! put the category key in the last map oldKey = currentMap.get(word); } catch(Exception x) { x.printStackTrace(); System.out.println("root = " + word); } if(null == oldKey) { oldKey = new HashSet<SuffixNest>(1); oldKey.add(nest); currentMap.put(word, oldKey); isAdded = true; } else { if(!oldKey.contains(nest)) { oldKey.add(nest); } currentMap.put(word, oldKey); isAdded = true; } // else return isAdded; } // add @CreoleParameter(comment = "Location of configuration file listing " + "morphology files to be loaded", defaultValue = "resources/morphology/main.conf") public void setConfig(URL config) { this.config = config; } public URL getConfig() { return config; } // getConfig /** * Sets the AnnotationSet that will be used at the next run for the newly * produced annotations. */ @RunTime @Optional @CreoleParameter(comment = "The annotation set name to use") public void setAnnotationSetName(String newAnnotationSetName) { annotationSetName = newAnnotationSetName; } /** * Gets the AnnotationSet that will be used at the next run for the newly * produced annotations. */ public String getAnnotationSetName() { return annotationSetName; } @CreoleParameter(comment = "The encoding used for reading the config and " + "morphology files", defaultValue = "UTF-8") public void setEncoding(String newEncoding) { encoding = newEncoding; } public String getEncoding() { return encoding; } /** * Find the list of mathcing types and mainforms for the current phrase and * map * * @param phrase * @param map * @return the map of main form vs matching types for the current phrase and * map */ private Map<String, Set> lookup(String phrase, Map map) { if(DETAILED_DEBUG) { System.out.println("phrase -> " + phrase); } if(!caseSensitive.booleanValue()) { phrase = phrase.toLowerCase(); } Map<String, Set> lemmaVsTypes = new HashMap<String, Set>(); Set newTypes; Set nests; Iterator ni; String suffix = ""; SuffixNest nest; String lemma; // temporary mediator Set types; while(phrase.length() > 0) { if(map.containsKey(phrase)) { nests = (Set)map.get(phrase); ni = nests.iterator(); while(ni.hasNext()) { nest = (SuffixNest)ni.next(); if(nest != null) { newTypes = nest.getType(suffix); lemma = phrase + nest.getMainFormSuffix(); if(newTypes != null) { types = lemmaVsTypes.get(lemma); if(types == null) { lemmaVsTypes.put(lemma, newTypes); } else { types.addAll(types); } } else { if(DETAILED_DEBUG) { System.out.println("\nUNRECOGNIZED PHRASE - > " + phrase + suffix); System.out.println("SUFFIX - > " + suffix); System.out.println("NEST - > " + nest); } // detailed debug } // dump the unrecognized phrase with the nest content } // not null nest } // while nests } // if phrase contained suffix = phrase.substring(phrase.length() - 1) + suffix; phrase = phrase.substring(0, phrase.length() - 1); } // while phrase - try lookups return lemmaVsTypes; } // lookup (phrase,map) private boolean isDashOrQuotePunctuation(char ch) { int type = Character.getType(ch); if(Character.DASH_PUNCTUATION == type || Character.INITIAL_QUOTE_PUNCTUATION == type || Character.FINAL_QUOTE_PUNCTUATION == type || ch == '.') { return true; } return false; } // isDashOrQuotePunctuation(ch) private boolean isWhiteSpacePunctuation(char ch) { int type = Character.getType(ch); if((Character.OTHER_PUNCTUATION == type || Character.CONNECTOR_PUNCTUATION == type || Character.START_PUNCTUATION == type || Character.END_PUNCTUATION == type) && (ch != '.')) { return true; } return false; } // isWhiteSpacePunctuation(ch) @CreoleParameter(comment = "Should this POS Tagger differentiate on case", defaultValue = "true") public void setCaseSensitive(Boolean newCaseSensitive) { caseSensitive = newCaseSensitive; } public Boolean getCaseSensitive() { return caseSensitive; } /** * For internal use by the duplication mechanism. */ @Sharable public void setMapsList(List<Map> mapsList) { this.mapsList = mapsList; } /** * For internal use by the duplication mechanism. */ public List<Map> getMapsList() { return mapsList; } /** * For internal use by the duplication mechanism. */ @Sharable public void setMsTypeSet(Set<SuffixNest> msTypeSet) { this.msTypeSet = msTypeSet; } /** * For internal use by the duplication mechanism. */ public Set<SuffixNest> getMsTypeSet() { return msTypeSet; } } // class POSTagger