/**
 * RussGazetteer.java This one is based on the HashGazetteer with additional
 * features : ALL-CAPS recognition; multiple overlapping lookups generation.
 * <p>
 * Title: RussIE
 * </p>
 * <p>
 * Description: Russian Information Extraction based on GATE
 * </p>
 * <p>
 * Copyright: Copyright (c) 2003
 * </p>
 * <p>
 * Company: Ontotext Lab.
 * </p>
 *
 * @author unascribed
 * @version 1.0 This file is a part of the processing resources provided by
 *          OntoText Lab. a part of Sirma Artifical Intelligence Labs. the
 *          software and this file are licenced. A copy of the licence is
 *          included in the distribution in the file licence.ontotext.html, and
 *          is also available at
 *          http://www.ontotext.com/gate/licence.ontotext.html borislav popov,
 *          08/11/2001 $Id: RussGazetteer.java 16342 2013-03-13 17:11:12Z ian $
 */
package com.ontotext.russie.gazetteer;

import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.Resource;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.gazetteer.AbstractGazetteer;
import gate.creole.gazetteer.GazetteerException;
import gate.creole.gazetteer.GazetteerList;
import gate.creole.gazetteer.LinearDefinition;
import gate.creole.gazetteer.LinearNode;
import gate.creole.gazetteer.Lookup;
import gate.creole.gazetteer.MappingNode;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.HiddenCreoleParameter;
import gate.creole.metadata.RunTime;
import gate.creole.metadata.Sharable;
import gate.util.InvalidOffsetException;
import gate.util.LuckyException;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.ontotext.russie.RussIEConstants;

/**
 * Hash-map based gazetteer for Russian text.
 * <p>
 * Phrases are stored in a list of maps: the map at index <i>k</i> is keyed by
 * the first <i>k + 1</i> whitespace-separated words of every known phrase. A
 * key maps to the list of {@link Lookup}s of the phrases ending at that word,
 * or to {@code null} when the key is only a prefix of longer phrases. Every
 * phrase is additionally registered in its all-upper-case, first-letter-capital
 * and vowel-truncated ("stemmed") variants, see {@link #add(String, Lookup)}.
 *
 * @author borislav popov
 * @version 1.0
 */
@CreoleResource(name = "Russian Gazetteer", icon = "shefGazetteer", comment = "Customised version of the hash gazetteer", helpURL = "http://gate.ac.uk/userguide/sec:misc-creole:language-plugins:russian")
@SuppressWarnings({"rawtypes","unchecked"})
public class RussGazetteer extends AbstractGazetteer implements RussIEConstants {

  private static final long serialVersionUID = -5174914553200046785L;

  /**
   * Debug flag
   */
  protected static final boolean DEBUG = false;

  /**
   * majorType feature static representation
   */
  protected static final String MAJOR_TYPE_STR = "majorType";

  /**
   * minorType feature static representation
   */
  protected static final String MINOR_TYPE_STR = "minorType";

  /**
   * language feature static representation
   */
  protected static final String LANGUAGE = "language";

  /**
   * "Lookup" static
   */
  protected static final String LOOKUP = "Lookup";

  /**
   * Prefix of the status message fired when a document starts being processed.
   */
  protected static final String DOING_LOOKUP_IN = "Doing lookup in ";

  /**
   * empty string
   */
  protected static final String EMPTY_STR = "";

  /**
   * dots ... string
   */
  protected static final String DOTS = "...";

  /**
   * A single backslash character (the Java literal is escaped).
   */
  protected static final String SLASH_SLASH = "\\";

  /**
   * "Reading " string
   */
  protected static final String READING = "Reading ";

  /**
   * A dot string = "."
   */
  protected static final String DOT = ".";

  /** a map of nodes vs gaz lists */
  protected Map listsByNode;

  /**
   * A list of all the maps representing the first words, first_second phrases,
   * etc. Each map's value might be an ArrayList of Lookup objects specifying
   * categories tied to this word/phrase.
   */
  protected List<Map> mapsList;

  /**
   * size of the mapsList
   */
  protected int mapsListSize = 0;

  /**
   * a list of references to Lookup objs representing the categories.
   */
  protected ArrayList<Lookup> categoryList = null;

  /**
   * Creates an unconfigured gazetteer; {@link #init()} must be called before
   * it can be used.
   */
  public RussGazetteer() {
  }

  /**
   * Does the actual loading and parsing of the lists. This method must be
   * called before the gazetteer can be used.
   * <p>
   * When {@link #mapsList} has already been set (see
   * {@link #setMapsList(List)}) nothing is loaded and only
   * {@link #mapsListSize} is refreshed.
   *
   * @throws ResourceInstantiationException
   *           if no lists URL was provided or loading a list fails
   * @return returns this resource
   */
  public Resource init() throws ResourceInstantiationException {
    if(mapsList != null) {
      // this is a duplicate: the maps were injected, just sync the size
      mapsListSize = mapsList.size();
    } else {
      mapsList = new ArrayList<Map>(10);
      // check if there's a list URL
      if(listsURL == null) {
        throw new ResourceInstantiationException(
          "No URL provided for gazetteer creation!");
      } // if
      try {
        definition = new LinearDefinition();
        definition.setURL(listsURL);
        definition.load();
        int linesCnt = definition.size();
        listsByNode = definition.loadLists();
        // allocate the hashmap for the first words from the phrases
        mapsList.add(new HashMap(linesCnt * 10));
        mapsListSize = mapsList.size();
        // allocate the category list, one slot per definition line to start
        categoryList = new ArrayList<Lookup>(linesCnt + 1);
        Iterator<LinearNode> inodes = definition.iterator();
        LinearNode node;
        int nodeIdx = 0;
        while(inodes.hasNext()) {
          node = inodes.next();
          fireStatusChanged(READING + node.toString());
          fireProgressChanged(++nodeIdx * 100 / linesCnt);
          readList(node, true);
        } // while
        fireProcessFinished();
      } catch(Exception x) {
        throw new ResourceInstantiationException(x);
      } // catch
    }
    return this;
  } // Resource init()throws ResourceInstantiationException

  /**
   * Re-initialize this gazetteer by re-loading the configuration.
   * Clearing {@link #mapsList} forces {@link #init()} onto its loading path.
   */
  public void reInit() throws ResourceInstantiationException {
    mapsList = null;
    categoryList = null;
    init();
  }

  /**
   * Reads the entries of one gazetteer list and registers them, tied to a
   * {@link Lookup} built from the node's list name, types and language.
   *
   * @param node
   *          a linear node (line from the linear definition)
   * @param add
   *          if <b>true</b> the phrases found in the list are added to the
   *          ones recognised by this gazetteer; <b>false</b> is meant to
   *          remove them, but removal is not implemented and is a no-op
   * @throws GazetteerException
   *           if {@code node} is null or no list is registered for it
   */
  void readList(LinearNode node, boolean add) throws GazetteerException {
    String listName, majorType, minorType, languages;
    if(null == node) {
      throw new GazetteerException(" LinearNode node is null ");
    }
    listName = node.getList();
    majorType = node.getMajorType();
    minorType = node.getMinorType();
    languages = node.getLanguage();
    GazetteerList gazList = (GazetteerList)listsByNode.get(node);
    if(null == gazList) {
      throw new GazetteerException("gazetteer list not found by node");
    }
    // create a lookup object for the current category
    Lookup lookup = new Lookup(listName, majorType, minorType, languages);
    if(null != mappingDefinition) {
      MappingNode mnode = mappingDefinition.getNodeByList(listName);
      if(null != mnode) {
        lookup.oClass = mnode.getClassID();
        lookup.ontology = mnode.getOntologyID();
      }
    }// if mapping def
    lookup.list = listName;
    if(add) {
      // add the lines of the list to the gazetteer
      Iterator iline = gazList.iterator();
      while(iline.hasNext()) {
        this.add(iline.next().toString(), lookup);
      } // while there are lines to be processed
    }
    // else: removing the lines of a list is currently not implemented
  } // void readList(LinearNode node, boolean add)

  /**
   * This method runs the gazetteer. It parses the document and looks-up the
   * parsed phrases from the maps, in which the phrases vs. annotations are set,
   * in order to generate an annotation set. It assumes that all the needed
   * parameters are set. If they are not, an exception will be fired.
   *
   * @throws ExecutionException
   *           if no document has been set
   */
  public void execute() throws ExecutionException {
    AnnotationSet annotationSet;
    // check the input
    if(document == null) {
      throw new ExecutionException("No document to process!");
    } // if document is null
    if(annotationSetName == null || annotationSetName.equals(EMPTY_STR))
      annotationSet = document.getAnnotations();
    else annotationSet = document.getAnnotations(annotationSetName);
    // the status message must not fail on a document without a source URL,
    // so fall back to the document name in that case
    String docLabel = document.getSourceUrl() == null
      ? document.getName()
      : document.getSourceUrl().getFile();
    fireStatusChanged(DOING_LOOKUP_IN + docLabel + DOTS);
    // get the content of the document and its length
    String content = document.getContent().toString();
    // init some params
    int length = content.length();
    int matchedRegionEnd = 0;
    int matchedRegionStart = 0;
    // word start
    int iwordStart = 0;
    int iend = 0;
    int secondWordStart = 0;
    String phrase = EMPTY_STR;
    int mapIndex = 0;
    FeatureMap fm;
    Map currentMap = new HashMap();
    List currentLookup = null;
    // whether the current word is the first in the phrase
    boolean firstWord = true;
    // NOTE(review): this flag is only ever assigned false in this method, so
    // the "punctuation zone" branches below are never taken - confirm whether
    // that is intended.
    boolean punctuationZone = false;
    // letter to number or number to letter transition zone
    boolean l2nORn2lZone = false;
    char currentChar = 0;
    int typeWeight = 0;
    // note that the code within the next cycle is overwhelmed by complexity
    // this said status was reached because what actually this cycle does
    // is ad hoc tokenization of ... guess what ... phrases
    // if you're still not sure this is a complex task-here are some examples :
    // Cable & Wireless
    // Moody's
    // C.V.
    // C.de R.L.
    // C.por A.
    // AG & Co KG
    // G.S.C.
    // S.A.R.L.
    // etc...
    // so in this cycle we monitor several flags and several indexes
    // they all should speak for themselves e.g. boolean firstWord
    // boolean punctuationZone tells us whether we're within the context of a
    // punctuation sign (e.g. '.','&',';')
    // length is the length of the doc.
    // iwordStart is the starting index within the doc of the current 'word'
    // being processed
    // iend is the end of the current word/phrase
    // content is the content of the doc
    // mapIndex is the index of the current map, this is also the index of the
    // current 'part' of the phrase(parts are groups of characters separated by
    // whitespaces,and in some cases by punctuation)
    // matchedRegion(Start/End) denote the start/end of an already looked-up
    // phrase( annotated )
    // secondWordStart is the index of the second word in the currently
    // processed phrase
    // l2nORn2lZone indicates whether we're in a zone where
    // transition from letter to number or number to letter is present
    // e.g. GBP100 the "P1" is such a zone
    // typeWeight is the summed weight of the types of the last two chars
    while(iwordStart < length) {
      if(firstWord)
        // ultimately : starts the new lookup operation from
        // the second word of the previous phrase
        iwordStart = secondWordStart;
      else iwordStart = iend;
      // additional check
      if(iwordStart >= length) break;
      // get the beginning of the word
      while(iwordStart < length
        && (Character.isWhitespace(content.charAt(iwordStart)) || isWhiteSpacePunctuation(content
          .charAt(iwordStart)))) {
        iwordStart++;
      } // while find start of word
      // get the end of the word
      iend = iwordStart + 1;
      // a word starting on the very last character is not looked up
      if(iend >= length) break;
      // do while punctuation
      do {
        int currCharInt;
        if(punctuationZone) {
          currentChar = content.charAt(iend);
        } else
          while(iend < length
            && (Character.isLetterOrDigit(currentChar = content.charAt(iend)) ||
            // handling for ch and etc. cyrillic letters that fail the above
            // check
            // NOTE(review): 215/168/247/184 are presumably the CP1251 codes
            // of such letters in content decoded as Latin-1 - confirm
            ((215 == (currCharInt = currentChar)) || (currCharInt == 168)
              || (currCharInt == 247) || (currCharInt == 184)))
            || ((isDashOrQuotePunctuation(currentChar)) && (Character
              .isWhitespace(content.charAt(iend - 1)) || isWhiteSpacePunctuation(content
              .charAt(iend - 1))))) {
            // check whether the neighbouring chars are letter number
            // or number letter; e.g. UPPERCASE_LETTER(1) or
            // LOWERCASE_LETTER(2) plus DECIMAL_DIGIT_NUMBER(9) sum to 10 / 11
            // (other type pairs with the same sum match as well)
            typeWeight =
              Character.getType(currentChar)
                + Character.getType(content.charAt(iend - 1));
            if(l2nORn2lZone = (typeWeight == 10) || (typeWeight == 11)) {
              break;
            } // if l2nORn2lZone
            iend++;
          } // while find end of word
        // build phrase
        if(firstWord) {
          phrase = content.substring(iwordStart, iend);
          // maintain the case of a token without whitespaces
          // but beginning with punctuation e.g. : "The or &BT
          // in this case we should set the start of the next word to
          // iwordStart + 1;
          // it is the same when l2nORn2lZone = true
          if((isDashOrQuotePunctuation(content.charAt(iwordStart)))
            || l2nORn2lZone) {
            secondWordStart = iwordStart + 1;
          } else {
            secondWordStart = iend;
          } // else
          matchedRegionStart = iwordStart;
          mapIndex = 0;
          firstWord = false;
        } else {
          if(punctuationZone || l2nORn2lZone) {
            // close a punctuation zone or a l2nORn2lZone ?
            if(Character.isWhitespace(currentChar)
              || isWhiteSpacePunctuation(currentChar)) {
              punctuationZone = false;
              l2nORn2lZone = false;
              mapIndex++;
              break;
            } // if it's whitespace
            else {
              phrase = phrase + currentChar;
              iend++;
            } // still in punctuation zone
          } else {
            phrase = phrase + ' ' + content.substring(iwordStart, iend);
          } // not a punctuation zone neither l2nORn2lZone
        } // not a first word
        // determine punctuation zone
        if(isDashOrQuotePunctuation(currentChar)
          && !Character.isWhitespace(content.charAt(iend - 1))) {
          firstWord = false;
        } // if punctuation
        // determine l2nORn2lZone
        typeWeight =
          Character.getType(currentChar)
            + Character.getType(content.charAt(iend - 1));
        if((typeWeight == 10) || (typeWeight == 11)) {
          l2nORn2lZone = true;
          firstWord = false;
        } // if typeWeight ...
        // check mapindex's validity
        if(mapIndex >= mapsListSize) {
          firstWord = true;
          punctuationZone = false;
          l2nORn2lZone = false;
          continue;
        } // if mapindex out of bounds
        // try to find it in the dark cave ...
        currentMap = mapsList.get(mapIndex);
        // the maps also hold the stemmed variants (see add()), so the
        // candidate phrase is stemmed the same way before the look-up;
        // if found in current map then set matchedRegion
        phrase = trunxSuffixVowelsFromPhrase(phrase);
        if(currentMap.containsKey(phrase)) {
          currentLookup = (ArrayList)currentMap.get(phrase);
          // a null value marks a pure prefix of longer phrases: no annotation
          if(null != currentLookup) {
            matchedRegionEnd = iend;
            // generate lookups for the phrase so far
            {
              Iterator lookupIter = currentLookup.iterator();
              Lookup lookup;
              while(lookupIter.hasNext()) {
                lookup = (Lookup)lookupIter.next();
                fm = Factory.newFeatureMap();
                fm.put(MAJOR_TYPE_STR, lookup.majorType);
                if(null != lookup.minorType)
                  fm.put(MINOR_TYPE_STR, lookup.minorType);
                if(null != lookup.languages)
                  fm.put(LANGUAGE, lookup.languages);
                try {
                  annotationSet.add(Long.valueOf(matchedRegionStart),
                    Long.valueOf(matchedRegionEnd), LOOKUP_ANNOTATION_TYPE, fm);
                } catch(InvalidOffsetException ioe) {
                  throw new LuckyException(ioe.toString());
                } // catch
              }// while(lookupIter.hasNext())
            } // generate lookups for the phrase so far
          } // if not null lookup
        } else {
          if(!punctuationZone && !l2nORn2lZone) {
            // the map doesn't contain the key: restart from the second word
            iend = secondWordStart;
            firstWord = true;
            continue;
          } // if critical iteration
        } // else
        // jump to the next map only if not within a punctuation context
        // neither in a l2nORn2l Zone
        if(!punctuationZone && !l2nORn2lZone) ++mapIndex;
        // if the current map index reached the size of the map list
        if(mapIndex >= mapsListSize) {
          firstWord = true;
          punctuationZone = false;
          continue;
        } // if mapIndex out of bounds
      } while((punctuationZone || l2nORn2lZone) && iend < length);
      // if end of boundaries for iend reached then set exclusively firstWord
      // this will cause the program to start from secondWordStart or
      // matchedRegionEnd indexes.
      if(iend >= length || iwordStart >= length) {
        iend = secondWordStart;
        firstWord = true;
        // last change
        punctuationZone = false;
      } // if iend out of boundaries
    } // while within content
    fireProcessFinished();
    fireStatusChanged("Gazetteer processing finished!");
  } // execute ()

  /**
   * Looks an entry up in the maps, in map order, and returns the lookups of
   * the first map that holds a non-empty lookup list for it.
   *
   * @param singleItem
   *          the exact key to search for (no stemming or case folding is
   *          applied here)
   * @return a new set with the lookups found, or {@code null} if there is no
   *         non-empty lookup list for the item
   */
  public Set lookup(String singleItem) {
    for(int li = 0; li < mapsListSize; li++) {
      List lookupList = (List)mapsList.get(li).get(singleItem);
      if(lookupList != null && !lookupList.isEmpty()) {
        return new HashSet(lookupList);
      }
    } // for lists
    return null;
  }

  /**
   * Removes the given key from the first map that contains it. Variants
   * registered by {@link #add(String, Lookup)} under other keys are not
   * touched.
   *
   * @param singleItem
   *          the exact key to remove
   * @return {@code true} if a map contained the key
   */
  public boolean remove(String singleItem) {
    for(int i = 0; i < mapsListSize; i++) {
      Map map = mapsList.get(i);
      // containsKey is required: prefix entries are stored with a null value
      if(map.containsKey(singleItem)) {
        map.remove(singleItem);
        return true;
      }
    } // for lists
    return false;
  }

  /**
   * Registers a phrase for the given lookup. Before the phrase itself is
   * stored, its all-upper-case form, its first-letter-capital form and its
   * vowel-truncated form are registered recursively (each only when it differs
   * from the phrase, which ends the recursion).
   *
   * @param singleItem
   *          the word or phrase to add
   * @param lookup
   *          the category to associate with the phrase
   * @return always {@code true}
   */
  public boolean add(String singleItem, Lookup lookup) {
    // ALL-UPPER-CASE SUPPORT
    String upper = singleItem.toUpperCase();
    if(!upper.equals(singleItem)) {
      this.add(upper, lookup);
    } // the inequality check avoids endless recursion

    // if the item is not with first capital - make it with first capital and
    // add it
    if(singleItem.length() > 1) {
      String firstLetter = singleItem.substring(0, 1);
      if(!firstLetter.equals(firstLetter.toUpperCase())) {
        this.add(firstLetter.toUpperCase() + singleItem.substring(1), lookup);
      }
    }

    // stems the word (or words in phrase) and adds it as a new gaz entry if it
    // differs from the currently being added phrase/word
    String stem = trunxSuffixVowelsFromPhrase(singleItem);
    if(!stem.equals(singleItem)) {
      this.add(stem, lookup);
    }

    // add the lookup to the category list
    categoryList.add(lookup);

    String line = singleItem.trim();
    int length = line.length();
    int mapIndex = -1;
    String word = null;
    // stays a throw-away map when the trimmed phrase is empty
    Map<String, List<Lookup>> currentMap = new HashMap<String, List<Lookup>>();
    for(int lineIndex = 0; lineIndex < length; lineIndex++) {
      if((lineIndex + 1 == length)
        || (Character.isWhitespace(line.charAt(lineIndex)))) {
        // if not whitespace but end of line then the index should equal the
        // length
        if(lineIndex + 1 == length) lineIndex = length;
        // the key is the whole phrase prefix up to the current word
        word = line.substring(0, lineIndex).trim();
        // if the map doesn't exist : create it
        ++mapIndex;
        if(mapsListSize <= mapIndex) {
          mapsList.add(new HashMap());
          mapsListSize++;
        } // if the map doesn't exist
        // get the map and add the word to the map
        currentMap = (mapsList.get(mapIndex));
        // if there isn't such a prefix yet : add it with a null value
        if(!currentMap.containsKey(word)) {
          currentMap.put(word, null);
        } // add the word
      } // if whitespace
    } // for line iterate

    // put the lookup in the last map, under the complete phrase
    List<Lookup> oldKey = currentMap.get(word);
    if(null == oldKey) {
      ArrayList<Lookup> key = new ArrayList<Lookup>(1);
      key.add(lookup);
      currentMap.put(word, key);
    } else {
      // merge with the lookups already stored, skipping a duplicate; every
      // stored lookup is checked, and the scan stops at the first match
      ArrayList<Lookup> mergedKey = new ArrayList<Lookup>(oldKey);
      boolean duplicity = false;
      for(int i = 0; i < mergedKey.size() && !duplicity; i++) {
        duplicity = mergedKey.get(i).equals(lookup);
      } // for i
      if(!duplicity) mergedKey.add(lookup);
      // put the merged key in the map
      currentMap.put(word, mergedKey);
    } // else
    return true;
  } // add

  /**
   * Tells whether the character is a dash, an initial or final quote, or a
   * dot.
   */
  private boolean isDashOrQuotePunctuation(char ch) {
    int type = Character.getType(ch);
    return Character.DASH_PUNCTUATION == type
      || Character.INITIAL_QUOTE_PUNCTUATION == type
      || Character.FINAL_QUOTE_PUNCTUATION == type || ch == '.';
  } // isDashOrQuotePunctuation(ch)

  /**
   * Tells whether the character is punctuation that the tokenisation treats
   * like whitespace: "other", connector, start or end punctuation, with the
   * dot excluded.
   */
  private boolean isWhiteSpacePunctuation(char ch) {
    int type = Character.getType(ch);
    return (Character.OTHER_PUNCTUATION == type
      || Character.CONNECTOR_PUNCTUATION == type
      || Character.START_PUNCTUATION == type
      || Character.END_PUNCTUATION == type) && (ch != '.');
  } // isWhiteSpacePunctuation(ch)

  /**
   * Applies {@link #trunxSuffixVowelsFromWord(String)} to every
   * whitespace-separated word of the phrase.
   *
   * @param phrase
   *          the phrase to stem
   * @return the stemmed words joined by single spaces, trimmed
   */
  public String trunxSuffixVowelsFromPhrase(String phrase) {
    int length = phrase.length();
    StringBuilder stem = new StringBuilder();
    int lastWordEnd = 0;
    for(int lineIndex = 0; lineIndex < length; lineIndex++) {
      if((lineIndex + 1 == length)
        || (Character.isWhitespace(phrase.charAt(lineIndex)))) {
        // if not whitespace but end of line then the index should equal the
        // length
        if(lineIndex + 1 == length) lineIndex = length;
        // get the word between the previous boundary and this one
        String justWord = phrase.substring(lastWordEnd, lineIndex).trim();
        stem.append(trunxSuffixVowelsFromWord(justWord)).append(" ");
        lastWordEnd = lineIndex;
      } // if whitespace
    } // for line iterate
    return stem.toString().trim();
  } // trunxSuffixVowelsFromPhrase()

  /**
   * Strips trailing vowels from a word. At most {@code maxTruncatedVowels}
   * characters are removed and stripping stops once the word is no longer
   * than {@code minWordLength} or its last character is not in
   * {@code SET_OF_VOWELS}.
   *
   * @param word
   *          the word to stem
   * @return the word without the stripped trailing vowels
   */
  public String trunxSuffixVowelsFromWord(String word) {
    int len = word.length();
    String lastCh;
    int trunxCount = 0;
    while(len > minWordLength && trunxCount < maxTruncatedVowels) {
      lastCh = word.substring(len - 1);
      if(SET_OF_VOWELS.contains(lastCh)) {
        word = word.substring(0, len - 1);
        trunxCount++;
      } else {
        return word;
      } // not a vowel
      len--;
    }
    return word;
  } // trunxSuffixVowelsFromWord()

  /**
   * Stores the flag; declared as a hidden parameter that this gazetteer does
   * not support.
   */
  @HiddenCreoleParameter
  @CreoleParameter(comment="not supported by this gazetteer", defaultValue="true")
  public void setCaseSensitive(Boolean newCaseSensitive) {
    caseSensitive = newCaseSensitive;
  }

  /**
   * Stores the flag; declared as a hidden parameter that this gazetteer does
   * not support.
   */
  @HiddenCreoleParameter
  @RunTime
  @CreoleParameter(comment="not supported by this gazetteer", defaultValue="true")
  public void setLongestMatchOnly(Boolean longestMatchOnly) {
    this.longestMatchOnly = longestMatchOnly;
  }

  /**
   * Stores the flag; declared as a hidden parameter that this gazetteer does
   * not support.
   */
  @HiddenCreoleParameter
  @RunTime
  @CreoleParameter(comment="not supported by this gazetteer", defaultValue="true")
  public void setWholeWordsOnly(Boolean wholeWordsOnly) {
    this.wholeWordsOnly = wholeWordsOnly;
  }

  /**
   * For internal use by the duplication mechanism.
   */
  @Sharable
  public void setMapsList(List<Map> mapsList) {
    this.mapsList = mapsList;
  }

  /**
   * For internal use by the duplication mechanism.
   */
  public List<Map> getMapsList() {
    return mapsList;
  }

  /**
   * For internal use by the duplication mechanism.
   */
  @Sharable
  public void setCategoryList(ArrayList<Lookup> categoryList) {
    this.categoryList = categoryList;
  }

  /**
   * For internal use by the duplication mechanism.
   */
  public ArrayList<Lookup> getCategoryList() {
    return categoryList;
  }
} // class Russ Gazetteer