/**
* RussGazetteer.java This one is based on the HashGazetteer with additional
* features : ALL-CAPS recognition; multiple overlapping lookups generation.
* <p>
* Title: RussIE
* </p>
* <p>
* Description: Russian Information Extraction based on GATE
* </p>
* <p>
* Copyright: Copyright (c) 2003
* </p>
* <p>
* Company: Ontotext Lab.
* </p>
*
* @author unascribed
* @version 1.0 This file is a part of the processing resources provided by
* OntoText Lab, a part of Sirma Artificial Intelligence Labs. The
* software and this file are licenced. A copy of the licence is
* included in the distribution in the file licence.ontotext.html, and
* is also available at
* http://www.ontotext.com/gate/licence.ontotext.html borislav popov,
* 08/11/2001 $Id: RussGazetteer.java 16342 2013-03-13 17:11:12Z ian $
*/
package com.ontotext.russie.gazetteer;
import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.Resource;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.gazetteer.AbstractGazetteer;
import gate.creole.gazetteer.GazetteerException;
import gate.creole.gazetteer.GazetteerList;
import gate.creole.gazetteer.LinearDefinition;
import gate.creole.gazetteer.LinearNode;
import gate.creole.gazetteer.Lookup;
import gate.creole.gazetteer.MappingNode;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.HiddenCreoleParameter;
import gate.creole.metadata.RunTime;
import gate.creole.metadata.Sharable;
import gate.util.InvalidOffsetException;
import gate.util.LuckyException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.ontotext.russie.RussIEConstants;
/**
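* Gazetteer for Russian text, based on the HashGazetteer, that additionally
* registers ALL-CAPS, first-capital and vowel-truncated variants of every
* entry and generates Lookup annotations for overlapping phrase matches.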
*
* @author borislav popov
* @version 1.0
*/
@CreoleResource(name = "Russian Gazetteer", icon = "shefGazetteer",
comment = "Customised version of the hash gazetteer",
helpURL = "http://gate.ac.uk/userguide/sec:misc-creole:language-plugins:russian")
@SuppressWarnings({"rawtypes","unchecked"})
public class RussGazetteer extends AbstractGazetteer implements RussIEConstants {
private static final long serialVersionUID = -5174914553200046785L;
/**
* Debug flag; not referenced by the code in this class.
*/
protected static final boolean DEBUG = false;
/**
* Feature name under which execute() stores a Lookup's major type.
*/
protected static final String MAJOR_TYPE_STR = "majorType";
/**
* Feature name under which execute() stores a Lookup's minor type (only
* written when the minor type is not null).
*/
protected static final String MINOR_TYPE_STR = "minorType";
/**
* Feature name under which execute() stores a Lookup's languages (only
* written when the value is not null).
*/
protected static final String LANGUAGE = "language";
/**
* The string "Lookup"; not referenced by the code in this class (execute()
* uses LOOKUP_ANNOTATION_TYPE as the annotation type).
*/
protected static final String LOOKUP = "Lookup";
/**
* Prefix of the status message fired at the start of execute().
*/
protected static final String DOING_LOOKUP_IN = "Doing lookup in ";
/**
* The empty string; execute() compares the annotation set name against it.
*/
protected static final String EMPTY_STR = "";
/**
* Suffix ("...") of the status message fired at the start of execute().
*/
protected static final String DOTS = "...";
/**
* A string holding one backslash character (the literal is escaped in the
* source); not referenced by the code in this class.
*/
protected static final String SLASH_SLASH = "\\";
/**
* Prefix of the status message fired by init() for every list being read.
*/
protected static final String READING = "Reading ";
/**
* A string holding a single dot; not referenced by the code in this class.
*/
protected static final String DOT = ".";
/**
* Map from a LinearNode of the definition to its GazetteerList; assigned in
* init() from definition.loadLists() and read by readList().
*/
protected Map listsByNode;
/**
* One map per phrase depth: the map at index 0 is keyed by the first word of
* every entry, the map at index 1 by the first two words, and so on (see
* add()). A value is either null, meaning the key is only a prefix of a
* longer entry, or an ArrayList of the Lookup objects registered for exactly
* that phrase.
*/
protected List<Map> mapsList;
/**
* Cached size of mapsList; kept in sync by init() and add().
*/
protected int mapsListSize = 0;
/**
* Every Lookup passed to add(), appended once per call (including the
* recursive calls add() makes for the derived variants of an entry).
*/
protected ArrayList<Lookup> categoryList = null;
/**
 * Creates an empty, not yet initialised gazetteer. No lists are loaded here;
 * loading happens in {@link #init()}.
 */
public RussGazetteer() {
  super();
}
/**
 * Does the actual loading and parsing of the lists. This method must be
 * called before the gazetteer can be used.
 * <p>
 * If {@code mapsList} is already set (the instance was populated through
 * {@link #setMapsList(List)}, i.e. it is a duplicate) nothing is loaded and
 * only the cached map count is refreshed. Otherwise the linear definition at
 * {@code listsURL} is loaded and every list it names is read into the maps.
 * <p>
 * A failed load discards the partially built maps, so that a later call is
 * not mistaken for the "duplicate" case and does not silently succeed with
 * incomplete data.
 *
 * @return this resource
 * @throws ResourceInstantiationException
 *           if no lists URL was provided, or wrapping any exception raised
 *           while loading the definition or reading the lists
 */
public Resource init() throws ResourceInstantiationException {
  if(mapsList != null) {
    // this is a duplicate: the maps were shared in, just sync the cached size
    mapsListSize = mapsList.size();
    return this;
  }
  // validate before allocating anything, so a failure leaves no state behind
  if(listsURL == null) {
    throw new ResourceInstantiationException(
      "No URL provided for gazetteer creation!");
  }
  mapsList = new ArrayList<Map>(10);
  try {
    definition = new LinearDefinition();
    definition.setURL(listsURL);
    definition.load();
    int linesCnt = definition.size();
    listsByNode = definition.loadLists();
    // allocate the hashmap for the first words from the phrases
    mapsList.add(new HashMap(linesCnt * 10));
    mapsListSize = mapsList.size();
    // one slot per list plus one; add() appends to this list
    categoryList = new ArrayList<Lookup>(linesCnt + 1);
    Iterator<LinearNode> inodes = definition.iterator();
    int nodeIdx = 0;
    while(inodes.hasNext()) {
      LinearNode node = inodes.next();
      fireStatusChanged(READING + node.toString());
      // linesCnt cannot be 0 here: the loop body only runs if a node exists
      fireProgressChanged(++nodeIdx * 100 / linesCnt);
      readList(node, true);
    } // while
    fireProcessFinished();
  } catch(Exception x) {
    // drop the half-built state; otherwise the next init() would take the
    // "duplicate" shortcut above and report success
    mapsList = null;
    categoryList = null;
    mapsListSize = 0;
    throw new ResourceInstantiationException(x);
  } // catch
  return this;
} // Resource init()throws ResourceInstantiationException
/**
 * Re-initialises this gazetteer: both the phrase maps and the category list
 * are dropped and {@link #init()} is run again.
 *
 * @throws ResourceInstantiationException
 *           propagated from {@link #init()}
 */
public void reInit() throws ResourceInstantiationException {
  // with mapsList cleared, init() takes its loading branch instead of the
  // "duplicate" shortcut
  this.categoryList = null;
  this.mapsList = null;
  this.init();
}
/**
 * Reads the phrases (lines) of one gazetteer list and registers each of them
 * in the phrase maps together with a Lookup describing the list.
 *
 * @param node
 *          a linear node (one line of the linear definition)
 * @param add
 *          if <b>true</b> the phrases of the list are added to the ones
 *          recognised by this gazetteer; if <b>false</b> nothing is changed,
 *          because removing the phrases of a list is not implemented
 * @throws GazetteerException
 *           if {@code node} is null or no gazetteer list is registered for
 *           it in {@code listsByNode}
 */
void readList(LinearNode node, boolean add) throws GazetteerException {
  if(null == node) { throw new GazetteerException(" LinearNode node is null "); }
  String listName = node.getList();
  String majorType = node.getMajorType();
  String minorType = node.getMinorType();
  String languages = node.getLanguage();
  GazetteerList gazList = (GazetteerList)listsByNode.get(node);
  if(null == gazList) {
    throw new GazetteerException("gazetteer list not found by node");
  }
  // create a lookup object for the current category
  Lookup lookup = new Lookup(listName, majorType, minorType, languages);
  if(null != mappingDefinition) {
    // copy the ontology mapping of this list, if one is defined
    MappingNode mnode = mappingDefinition.getNodeByList(listName);
    if(null != mnode) {
      lookup.oClass = mnode.getClassID();
      lookup.ontology = mnode.getOntologyID();
    }
  } // if mapping def
  lookup.list = listName;
  if(!add) {
    // currently no implementation of remove
    return;
  }
  // add every line of the list to the gazetteer
  Iterator iline = gazList.iterator();
  while(iline.hasNext()) {
    this.add(iline.next().toString(), lookup);
  } // while there are lines to be processed
} // void readList(LinearNode node, boolean add)
/**
 * Runs the gazetteer over the current document. The document text is split
 * ad hoc into words, phrases are grown word by word and looked up in the
 * maps of {@code mapsList} (map index = number of words in the phrase minus
 * one), and for every phrase that has Lookup objects attached one annotation
 * of type LOOKUP_ANNOTATION_TYPE is added per Lookup. Annotations go to the
 * default annotation set when {@code annotationSetName} is null or empty,
 * otherwise to the named set.
 *
 * @throws ExecutionException
 *           if no document has been set
 */
public void execute() throws ExecutionException {
  AnnotationSet annotationSet;
  // check the input
  if(document == null) {
    throw new ExecutionException("No document to process!");
  } // if document is null
  if(annotationSetName == null || annotationSetName.equals(EMPTY_STR))
    annotationSet = document.getAnnotations();
  else annotationSet = document.getAnnotations(annotationSetName);
  // a document need not have a source URL; fall back to its name
  // instead of failing with a NullPointerException on the status message
  String sourceLabel =
    document.getSourceUrl() == null ? document.getName() : document
      .getSourceUrl().getFile();
  fireStatusChanged(DOING_LOOKUP_IN + sourceLabel + DOTS);
  // get the content of the document and its length
  String content = document.getContent().toString();
  // init some params
  int length = content.length();
  int matchedRegionEnd = 0;
  int matchedRegionStart = 0;
  // word start
  int iwordStart = 0;
  int iend = 0;
  int secondWordStart = 0;
  String phrase = EMPTY_STR;
  int mapIndex = 0;
  FeatureMap fm;
  Map currentMap = new HashMap();
  List currentLookup = null;
  // whether the current word is the first in the phrase
  boolean firstWord = true;
  // NOTE(review): nothing in this method ever sets punctuationZone to true,
  // so every branch guarded by it is currently inactive - confirm whether
  // the "determine punctuation zone" step below was meant to set it
  boolean punctuationZone = false;
  // letter to number or number to letter transition zone
  boolean l2nORn2lZone = false;
  char currentChar = 0;
  int typeWeight = 0;
  // The loop below does ad hoc tokenization of phrases, which is why it
  // tracks so many flags and indexes. Examples of the phrases it has to
  // cope with:
  // Cable & Wireless
  // Moody's
  // C.V.
  // C.de R.L.
  // C.por A.
  // AG & Co KG
  // G.S.C.
  // S.A.R.L.
  // etc...
  // State used by the loop:
  // punctuationZone tells whether we are within the context of a
  // punctuation sign (e.g. '.','&',';')
  // length is the length of the doc.
  // iwordStart is the starting index within the doc of the current 'word'
  // being processed
  // iend is the end of the current word/phrase
  // content is the content of the doc
  // mapIndex is the index of the current map, this is also the index of the
  // current 'part' of the phrase (parts are groups of characters separated
  // by whitespaces, and in some cases by punctuation)
  // matchedRegion(Start/End) denote the start/end of an already looked-up
  // (annotated) phrase
  // secondWordStart is the index of the second word in the currently
  // processed phrase
  // l2nORn2lZone indicates whether we're in a zone where a transition from
  // letter to number or number to letter is present,
  // e.g. in GBP100 the "P1" is such a zone
  // typeWeight is the summed Character.getType() value of the last two
  // chars; 10 and 11 are the sums for upper-case letter + decimal digit
  // (1 + 9) and lower-case letter + decimal digit (2 + 9).
  // NOTE(review): other category pairs give the same sums (e.g. two
  // OTHER_LETTER characters, 5 + 5) - confirm this is acceptable for the
  // scripts being processed
  while(iwordStart < length) {
    if(firstWord)
      // ultimately : starts the new lookup operation from
      // the second word of the previous phrase
      iwordStart = secondWordStart;
    else iwordStart = iend;
    // additional check
    if(iwordStart >= length) break;
    // get the beginning of the word
    while(iwordStart < length &&
      (Character.isWhitespace(content.charAt(iwordStart)) || isWhiteSpacePunctuation(content
        .charAt(iwordStart)))) {
      iwordStart++;
    } // while find start of word
    // get the end of the word
    iend = iwordStart + 1;
    // NOTE(review): this also stops the scan when the word starts on the
    // very last character, so a one-character final token is never looked up
    if(iend >= length) break;
    // do while punctuation
    do {
      int currCharInt;
      if(punctuationZone) {
        currentChar = content.charAt(iend);
      } else while(iend < length &&
        (Character.isLetterOrDigit(currentChar = content.charAt(iend)) ||
        // handling for ch and etc. cyrillic letters that fail the above check
        // NOTE(review): 215, 168, 247 and 184 are not letters as Unicode
        // code points; presumably they stand for Cyrillic letters of a
        // single-byte code page in wrongly decoded text - confirm
        ((215 == (currCharInt = currentChar)) || (currCharInt == 168) ||
        (currCharInt == 247) || (currCharInt == 184))) ||
        ((isDashOrQuotePunctuation(currentChar)) && (Character
          .isWhitespace(content.charAt(iend - 1)) || isWhiteSpacePunctuation(content
          .charAt(iend - 1))))) {
        // check whether the neighbouring chars are letter number
        // or number letter
        typeWeight =
          Character.getType(currentChar) +
            Character.getType(content.charAt(iend - 1));
        if(l2nORn2lZone = (typeWeight == 10) || (typeWeight == 11)) {
          break;
        } // if l2nORn2lZone
        iend++;
      } // while find end of word
      // build phrase
      if(firstWord) {
        phrase = content.substring(iwordStart, iend);
        // maintain the case of a token without whitespaces
        // but beginning with punctuation e.g. : "The or &BT
        // in this case we should set the start of the next word to
        // iwordStart + 1;
        // it is the same when l2nORn2lZone = true
        if((isDashOrQuotePunctuation(content.charAt(iwordStart))) ||
          l2nORn2lZone) {
          secondWordStart = iwordStart + 1;
        } else {
          secondWordStart = iend;
        } // else
        matchedRegionStart = iwordStart;
        mapIndex = 0;
        firstWord = false;
      } else {
        if(punctuationZone || l2nORn2lZone) {
          // close a punctuation zone or a l2nORn2lZone ?
          if(Character.isWhitespace(currentChar) ||
            isWhiteSpacePunctuation(currentChar)) {
            punctuationZone = false;
            l2nORn2lZone = false;
            mapIndex++;
            break;
          } // if it's whitespace
          else {
            // still inside the zone: glue the char onto the phrase
            phrase = phrase + currentChar;
            iend++;
          } // still in punctuation zone
        } else {
          phrase = phrase + ' ' + content.substring(iwordStart, iend);
        } // not a punctuation zone neither l2nORn2lZone
      } // not a first word
      // determine punctuation zone
      if(isDashOrQuotePunctuation(currentChar) &&
        !Character.isWhitespace(content.charAt(iend - 1))) {
        firstWord = false;
      } // if punctuation
      // determine l2nORn2lZone
      typeWeight =
        Character.getType(currentChar) +
          Character.getType(content.charAt(iend - 1));
      if((typeWeight == 10) || (typeWeight == 11)) {
        l2nORn2lZone = true;
        firstWord = false;
      } // if typeWeight ...
      // check mapindex's validity
      if(mapIndex >= mapsListSize) {
        firstWord = true;
        punctuationZone = false;
        l2nORn2lZone = false;
        // in a do-while, continue jumps to the loop condition below
        continue;
      } // if mapindex out of bounds
      currentMap = mapsList.get(mapIndex);
      // the maps also hold the vowel-truncated variants of every entry, so
      // the phrase is truncated the same way before it is looked up
      phrase = trunxSuffixVowelsFromPhrase(phrase);
      // if found in current map then set matchedRegion
      if(currentMap.containsKey(phrase)) {
        currentLookup = (ArrayList)currentMap.get(phrase);
        // a null value marks a pure prefix of a longer entry
        if(null != currentLookup) {
          matchedRegionEnd = iend;
          // generate lookups for the phrase so far
          {
            Iterator lookupIter = currentLookup.iterator();
            Lookup lookup;
            while(lookupIter.hasNext()) {
              lookup = (Lookup)lookupIter.next();
              fm = Factory.newFeatureMap();
              fm.put(MAJOR_TYPE_STR, lookup.majorType);
              if(null != lookup.minorType)
                fm.put(MINOR_TYPE_STR, lookup.minorType);
              if(null != lookup.languages)
                fm.put(LANGUAGE, lookup.languages);
              try {
                annotationSet.add(Long.valueOf(matchedRegionStart), Long
                  .valueOf(matchedRegionEnd), LOOKUP_ANNOTATION_TYPE, fm);
              } catch(InvalidOffsetException ioe) {
                throw new LuckyException(ioe.toString());
              } // catch
            } // while(lookupIter.hasNext())
          } // generate lookups for the phrase so far
        } // if not null lookup
      } else {
        if(!punctuationZone && !l2nORn2lZone) {
          // the map doesn't contain the key
          iend = secondWordStart;
          firstWord = true;
          continue;
        } // if critical iteration
      } // else
      // jump to the next map only if not within a punctuation context
      // neither in a l2nORn2l Zone
      if(!punctuationZone && !l2nORn2lZone) ++mapIndex;
      // if the current map index reached the size of the map list
      if(mapIndex >= mapsListSize) {
        firstWord = true;
        punctuationZone = false;
        continue;
      } // if mapIndex out of bounds
    } while((punctuationZone || l2nORn2lZone) && iend < length);
    // if end of boundaries for iend reached then set exclusively firstWord
    // this will cause the program to start from secondWordStart or
    // matchedRegionEnd indexes.
    if(iend >= length || iwordStart >= length) {
      iend = secondWordStart;
      firstWord = true;
      // last change
      punctuationZone = false;
    } // if iend out of boundaries
  } // while within content
  fireProcessFinished();
  fireStatusChanged("Gazetteer processing finished!");
} // execute ()
/**
 * Looks up a phrase in the phrase maps, shallowest map first.
 *
 * @param singleItem
 *          the phrase to look up, exactly as stored
 * @return a new set holding the Lookup objects of the first map that has a
 *         non-empty lookup list for the phrase, or null if there is none
 */
public Set lookup(String singleItem) {
  int level = 0;
  while(level < mapsListSize) {
    // a missing key and a key stored with a null value are both skipped
    ArrayList candidates = (ArrayList)mapsList.get(level).get(singleItem);
    if(candidates != null && !candidates.isEmpty()) {
      return new HashSet(candidates);
    }
    level++;
  } // while maps
  return null;
}
/**
 * Removes a phrase from the first phrase map that contains it as a key.
 * Only that one map is touched; variants of the phrase that add() stored
 * under other keys are left in place.
 *
 * @param singleItem
 *          the key to remove, exactly as stored
 * @return true if a map contained the key and it was removed, false
 *         otherwise
 */
public boolean remove(String singleItem) {
  for(int level = 0; level < mapsListSize; level++) {
    Map entries = mapsList.get(level);
    if(!entries.containsKey(singleItem)) {
      continue;
    }
    entries.remove(singleItem);
    return true;
  } // for maps
  return false;
}
/**
 * Registers a phrase, and the variants derived from it, for the given
 * Lookup.
 * <p>
 * Before the phrase itself is stored, this method calls itself for the
 * all-upper-case form, the first-letter-capitalised form and the
 * vowel-truncated form (see {@link #trunxSuffixVowelsFromPhrase(String)}),
 * each only when it differs from the phrase. The phrase is then split on
 * whitespace: the prefix made of its first k words is put as a key into the
 * map at index k-1 (with a null value if the key is new), and the Lookup is
 * attached to the complete phrase in the last map used. A Lookup that is
 * already attached to the phrase is not attached a second time.
 * <p>
 * Case conversion uses the no-argument {@code toUpperCase()}, i.e. the
 * default locale.
 *
 * @param singleItem
 *          the phrase to add
 * @param lookup
 *          the category information to attach to the phrase
 * @return always true
 */
public boolean add(String singleItem, Lookup lookup) {
  // ALL-UPPER-CASE SUPPORT
  String upper = singleItem.toUpperCase();
  if(!upper.equals(singleItem)) {
    // only recurse for a different string: avoids endless recursion
    this.add(upper, lookup);
  }
  // if the item is not with first capital - make it with first capital and
  // add it
  if(singleItem.length() > 1) {
    String firstLetter = singleItem.substring(0, 1);
    String firstUpper = firstLetter.toUpperCase();
    if(!firstLetter.equals(firstUpper)) {
      this.add(firstUpper + singleItem.substring(1), lookup);
    }
  }
  // stems the word (or words in phrase) and adds it as a new gaz entry if it
  // differs from the currently being added phrase/word
  String stem = trunxSuffixVowelsFromPhrase(singleItem);
  if(!stem.equals(singleItem)) {
    this.add(stem, lookup);
  }
  // add the lookup to the category list
  categoryList.add(lookup);
  String line = singleItem.trim();
  int length = line.length();
  int mapIndex = -1;
  String word = null;
  Map<String, List<Lookup>> currentMap = null;
  for(int lineIndex = 0; lineIndex < length; lineIndex++) {
    boolean atEnd = lineIndex + 1 == length;
    if(atEnd || Character.isWhitespace(line.charAt(lineIndex))) {
      // if not whitespace but end of line then the index should equal the
      // length
      if(atEnd) lineIndex = length;
      // the prefix of the phrase up to the current word boundary
      word = line.substring(0, lineIndex).trim();
      // if the map doesn't exist : create it
      ++mapIndex;
      if(mapsListSize <= mapIndex) {
        mapsList.add(new HashMap());
        mapsListSize++;
      } // if the map doesn't exist
      currentMap = mapsList.get(mapIndex);
      // a null value marks the prefix as known without attaching a category
      if(!currentMap.containsKey(word)) {
        currentMap.put(word, null);
      } // add the word
    } // if whitespace
  } // for line iterate
  if(currentMap == null) {
    // blank item: there is no word to attach the lookup to
    return true;
  }
  // !!! put the category in the last map, under the complete phrase
  List<Lookup> oldKey = currentMap.get(word);
  if(null == oldKey) {
    ArrayList<Lookup> key = new ArrayList<Lookup>(1);
    key.add(lookup);
    currentMap.put(word, key);
  } else {
    // compare against EVERY stored lookup and stop at the first match; the
    // outcome must not depend on the position of the match in the list
    boolean duplicity = false;
    for(Lookup existing : oldKey) {
      if(existing.equals(lookup)) {
        duplicity = true;
        break;
      }
    } // for existing
    if(!duplicity) {
      ArrayList<Lookup> mergedKey = new ArrayList<Lookup>(oldKey.size() + 1);
      mergedKey.addAll(oldKey);
      mergedKey.add(lookup);
      currentMap.put(word, mergedKey);
    }
  } // else
  return true;
} // add
/**
 * Tells whether a character is a dot or belongs to one of the Unicode
 * categories dash punctuation, initial quote punctuation or final quote
 * punctuation.
 *
 * @param ch
 *          the character to classify
 * @return true for '.', dashes and initial/final quotes, false otherwise
 */
private boolean isDashOrQuotePunctuation(char ch) {
  if(ch == '.') {
    return true;
  }
  switch(Character.getType(ch)) {
    case Character.DASH_PUNCTUATION:
    case Character.INITIAL_QUOTE_PUNCTUATION:
    case Character.FINAL_QUOTE_PUNCTUATION:
      return true;
    default:
      return false;
  }
} // isDashOrQuotePunctuation(ch)
/**
 * Tells whether a character is punctuation that the tokenizer treats like
 * whitespace: Unicode categories other, connector, start and end
 * punctuation, with the dot excluded.
 *
 * @param ch
 *          the character to classify
 * @return true for such punctuation, false otherwise (always false for '.')
 */
private boolean isWhiteSpacePunctuation(char ch) {
  if(ch == '.') {
    return false;
  }
  switch(Character.getType(ch)) {
    case Character.OTHER_PUNCTUATION:
    case Character.CONNECTOR_PUNCTUATION:
    case Character.START_PUNCTUATION:
    case Character.END_PUNCTUATION:
      return true;
    default:
      return false;
  }
} // isWhiteSpacePunctuation(ch)
/**
 * Applies {@link #trunxSuffixVowelsFromWord(String)} to every
 * whitespace-delimited segment of a phrase and joins the results with
 * single spaces; the joined string is trimmed before it is returned.
 *
 * @param phrase
 *          the phrase to process
 * @return the phrase with each of its words truncated
 */
public String trunxSuffixVowelsFromPhrase(String phrase) {
  final int total = phrase.length();
  final StringBuilder out = new StringBuilder();
  int segmentStart = 0;
  int pos = 0;
  while(pos < total) {
    boolean lastChar = pos == total - 1;
    if(lastChar || Character.isWhitespace(phrase.charAt(pos))) {
      // a segment ends before a whitespace char, or after the final char
      int segmentEnd = lastChar ? total : pos;
      String token = phrase.substring(segmentStart, segmentEnd).trim();
      out.append(trunxSuffixVowelsFromWord(token));
      out.append(' ');
      segmentStart = segmentEnd;
      if(lastChar) {
        break;
      }
    } // if segment boundary
    pos++;
  } // while chars
  return out.toString().trim();
} // trunxSuffixVowelsFromPhrase()
/**
 * Strips trailing characters contained in SET_OF_VOWELS from a word. It
 * stops as soon as the last character is not in that set, the remaining
 * length is no longer greater than minWordLength, or maxTruncatedVowels
 * characters have been removed.
 *
 * @param word
 *          the word to truncate
 * @return the truncated word, or the word itself if nothing was removed
 */
public String trunxSuffixVowelsFromWord(String word) {
  int removed = 0;
  int remaining = word.length();
  while(remaining > minWordLength && removed < maxTruncatedVowels) {
    String tail = word.substring(remaining - 1);
    if(!SET_OF_VOWELS.contains(tail)) {
      // not a vowel: nothing more to strip
      break;
    }
    word = word.substring(0, remaining - 1);
    removed++;
    remaining--;
  }
  return word;
} // trunxSuffixVowelsFromWord()
/**
* Stores the given flag in the caseSensitive field (declared outside this
* class). The parameter is hidden and its CREOLE comment states that it is
* not supported by this gazetteer; no code in this class reads the field.
*
* @param newCaseSensitive
*          the value to store
*/
@HiddenCreoleParameter
@CreoleParameter(comment="not supported by this gazetteer", defaultValue="true")
public void setCaseSensitive(Boolean newCaseSensitive) {
caseSensitive = newCaseSensitive;
}
/**
* Stores the given flag in the longestMatchOnly field (declared outside this
* class). The parameter is a hidden run-time parameter whose CREOLE comment
* states that it is not supported by this gazetteer; no code in this class
* reads the field.
*
* @param longestMatchOnly
*          the value to store
*/
@HiddenCreoleParameter
@RunTime
@CreoleParameter(comment="not supported by this gazetteer", defaultValue="true")
public void setLongestMatchOnly(Boolean longestMatchOnly) {
this.longestMatchOnly = longestMatchOnly;
}
/**
* Stores the given flag in the wholeWordsOnly field (declared outside this
* class). The parameter is a hidden run-time parameter whose CREOLE comment
* states that it is not supported by this gazetteer; no code in this class
* reads the field.
*
* @param wholeWordsOnly
*          the value to store
*/
@HiddenCreoleParameter
@RunTime
@CreoleParameter(comment="not supported by this gazetteer", defaultValue="true")
public void setWholeWordsOnly(Boolean wholeWordsOnly) {
this.wholeWordsOnly = wholeWordsOnly;
}
/**
* For internal use by the duplication mechanism: replaces the phrase maps of
* this instance. When the maps are set before init() runs, init() treats the
* instance as a duplicate and loads nothing. The list is stored as given,
* not copied.
*
* @param mapsList
*          the phrase maps to use
*/
@Sharable
public void setMapsList(List<Map> mapsList) {
this.mapsList = mapsList;
}
/**
* For internal use by the duplication mechanism.
*
* @return the phrase maps of this instance (the list itself, not a copy);
*         null if they have not been set or built
*/
public List<Map> getMapsList() {
return mapsList;
}
/**
* For internal use by the duplication mechanism: replaces the list that
* add() appends every Lookup to. The list is stored as given, not copied.
*
* @param categoryList
*          the category list to use
*/
@Sharable
public void setCategoryList(ArrayList<Lookup> categoryList) {
this.categoryList = categoryList;
}
/**
* For internal use by the duplication mechanism.
*
* @return the category list of this instance (the list itself, not a copy);
*         null if it has not been set or built
*/
public ArrayList<Lookup> getCategoryList() {
return categoryList;
}
} // class Russ Gazetteer