package com.ontotext.kim.gate;
import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.util.InvalidOffsetException;
import gate.util.LuckyException;
import java.io.File;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import com.ontotext.kim.KIMConstants;
import com.ontotext.kim.client.model.FeatureConstants;
import com.ontotext.kim.gate.KimLookupParser.AliasLookupDictionary;
import com.ontotext.kim.gate.KimLookupParser.EntityOccuranceHandler;
import com.ontotext.kim.model.AliasCacheImpl;
/**
* The Large KB Gazetteer implements gazetteer lookup over large knowledge bases,
* usually derived from RDF data.
*
* @author mnozchev
*/
public class KimGazetteer extends AbstractLanguageAnalyser {
private static final long serialVersionUID = 3380L;
private static Logger log = Logger.getLogger(KimGazetteer.class);
private File dictionaryPath = new File(KIMConstants.KIM_CACHE_PATH);
private boolean forceCaseSensitive = false;
private class Annotater implements EntityOccuranceHandler {
public int annotatedEntities = 0;
public void processEntityOccurance(int start, int end, String instURI, String classURI) {
FeatureMap fm = Factory.newFeatureMap();
if (instURI != null) {
fm.put(FeatureConstants.INSTANCE, instURI);
}
fm.put(FeatureConstants.CLASS, classURI);
try {
annotationSet.add(Long.valueOf(start), Long.valueOf(end),
KIMConstants.LOOKUP, fm);
}
catch (InvalidOffsetException ioe) {
throw new LuckyException(ioe.toString());
}
++annotatedEntities;
if (!kimParser.isInterrupted() && annotationLimit > 0
&& annotatedEntities > annotationLimit) {
log.warn("More than " + annotationLimit +
" lookups found. Interrupting ...");
kimParser.setInterrupted(true);
}
}
}
private int annotationLimit;
/** the annotation set that results from the execution */
protected AnnotationSet annotationSet;
private transient KimLookupParser kimParser = null;
private String annotationSetName;
/** Does the actual loading and parsing of the lists. This method must be
* called before the gazetteer can be used.
* @throws ResourceInstantiationException
* @return returns this resource
*/
public gate.Resource init() throws ResourceInstantiationException {
verifyLoggers("com.ontotext.kim");
verifyLoggers("org.openrdf.sesame");
verifyLoggers("httpclient");
// This doesn't match the specification exactly. Will be improved.
String caseSens = forceCaseSensitive ? KIMConstants.CASE_SENSITIV : KIMConstants.CASE_INSENSITIV;
return init(AliasCacheImpl.getInstance(dictionaryPath, caseSens, getName()));
} // Resource init()
protected gate.Resource init(AliasLookupDictionary outerCache) {
this.kimParser = new KimLookupParser(outerCache);
return this;
} // Resource init(EntitiesCache outerCache)
@Override
public void cleanup() {
super.cleanup();
AliasCacheImpl.releaseCache(dictionaryPath, getName());
}
@Override
public void reInit() throws ResourceInstantiationException {
cleanup();
init();
}
/**
* This method runs the gazetteer. It parses the document and looks-up
* the parsed phrases from the maps, in which the phrases vs. annotations
* are set, in order to generate an annotation set.
* It assumes that all the needed parameters
* are set. If they are not, an exception will be fired.
*/
public void execute() throws ExecutionException {
//check initialization
if (kimParser == null)
throw new ExecutionException("init() must be called after the resource is created or deserialized");
this.kimParser.setInterrupted(false);
//check the input
if (document == null) {
throw new ExecutionException("Document is null!");
} // if document is null
if (annotationSetName == null ||
annotationSetName.length() == 0) {
annotationSet = document.getAnnotations();
}
else {
annotationSet = document.getAnnotations(annotationSetName);
}
String content = document.getContent().toString();
Annotater annot = new Annotater();
this.kimParser.findLookups(content, annot);
log.debug(annot.annotatedEntities + " lookup(s) annotated.");
fireProcessFinished();
if (isInterrupted())
fireStatusChanged("Large KB Gazetteer processing interrupted!");
else
fireStatusChanged("Large KB Gazetteer processing finished!");
} // execute ()
@Override
public synchronized void interrupt() {
super.interrupt();
if (this.kimParser != null)
this.kimParser.setInterrupted(true);
}
public Integer getAnnotationLimit() {
return annotationLimit;
}
public void setAnnotationLimit(Integer annotationLimit) {
this.annotationLimit = annotationLimit != null ? annotationLimit : 0;
}
public URL getDictionaryPath() {
try {
return dictionaryPath.toURI().toURL();
} catch (MalformedURLException e) {
throw new RuntimeException(e);
}
}
public void setDictionaryPath(URL dictironaryPath) {
this.dictionaryPath = new File(dictironaryPath.getPath());
}
/**
* Sets the AnnotationSet that will be used at the next run for the newly
* produced annotations.
*/
public void setAnnotationSetName(String newAnnotationSetName) {
annotationSetName = newAnnotationSetName;
}
/**
* Gets the AnnotationSet that will be used at the next run for the newly
* produced annotations.
*/
public String getAnnotationSetName() {
return annotationSetName;
}
public void setForceCaseSensitive(Boolean forceCaseSensitive) {
if (forceCaseSensitive != null)
this.forceCaseSensitive = forceCaseSensitive;
}
public Boolean getForceCaseSensitive() {
return forceCaseSensitive;
}
private void verifyLoggers(String loggerName) {
Logger logger = Logger.getLogger(loggerName);
if (logger.getLevel() == null && logger.getEffectiveLevel().equals(Level.DEBUG)) {
logger.setLevel(Level.INFO);
logger.info(
"Logger " + loggerName + " level set to INFO, overriding the default effective level of DEBUG. " +
"Set the level of " + loggerName + " explictly if required.");
}
}
} // class KimGazetteer