package gate.lingpipe;
import gate.ProcessingResource;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.util.GateRuntimeException;
import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;
import com.aliasi.classify.Classification;
import com.aliasi.classify.Classifier;
import com.aliasi.util.AbstractExternalizable;
/**
* A processing resource that identifies the language of a document using
* the LingPipe language identifier classifier. Appropriate models can be
* downloaded from the LingPipe website; see
* http://alias-i.com/lingpipe/web/models.html
*
* The default model supplied with GATE distribution is based on the
* Leipzig corpora collection that consists of the data in the following
* languages: Catalan (cat), Danish (dk), English (en), Estonian (ee),
* Finnish (fi), French (fr), German (de), Italian (it), Japanese (jp),
* Korean (kr), Norwegian (no), Sorbian (sorb), Swedish (se), and
* Turkish (tr).
*
* Should you want to train models on other languages or on a different
* dataset, please refer to:
* http://alias-i.com/lingpipe/demos/tutorial/langid/read-me.html.
*
* @author niraj
*
*/
public class LanguageIdentifierPR extends AbstractLanguageAnalyser implements
ProcessingResource {
/** File which cotains model for NE */
protected URL modelFileUrl;
/** Model file extracted from the URL */
protected File modelFile;
/** classifier object */
protected Classifier<CharSequence, Classification> classifier;
/** document feature name */
protected String languageIdFeatureName;
/**
* Initializes this resource
*
* @return Resource
* @throws ResourceInstantiationException
*/
public Resource init() throws ResourceInstantiationException {
if(modelFileUrl == null)
throw new ResourceInstantiationException("No model file provided!");
try {
modelFile = new File(modelFileUrl.toURI());
}
catch(URISyntaxException e) {
throw new ResourceInstantiationException(e);
}
if(modelFile == null || !modelFile.exists()) {
throw new ResourceInstantiationException("modelFile:"
+ modelFileUrl.toString() + " does not exists");
}
try {
classifier = (Classifier<CharSequence, Classification>)AbstractExternalizable
.readObject(modelFile);
}
catch(IOException e) {
throw new ResourceInstantiationException(e);
}
catch(ClassNotFoundException e) {
throw new ResourceInstantiationException(e);
}
return this;
}
/**
* Method is executed after the init() method has finished its
* execution. <BR>
*
* @throws ExecutionException
*/
public void execute() throws ExecutionException {
// lets start the progress and initialize the progress counter
fireProgressChanged(0);
// If no document provided to process throw an exception
if(document == null) {
fireProcessFinished();
throw new GateRuntimeException("No document to process!");
}
// langugage ID feature Name
if(languageIdFeatureName == null
|| languageIdFeatureName.trim().length() == 0)
languageIdFeatureName = "lang";
String docText = document.getContent().toString();
Classification classification = classifier.classify(docText);
document.getFeatures().put(languageIdFeatureName,
classification.bestCategory());
// process finished, acknowledge user about this.
fireProcessFinished();
}
/**
* gets the model to be used for identifying language of the document
*
* @return
*/
public URL getModelFileUrl() {
return modelFileUrl;
}
/**
* sets the model to be used for identifying language of the document
*
* @param modelFileUrl
*/
public void setModelFileUrl(URL modelFileUrl) {
this.modelFileUrl = modelFileUrl;
}
/**
* gets name of the feature which is used for storing the identified
* language
*
* @return
*/
public String getLanguageIdFeatureName() {
return languageIdFeatureName;
}
/**
* sets name of the feature that should be used for storing the
* identified language of the document
*
* @param languageIdFeatureName
*/
public void setLanguageIdFeatureName(String languageIdFeatureName) {
this.languageIdFeatureName = languageIdFeatureName;
}
}