/* * LanguageIdentifier * * Copyright (c) 1995-2011, The University of Sheffield. * * This file is part of GATE (see http://gate.ac.uk/), and is free software, * licenced under the GNU Library General Public License, Version 3, June 2007 * (in the distribution as file licence.html, and also available at * http://gate.ac.uk/gate/licence.html). * * $Id: LanguageIdentifier.java 17698 2014-03-19 09:09:28Z markagreenwood $ */ package org.knallgrau.utils.textcat; import gate.Annotation; import gate.AnnotationSet; import gate.Utils; import gate.creole.ExecutionException; import gate.creole.ResourceInstantiationException; import gate.creole.metadata.CreoleParameter; import gate.creole.metadata.CreoleResource; import gate.creole.metadata.Optional; import gate.creole.metadata.RunTime; import java.net.URL; import at.knallgrau.textcat.TextCategorizer; @CreoleResource(name = "TextCat Language Identification", comment = "Recognizes the document language using TextCat", icon = "paw-print.png", helpURL = "http://gate.ac.uk/userguide/sec:misc-creole:language-identification") public class LanguageIdentifier extends gate.creole.AbstractLanguageAnalyser { private static final long serialVersionUID = 5831213212185693826L; private TextCategorizer guesser; private String languageFeatureName; private String annotationType; private String annotationSetName; private URL configURL; public LanguageIdentifier init() throws ResourceInstantiationException { try { guesser = new TextCategorizer(configURL); } catch(Exception e) { throw new ResourceInstantiationException( "unable to load TextCat config file", e); } return this; } /** * Based on the document content, recognizes the language and adds a document * feature. */ public void execute() throws ExecutionException { if(document == null || document.getFeatures() == null) return; if(annotationType == null || annotationType.trim().equals("")) { /* * Default situation: classify the whole document and save the result as a * document feature. */ String text = document.getContent().toString(); String category = guesser.categorize(text); document.getFeatures().put(languageFeatureName, category); } else { /* * New option: classify the text underlying each annotation (specified by * AS and type) and save the result as an annotation feature. */ AnnotationSet annotations = document.getAnnotations(annotationSetName).get(annotationType); for(Annotation annotation : annotations) { String text = Utils.stringFor(document, annotation); String category = guesser.categorize(text); annotation.getFeatures().put(languageFeatureName, category); } } } public void reInit() throws ResourceInstantiationException { init(); } @RunTime @Optional @CreoleParameter(comment = "name of document or annotation features for the language identified", defaultValue = "lang") public void setLanguageFeatureName(String languageFeatureName) { this.languageFeatureName = languageFeatureName; } public String getLanguageFeatureName() { return languageFeatureName; } @RunTime @Optional @CreoleParameter(comment = "type of annotations to classify; leave blank for whole-document classification") public void setAnnotationType(String atype) { this.annotationType = atype; } public String getAnnotationType() { return this.annotationType; } @RunTime @Optional @CreoleParameter(comment = " annotation set used for input/output (ignored for whole-document classification)") public void setAnnotationSetName(String inputASName) { this.annotationSetName = inputASName; } public String getAnnotationSetName() { return annotationSetName; } @CreoleParameter(defaultValue = "resources/default-names.conf") public void setConfigURL(URL configURL) { this.configURL = configURL; } public URL getConfigURL() { return configURL; } }