package gate.lingpipe; import gate.ProcessingResource; import gate.Resource; import gate.creole.*; import gate.util.*; import gate.*; import java.util.*; import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory; import com.aliasi.tokenizer.Tokenizer; /** * LingPipe Tokenizer PR. * * @author Ekaterina Stambolieva * */ public class TokenizerPR extends AbstractLanguageAnalyser implements ProcessingResource { /** * Name of the output annotation set */ private String outputASName; /** * tokeniser factory used for tokenizing text */ private com.aliasi.tokenizer.TokenizerFactory tf; /** Initialise this resource, and return it. */ public Resource init() throws ResourceInstantiationException { // construct tokenizer tf = IndoEuropeanTokenizerFactory.INSTANCE; return this; } public void reInit() throws ResourceInstantiationException { init(); } /** * execute method. Makes LingPipe API calls to tokenize the document. * It uses the document's string and passes it over to the LingPipe to * tokenize. It also generates space tokens as well. */ public void execute() throws ExecutionException { if(document == null) { throw new ExecutionException("There is no loaded document"); } super.fireProgressChanged(0); long startOffset = 0, endOffset = 0; AnnotationSet as = null; if(outputASName == null || outputASName.trim().length() == 0) as = document.getAnnotations(); else as = document.getAnnotations(outputASName); String docContent = document.getContent().toString(); List<String> tokenList = new ArrayList<String>(); List<String> whiteList = new ArrayList<String>(); Tokenizer tokenizer = tf.tokenizer(docContent.toCharArray(), 0, docContent .length()); tokenizer.tokenize(tokenList, whiteList); for(int i = 0; i < whiteList.size(); i++) { try { startOffset = endOffset; endOffset = startOffset + whiteList.get(i).length(); if((endOffset - startOffset) != 0) { FeatureMap fmSpaces = Factory.newFeatureMap(); fmSpaces.put("length", "" + (endOffset - startOffset)); as.add(new Long(startOffset), new Long(endOffset), "SpaceToken", fmSpaces); } if(i < tokenList.size()) { startOffset = endOffset; endOffset = startOffset + tokenList.get(i).length(); FeatureMap fmTokens = Factory.newFeatureMap(); fmTokens.put("length", "" + (endOffset - startOffset)); as.add(new Long(startOffset), new Long(endOffset), "Token", fmTokens); } } catch(InvalidOffsetException e) { throw new ExecutionException(e); } } } /** * gets name of the output annotation set * @return */ public String getOutputASName() { return outputASName; } /** * sets name of the output annotaiton set * @param outputAS */ public void setOutputASName(String outputAS) { this.outputASName = outputAS; } }