package gate.lingpipe;

import gate.AnnotationSet;
import gate.FeatureMap;
import gate.ProcessingResource;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.util.InvalidOffsetException;

import java.util.Iterator;
import java.util.Set;

import com.aliasi.chunk.Chunk;
import com.aliasi.chunk.Chunking;
import com.aliasi.sentences.MedlineSentenceModel;
import com.aliasi.sentences.SentenceChunker;
import com.aliasi.sentences.SentenceModel;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.TokenizerFactory;

/**
 * The sentence splitter takes a document and finds the sentences within it.
 * Each sentence reported by the LingPipe {@link SentenceChunker} is recorded
 * as a "Sentence" annotation in the configured output annotation set.
 *
 * @author Ekaterina Mihaylova
 */
public class SentenceSplitterPR extends AbstractLanguageAnalyser implements
    ProcessingResource {

  /**
   * Tokeniser factory used by the sentence chunker.
   */
  static final TokenizerFactory TOKENIZER_FACTORY =
      IndoEuropeanTokenizerFactory.INSTANCE;

  /**
   * Sentence model used by the sentence chunker.
   */
  static final SentenceModel SENTENCE_MODEL = new MedlineSentenceModel();

  /**
   * Sentence chunker built from {@link #TOKENIZER_FACTORY} and
   * {@link #SENTENCE_MODEL}; shared by all instances of this resource.
   */
  static final SentenceChunker SENTENCE_CHUNKER = new SentenceChunker(
      TOKENIZER_FACTORY, SENTENCE_MODEL);

  /**
   * Name of the output annotation set. A null or blank value selects the
   * document's default annotation set.
   */
  private String outputASName;

  /**
   * Gets the name of the output annotation set where the Sentence
   * annotations are stored.
   *
   * @return the output annotation set name; may be null
   */
  public String getOutputASName() {
    return outputASName;
  }

  /**
   * Sets the name of the output annotation set where the Sentence
   * annotations are stored.
   *
   * @param outputAS the annotation set name; null or blank selects the
   *          document's default annotation set
   */
  public void setOutputASName(String outputAS) {
    this.outputASName = outputAS;
  }

  /**
   * Initialises this resource and returns it. No work is done here beyond
   * delegating to the superclass.
   *
   * @return the value returned by the superclass initialisation
   * @throws ResourceInstantiationException if the superclass initialisation
   *           fails
   */
  public Resource init() throws ResourceInstantiationException {
    return super.init();
  }

  /**
   * Reinitialises the processing resource by calling {@link #init()} again.
   * After calling this method the resource should be in the state it is in
   * after calling init.
   *
   * @throws ResourceInstantiationException if {@link #init()} fails
   */
  public void reInit() throws ResourceInstantiationException {
    init();
  }

  /**
   * Runs the sentence splitter over the current document and adds one
   * "Sentence" annotation per detected sentence to the output annotation
   * set.
   *
   * @throws ExecutionException if no document has been set, or if a sentence
   *           reported by the chunker has offsets the annotation set rejects
   */
  public void execute() throws ExecutionException {
    if (document == null) {
      throw new ExecutionException("The document can't be null");
    }

    AnnotationSet outputSet;
    if (outputASName == null || outputASName.trim().length() == 0) {
      outputSet = document.getAnnotations();
    } else {
      outputSet = document.getAnnotations(outputASName);
    }

    fireProgressChanged(0);

    String text = document.getContent().toString();
    Chunking chunking = SENTENCE_CHUNKER.chunk(text.toCharArray(), 0,
        text.length());
    Set sentences = chunking.chunkSet();
    int total = sentences.size();

    if (total < 1) {
      System.out.println("No sentence chunks found.");
      // Progress was already reported above, so listeners must still be told
      // that processing has ended.
      fireProcessFinished();
      return;
    }

    int processed = 0;
    for (Iterator it = sentences.iterator(); it.hasNext();) {
      Chunk sentence = (Chunk) it.next();
      try {
        // A fresh feature map per annotation: a single shared instance would
        // make a feature set on one Sentence appear on all of them.
        FeatureMap features = gate.Factory.newFeatureMap();
        outputSet.add(Long.valueOf(sentence.start()),
            Long.valueOf(sentence.end()), "Sentence", features);
      } catch (InvalidOffsetException e) {
        throw new ExecutionException(e);
      }
      processed++;
      fireProgressChanged(100 * processed / total);
    }

    fireProcessFinished();
  }
}