SentenceSplitter.java
001 /*
002  *  Copyright (c) 1995-2011, The University of Sheffield. See the file
003  *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
004  *
005  *  This file is part of GATE (see http://gate.ac.uk/), and is free
006  *  software, licenced under the GNU Library General Public License,
007  *  Version 2, June 1991 (in the distribution as file licence.html,
008  *  and also available at http://gate.ac.uk/gate/licence.html).
009  *
010  *  Valentin Tablan, 01 Feb 2000
011  *
012  *  $Id: SentenceSplitter.java 18633 2015-04-13 12:23:47Z markagreenwood $
013  */
014 
015 package gate.creole.splitter;
016 
017 import gate.AnnotationSet;
018 import gate.Factory;
019 import gate.FeatureMap;
020 import gate.Gate;
021 import gate.Resource;
022 import gate.creole.AbstractLanguageAnalyser;
023 import gate.creole.ExecutionException;
024 import gate.creole.ExecutionInterruptedException;
025 import gate.creole.ResourceInstantiationException;
026 import gate.creole.Transducer;
027 import gate.creole.gazetteer.DefaultGazetteer;
028 import gate.creole.metadata.CreoleParameter;
029 import gate.creole.metadata.CreoleResource;
030 import gate.creole.metadata.Optional;
031 import gate.creole.metadata.RunTime;
032 import gate.event.ProgressListener;
033 import gate.event.StatusListener;
034 import gate.util.Benchmark;
035 import gate.util.Benchmarkable;
036 import gate.util.GateRuntimeException;
037 import gate.util.InvalidOffsetException;
038 
039 /**
040  * A sentence splitter. This is module contains a tokeniser, a
041  * gazetteer and a Jape grammar. This class is used so we can have a different
042  * entry in the creole.xml file describing the default resources and to add
043  * some minor processing after running the components in order to extract the
044  * results in a usable form.
045  */
046 @CreoleResource(name="ANNIE Sentence Splitter", comment="ANNIE sentence splitter.", helpURL="http://gate.ac.uk/userguide/sec:annie:splitter", icon="sentence-splitter")
047 public class SentenceSplitter extends AbstractLanguageAnalyser implements Benchmarkable{
048 
049   private static final long serialVersionUID = -5335682060379173111L;
050 
051   public static final String
052     SPLIT_DOCUMENT_PARAMETER_NAME = "document";
053 
054   public static final String
055     SPLIT_INPUT_AS_PARAMETER_NAME = "inputASName";
056 
057   public static final String
058     SPLIT_OUTPUT_AS_PARAMETER_NAME = "outputASName";
059 
060   public static final String
061     SPLIT_ENCODING_PARAMETER_NAME = "encoding";
062 
063   public static final String
064     SPLIT_GAZ_URL_PARAMETER_NAME = "gazetteerListsURL";
065 
066   public static final String
067     SPLIT_TRANSD_URL_PARAMETER_NAME = "transducerURL";
068   
069   
070   private String benchmarkId;
071 
072   @Override
073   public Resource init()throws ResourceInstantiationException{
074     //create all the componets
075     FeatureMap params;
076     FeatureMap features;
077 
078     params = Factory.newFeatureMap();
079     if(gazetteerListsURL != null)
080       params.put(DefaultGazetteer.DEF_GAZ_LISTS_URL_PARAMETER_NAME,
081               gazetteerListsURL);
082     params.put(DefaultGazetteer.DEF_GAZ_ENCODING_PARAMETER_NAME, encoding);
083 
084     if (gazetteer == null) {
085       //gazetteer
086       fireStatusChanged("Creating the gazetteer");
087       features = Factory.newFeatureMap();
088       Gate.setHiddenAttribute(features, true);
089 
090       gazetteer = (DefaultGazetteer)Factory.createResource(
091               "gate.creole.gazetteer.DefaultGazetteer",
092               params, features);
093       gazetteer.setName("Gazetteer " + System.currentTimeMillis());
094     }
095     else {
096       gazetteer.setParameterValues(params);
097       gazetteer.reInit();
098     }
099     
100     fireProgressChanged(10);
101 
102     params = Factory.newFeatureMap();
103     if(transducerURL != null)
104       params.put(Transducer.TRANSD_GRAMMAR_URL_PARAMETER_NAME, transducerURL);
105     params.put(Transducer.TRANSD_ENCODING_PARAMETER_NAME, encoding);
106 
107     if (transducer == null) {
108       //transducer
109       fireStatusChanged("Creating the JAPE transducer");
110       features = Factory.newFeatureMap();
111       Gate.setHiddenAttribute(features, true);
112 
113       transducer = (Transducer)Factory.createResource(
114               "gate.creole.Transducer",
115               params, features);
116       transducer.setName("Transducer " + System.currentTimeMillis());
117     }
118     else {
119       transducer.setParameterValues(params);
120       transducer.reInit();
121     }
122     
123     fireProgressChanged(100);
124     fireProcessFinished();
125 
126     return this;
127   }
128   
129   @Override
130   public void cleanup() {
131     Factory.deleteResource(gazetteer);
132     Factory.deleteResource(transducer);
133   }
134 
135   @Override
136   public void execute() throws ExecutionException{
137     interrupted = false;
138     //set the runtime parameters
139     FeatureMap params;
140     if(inputASName != null && inputASName.equals("")) inputASName = null;
141     if(outputASName != null && outputASName.equals("")) outputASName = null;
142     
143     ProgressListener pListener = null;
144     StatusListener sListener = null;
145 
146     fireProgressChanged(5);
147     pListener = new IntervalProgressListener(510);
148     sListener = new StatusListener() {
149       @Override
150       public void statusChanged(String text) {
151         fireStatusChanged(text);
152       }
153     };
154     try {
155       // run the gazetteer
156       params = Factory.newFeatureMap();
157       params.put(DefaultGazetteer.DEF_GAZ_DOCUMENT_PARAMETER_NAME, document);
158       params.put(DefaultGazetteer.DEF_GAZ_ANNOT_SET_PARAMETER_NAME, inputASName);
159       gazetteer.setParameterValues(params);
160 
161       gazetteer.addProgressListener(pListener);
162       gazetteer.addStatusListener(sListener);
163       gazetteer.execute();
164 
165     catch(Exception e) {
166       throw new ExecutionException(e);
167     finally {
168       gazetteer.setDocument(null);
169       gazetteer.removeProgressListener(pListener);
170       gazetteer.removeStatusListener(sListener);
171     }
172 
173     if(isInterrupted())
174       throw new ExecutionInterruptedException("The execution of the \""
175               + getName()
176               "\" sentence splitter has been abruptly interrupted!");
177 
178     pListener = new IntervalProgressListener(1190);
179 
180     try {
181       params = Factory.newFeatureMap();
182       params.put(Transducer.TRANSD_DOCUMENT_PARAMETER_NAME, document);
183       params.put(Transducer.TRANSD_INPUT_AS_PARAMETER_NAME, inputASName);
184       params.put(Transducer.TRANSD_OUTPUT_AS_PARAMETER_NAME, inputASName);
185       transducer.setParameterValues(params);
186 
187       transducer.addProgressListener(pListener);
188       transducer.addStatusListener(sListener);
189       Benchmark.executeWithBenchmarking(transducer,
190               Benchmark.createBenchmarkId("SentenceSplitterTransducer",
191                       getBenchmarkId()), this, null);
192 
193     catch(Exception e) {
194       throw new ExecutionException(e);
195     finally {
196       transducer.setDocument(null);
197       transducer.removeProgressListener(pListener);
198       transducer.removeStatusListener(sListener);
199     }
200 
201     //get pointers to the annotation sets
202     AnnotationSet inputAS = (inputASName == null?
203                             document.getAnnotations() :
204                             document.getAnnotations(inputASName);
205 
206     AnnotationSet outputAS = (outputASName == null?
207                              document.getAnnotations() :
208                              document.getAnnotations(outputASName);
209 
210     //copy the results to the output set if they are different
211     if(inputAS != outputAS){
212       outputAS.addAll(inputAS.get(SENTENCE_ANNOTATION_TYPE));
213     }
214 
215     //create one big sentence if none were found
216     AnnotationSet sentences = outputAS.get(SENTENCE_ANNOTATION_TYPE);
217     if(sentences == null || sentences.isEmpty()){
218       //create an annotation covering the entire content
219       try{
220         outputAS.add(new Long(0), document.getContent().size()
221                 SENTENCE_ANNOTATION_TYPE, Factory.newFeatureMap());
222       }catch(InvalidOffsetException ioe){
223         throw new GateRuntimeException(ioe);
224       }
225     }else{
226       //add a sentence covering all the tokens after the last sentence
227       Long endSentences = sentences.lastNode().getOffset();
228       AnnotationSet remainingTokens = inputAS.get(TOKEN_ANNOTATION_TYPE, endSentences,
229                                                   inputAS.lastNode().getOffset());
230       if(remainingTokens != null && !remainingTokens.isEmpty()){
231         try{
232           outputAS.add(remainingTokens.firstNode().getOffset(),
233                        remainingTokens.lastNode().getOffset(),
234                        SENTENCE_ANNOTATION_TYPE,
235                        Factory.newFeatureMap());
236         }catch(InvalidOffsetException ioe){
237           throw new ExecutionException(ioe);
238         }
239       }
240     }
241     fireProcessFinished();
242   }//execute()
243 
244   /**
245    * Notifies all the PRs in this controller that they should stop their
246    * execution as soon as possible.
247    */
248   @Override
249   public synchronized void interrupt(){
250     interrupted = true;
251     gazetteer.interrupt();
252     transducer.interrupt();
253   }
254 
255   @Optional
256   @CreoleParameter(defaultValue="resources/sentenceSplitter/grammar/main-single-nl.jape", comment="The URL to the custom Jape grammar file", suffixes="jape")
257   public void setTransducerURL(java.net.URL newTransducerURL) {
258     transducerURL = newTransducerURL;
259   }
260   public java.net.URL getTransducerURL() {
261     return transducerURL;
262   }
263   DefaultGazetteer gazetteer;
264   Transducer transducer;
265   private java.net.URL transducerURL;
266   private String encoding;
267   private java.net.URL gazetteerListsURL;
268 
269 
270   @CreoleParameter(comment="The encoding used for reading the definition files", defaultValue="UTF-8")
271   public void setEncoding(String newEncoding) {
272     encoding = newEncoding;
273   }
274   public String getEncoding() {
275     return encoding;
276   }
277   
278   @Optional
279   @CreoleParameter(defaultValue="resources/sentenceSplitter/gazetteer/lists.def", comment="The URL to the custom list lookup definition file", suffixes="def")
280   public void setGazetteerListsURL(java.net.URL newGazetteerListsURL) {
281     gazetteerListsURL = newGazetteerListsURL;
282   }
283   public java.net.URL getGazetteerListsURL() {
284     return gazetteerListsURL;
285   }
286   
287   @RunTime
288   @Optional
289   @CreoleParameter(comment="The annotation set to be used as input that must contain 'Token' annotations")
290   public void setInputASName(String newInputASName) {
291     inputASName = newInputASName;
292   }
293 
294   public String getInputASName() {
295     return inputASName;
296   }
297   
298   @RunTime
299   @Optional
300   @CreoleParameter(comment="The annotation set to be used as output for 'Sentence' and 'Split' annotations")
301   public void setOutputASName(String newOutputASName) {
302     outputASName = newOutputASName;
303   }
304   public String getOutputASName() {
305     return outputASName;
306   }
307   
308   /* (non-Javadoc)
309    * @see gate.util.Benchmarkable#getBenchmarkId()
310    */
311   @Override
312   public String getBenchmarkId() {
313     if(benchmarkId == null) {
314       return getName();
315     }
316     else {
317       return benchmarkId;
318     }
319   }
320 
321   /* (non-Javadoc)
322    * @see gate.util.Benchmarkable#setBenchmarkId(java.lang.String)
323    */
324   @Override
325   public void setBenchmarkId(String benchmarkId) {
326     this.benchmarkId = benchmarkId;
327   }
328 
329   private String inputASName;
330   private String outputASName;
331 }//public class SentenceSplitter extends Nerc