DefaultTokeniser.java
001 package gate.creole.tokeniser;
002 
003 import gate.Factory;
004 import gate.FeatureMap;
005 import gate.Gate;
006 import gate.Resource;
007 import gate.creole.AbstractLanguageAnalyser;
008 import gate.creole.ExecutionException;
009 import gate.creole.ExecutionInterruptedException;
010 import gate.creole.ResourceInstantiationException;
011 import gate.creole.Transducer;
012 import gate.creole.metadata.CreoleParameter;
013 import gate.creole.metadata.CreoleResource;
014 import gate.creole.metadata.Optional;
015 import gate.creole.metadata.RunTime;
016 import gate.event.ProgressListener;
017 import gate.event.StatusListener;
018 import gate.util.Benchmark;
019 import gate.util.Benchmarkable;
020 import gate.util.Out;
021 
022 /**
023  * A composed tokeniser containing a {@link SimpleTokeniser} and a
024  {@link gate.creole.Transducer}.
025  * The simple tokeniser tokenises the document and the transducer processes its
026  * output.
027  */
028 @CreoleResource(name = "ANNIE English Tokeniser", comment = "A customisable English tokeniser.", helpURL = "http://gate.ac.uk/userguide/sec:annie:tokeniser", icon = "tokeniser")
029 public class DefaultTokeniser extends AbstractLanguageAnalyser implements Benchmarkable {
030 
031   private static final long serialVersionUID = 3860943928124433852L;
032 
033   public static final String
034     DEF_TOK_DOCUMENT_PARAMETER_NAME = "document";
035 
036   public static final String
037     DEF_TOK_ANNOT_SET_PARAMETER_NAME = "annotationSetName";
038 
039   public static final String
040     DEF_TOK_TOKRULES_URL_PARAMETER_NAME = "tokeniserRulesURL";
041 
042   public static final String
043     DEF_TOK_GRAMRULES_URL_PARAMETER_NAME = "transducerGrammarURL";
044 
045   public static final String
046     DEF_TOK_ENCODING_PARAMETER_NAME = "encoding";
047 
048   public DefaultTokeniser() {
049   }
050 
051 
052   /** Initialise this resource, and return it. */
053   @Override
054   public Resource init() throws ResourceInstantiationException{
055     try{
056       //init super object
057       super.init();
058       //create all the componets
059       FeatureMap params;
060       FeatureMap features;
061 
062       params = Factory.newFeatureMap();
063       if(tokeniserRulesURL != null)
064         params.put(SimpleTokeniser.SIMP_TOK_RULES_URL_PARAMETER_NAME,
065                    tokeniserRulesURL);
066       params.put(SimpleTokeniser.SIMP_TOK_ENCODING_PARAMETER_NAME, encoding);
067 
068       if (tokeniser == null) {
069         //tokeniser
070         fireStatusChanged("Creating a tokeniser");
071         if(DEBUGOut.prln("Parameters for the tokeniser: \n" + params);
072         features = Factory.newFeatureMap();
073         Gate.setHiddenAttribute(features, true);
074         tokeniser = (SimpleTokeniser)Factory.createResource(
075                 "gate.creole.tokeniser.SimpleTokeniser",
076                 params, features);
077         tokeniser.setName("Tokeniser " + System.currentTimeMillis());
078       }
079       else {
080         tokeniser.setParameterValues(params);
081         tokeniser.reInit();
082       }
083       
084       fireProgressChanged(50);
085 
086       params = Factory.newFeatureMap();
087       if(transducerGrammarURL != null)
088         params.put(Transducer.TRANSD_GRAMMAR_URL_PARAMETER_NAME,
089                 transducerGrammarURL);
090       params.put(Transducer.TRANSD_ENCODING_PARAMETER_NAME, encoding);
091 
092       if (transducer == null) {
093         //transducer
094         fireStatusChanged("Creating a Jape transducer");
095         if(DEBUGOut.prln("Parameters for the transducer: \n" + params);
096         features = Factory.newFeatureMap();
097         Gate.setHiddenAttribute(features, true);
098         transducer = (Transducer)Factory.createResource("gate.creole.Transducer",
099                 params, features);
100         transducer.setName("Transducer " + System.currentTimeMillis());
101       }
102       else {
103         transducer.setParameterValues(params);
104         transducer.reInit();
105       }
106       fireProgressChanged(100);
107       fireProcessFinished();
108       
109     }catch(ResourceInstantiationException rie){
110       throw rie;
111     }catch(Exception e){
112       throw new ResourceInstantiationException(e);
113     }
114     return this;
115   }
116   
117   @Override
118   public void cleanup() {
119     Factory.deleteResource(transducer);
120     Factory.deleteResource(tokeniser);
121   }
122 
123   @Override
124   public void execute() throws ExecutionException {
125     interrupted = false;
126 
127     FeatureMap params = null;
128     fireProgressChanged(0);
129 
130     ProgressListener pListener = null;
131     StatusListener sListener = null;
132 
133     try {
134 
135       // tokeniser
136       params = Factory.newFeatureMap();
137       params.put(SimpleTokeniser.SIMP_TOK_DOCUMENT_PARAMETER_NAME, document);
138       params.put(SimpleTokeniser.SIMP_TOK_ANNOT_SET_PARAMETER_NAME,
139               annotationSetName);
140       tokeniser.setParameterValues(params);
141 
142       pListener = new IntervalProgressListener(050);
143       sListener = new StatusListener() {
144         @Override
145         public void statusChanged(String text) {
146           fireStatusChanged(text);
147         }
148       };
149 
150       tokeniser.addProgressListener(pListener);
151       tokeniser.addStatusListener(sListener);
152 
153       Benchmark.executeWithBenchmarking(tokeniser,
154               Benchmark.createBenchmarkId("simpleTokeniser", getBenchmarkId()),
155               this, null);
156 
157     catch(Exception e) {
158       throw new ExecutionException("The execution of the \"" + getName()
159               "\" tokeniser has been abruptly interrupted!", e);
160     finally {
161       tokeniser.removeProgressListener(pListener);
162       tokeniser.removeStatusListener(sListener);
163       tokeniser.setDocument(null);
164     }
165 
166     if(isInterrupted())
167       throw new ExecutionInterruptedException("The execution of the \""
168               + getName() "\" tokeniser has been abruptly interrupted!");
169 
170     try {
171       // transducer
172       params = Factory.newFeatureMap();
173       params.put(Transducer.TRANSD_DOCUMENT_PARAMETER_NAME, document);
174       params.put(Transducer.TRANSD_INPUT_AS_PARAMETER_NAME, annotationSetName);
175       params.put(Transducer.TRANSD_OUTPUT_AS_PARAMETER_NAME, annotationSetName);
176       transducer.setParameterValues(params);
177 
178       pListener = new IntervalProgressListener(50100);
179       transducer.addProgressListener(pListener);
180       transducer.addStatusListener(sListener);
181 
182       Benchmark.executeWithBenchmarking(transducer,
183               Benchmark.createBenchmarkId("transducer", getBenchmarkId()),
184               this, null);
185 
186     catch(Exception e) {
187       throw new ExecutionException("The execution of the \"" + getName()
188               "\" tokeniser has been abruptly interrupted!", e);
189     finally {
190       transducer.removeProgressListener(pListener);
191       transducer.removeStatusListener(sListener);
192       transducer.setDocument(null);
193     }
194   }// execute
195 
196 
197   /**
198    * Notifies all the PRs in this controller that they should stop their
199    * execution as soon as possible.
200    */
201   @Override
202   public synchronized void interrupt(){
203     interrupted = true;
204     tokeniser.interrupt();
205     transducer.interrupt();
206   }
207 
208   @CreoleParameter(defaultValue="resources/tokeniser/DefaultTokeniser.rules", comment="The URL to the rules file", suffixes="rules")
209   public void setTokeniserRulesURL(java.net.URL tokeniserRulesURL) {
210     this.tokeniserRulesURL = tokeniserRulesURL;
211   }
212   public java.net.URL getTokeniserRulesURL() {
213     return tokeniserRulesURL;
214   }
215   
216   @CreoleParameter(defaultValue="UTF-8", comment="The encoding used for reading the definitions")
217   public void setEncoding(String encoding) {
218     this.encoding = encoding;
219   }
220   public String getEncoding() {
221     return encoding;
222   }
223   
224   @CreoleParameter(defaultValue="resources/tokeniser/postprocess.jape", comment="The URL to the postprocessing transducer", suffixes="jape")
225   public void setTransducerGrammarURL(java.net.URL transducerGrammarURL) {
226     this.transducerGrammarURL = transducerGrammarURL;
227   }
228   public java.net.URL getTransducerGrammarURL() {
229     return transducerGrammarURL;
230   }
231  // init()
232 
233   private static final boolean DEBUG = false;
234 
235   /** the simple tokeniser used for tokenisation*/
236   protected SimpleTokeniser tokeniser;
237 
238   /** the transducer used for post-processing*/
239   protected Transducer transducer;
240   private java.net.URL tokeniserRulesURL;
241   private String encoding;
242   private java.net.URL transducerGrammarURL;
243   private String annotationSetName;
244   private String benchmarkId;
245 
246   @RunTime
247   @Optional
248   @CreoleParameter(comment="The annotation set to be used for the generated annotations")
249   public void setAnnotationSetName(String annotationSetName) {
250     this.annotationSetName = annotationSetName;
251   }
252   public String getAnnotationSetName() {
253     return annotationSetName;
254   }
255   
256   @Override
257   public void setBenchmarkId(String benchmarkId) {
258     this.benchmarkId = benchmarkId;
259   }
260   
261   @Override
262   public String getBenchmarkId() {
263     if(benchmarkId == null) {
264       return getName();
265     }
266     else {
267       return benchmarkId;
268     }
269   }
270 }