1   /*
2    *  Copyright (c) 1998-2004, The University of Sheffield.
3    *
4    *  This file is part of GATE (see http://gate.ac.uk/), and is free
5    *  software, licenced under the GNU Library General Public License,
6    *  Version 2, June 1991 (in the distribution as file licence.html,
7    *  and also available at http://gate.ac.uk/gate/licence.html).
8    *
9    *  Valentin Tablan, 01 Feb 2000
10   *
11   *  $Id: SentenceSplitter.java,v 1.27 2004/07/21 17:10:06 akshay Exp $
12   */
13  
14  
15  package gate.creole.splitter;
16  
17  import gate.*;
18  import gate.creole.*;
19  import gate.creole.gazetteer.DefaultGazetteer;
20  import gate.event.ProgressListener;
21  import gate.event.StatusListener;
22  import gate.util.InvalidOffsetException;
23  /**
24   * A sentence splitter. This is module contains a tokeniser, a
25   * gazetteer and a Jape grammar. This class is used so we can have a different
26   * entry in the creole.xml file describing the default resources and to add
27   * some minor processing after running the components in order to extract the
28   * results in a usable form.
29   */
30  public class SentenceSplitter extends AbstractLanguageAnalyser{
31  
32    public static final String
33      SPLIT_DOCUMENT_PARAMETER_NAME = "document";
34  
35    public static final String
36      SPLIT_INPUT_AS_PARAMETER_NAME = "inputASName";
37  
38    public static final String
39      SPLIT_OUTPUT_AS_PARAMETER_NAME = "outputASName";
40  
41    public static final String
42      SPLIT_ENCODING_PARAMETER_NAME = "encoding";
43  
44    public static final String
45      SPLIT_GAZ_URL_PARAMETER_NAME = "gazetteerListsURL";
46  
47    public static final String
48      SPLIT_TRANSD_URL_PARAMETER_NAME = "transducerURL";
49  
50    public Resource init()throws ResourceInstantiationException{
51      //create all the componets
52      FeatureMap params;
53      FeatureMap features;
54  
55      //gazetteer
56      fireStatusChanged("Creating the gazetteer");
57      params = Factory.newFeatureMap();
58      if(gazetteerListsURL != null)
59        params.put(DefaultGazetteer.DEF_GAZ_LISTS_URL_PARAMETER_NAME,
60                                               gazetteerListsURL);
61      params.put(DefaultGazetteer.DEF_GAZ_ENCODING_PARAMETER_NAME, encoding);
62      features = Factory.newFeatureMap();
63      Gate.setHiddenAttribute(features, true);
64  
65  
66      gazetteer = (DefaultGazetteer)Factory.createResource(
67                      "gate.creole.gazetteer.DefaultGazetteer",
68                      params, features);
69      gazetteer.setName("Gazetteer " + System.currentTimeMillis());
70      fireProgressChanged(10);
71  
72      //transducer
73      fireStatusChanged("Creating the JAPE transducer");
74  
75      params = Factory.newFeatureMap();
76      if(transducerURL != null)
77        params.put(Transducer.TRANSD_GRAMMAR_URL_PARAMETER_NAME, transducerURL);
78      params.put(Transducer.TRANSD_ENCODING_PARAMETER_NAME, encoding);
79      features = Factory.newFeatureMap();
80      Gate.setHiddenAttribute(features, true);
81  
82      transducer = (Transducer)Factory.createResource(
83                      "gate.creole.Transducer",
84                      params, features);
85      transducer.setName("Transducer " + System.currentTimeMillis());
86  
87      fireProgressChanged(100);
88      fireProcessFinished();
89  
90      return this;
91    }
92  
93    public void execute() throws ExecutionException{
94      interrupted = false;
95      //set the runtime parameters
96      FeatureMap params;
97      if(inputASName != null && inputASName.equals("")) inputASName = null;
98      if(outputASName != null && outputASName.equals("")) outputASName = null;
99      try{
100       fireProgressChanged(0);
101       params = Factory.newFeatureMap();
102       params.put(DefaultGazetteer.DEF_GAZ_DOCUMENT_PARAMETER_NAME, document);
103       params.put(DefaultGazetteer.DEF_GAZ_ANNOT_SET_PARAMETER_NAME, inputASName);
104       gazetteer.setParameterValues(params);
105 
106       params = Factory.newFeatureMap();
107       params.put(Transducer.TRANSD_DOCUMENT_PARAMETER_NAME, document);
108       params.put(Transducer.TRANSD_INPUT_AS_PARAMETER_NAME, inputASName);
109       params.put(Transducer.TRANSD_OUTPUT_AS_PARAMETER_NAME, inputASName);
110       transducer.setParameterValues(params);
111     }catch(Exception e){
112       throw new ExecutionException(e);
113     }
114     ProgressListener pListener = null;
115     StatusListener sListener = null;
116     fireProgressChanged(5);
117 
118     //run the gazetteer
119     if(isInterrupted()) throw new ExecutionInterruptedException(
120         "The execution of the \"" + getName() +
121         "\" sentence splitter has been abruptly interrupted!");
122     pListener = new IntervalProgressListener(5, 10);
123     sListener = new StatusListener(){
124       public void statusChanged(String text){
125         fireStatusChanged(text);
126       }
127     };
128     gazetteer.addProgressListener(pListener);
129     gazetteer.addStatusListener(sListener);
130     gazetteer.execute();
131     gazetteer.removeProgressListener(pListener);
132     gazetteer.removeStatusListener(sListener);
133 
134     //run the transducer
135     if(isInterrupted()) throw new ExecutionInterruptedException(
136         "The execution of the \"" + getName() +
137         "\" sentence splitter has been abruptly interrupted!");
138     pListener = new IntervalProgressListener(11, 90);
139     transducer.addProgressListener(pListener);
140     transducer.addStatusListener(sListener);
141     transducer.execute();
142     transducer.removeProgressListener(pListener);
143     transducer.removeStatusListener(sListener);
144 
145     //get pointers to the annotation sets
146     AnnotationSet inputAS = (inputASName == null) ?
147                             document.getAnnotations() :
148                             document.getAnnotations(inputASName);
149 
150     AnnotationSet outputAS = (outputASName == null) ?
151                              document.getAnnotations() :
152                              document.getAnnotations(outputASName);
153 
154     //copy the results to the output set if they are different
155     if(inputAS != outputAS){
156       outputAS.addAll(inputAS.get(SENTENCE_ANNOTATION_TYPE));
157     }
158 
159     //create one big sentence if none were found
160     AnnotationSet sentences = outputAS.get(SENTENCE_ANNOTATION_TYPE);
161     if(sentences == null || sentences.isEmpty()){
162       outputAS.add(outputAS.firstNode(), outputAS.lastNode(),
163                    SENTENCE_ANNOTATION_TYPE,
164                    Factory.newFeatureMap());;
165     }else{
166       //add a sentence covering all the tokens after the last sentence
167       Long endSentences = sentences.lastNode().getOffset();
168       AnnotationSet remainingTokens = inputAS.get(TOKEN_ANNOTATION_TYPE, endSentences,
169                                                   inputAS.lastNode().getOffset());
170       if(remainingTokens != null && !remainingTokens.isEmpty()){
171         try{
172           outputAS.add(remainingTokens.firstNode().getOffset(),
173                        remainingTokens.lastNode().getOffset(),
174                        SENTENCE_ANNOTATION_TYPE,
175                        Factory.newFeatureMap());
176         }catch(InvalidOffsetException ioe){
177           throw new ExecutionException(ioe);
178         }
179       }
180     }
181     fireProcessFinished();
182   }//execute()
183 
184   /**
185    * Notifies all the PRs in this controller that they should stop their
186    * execution as soon as possible.
187    */
188   public synchronized void interrupt(){
189     interrupted = true;
190     gazetteer.interrupt();
191     transducer.interrupt();
192   }
193 
194   public void setTransducerURL(java.net.URL newTransducerURL) {
195     transducerURL = newTransducerURL;
196   }
197   public java.net.URL getTransducerURL() {
198     return transducerURL;
199   }
200   DefaultGazetteer gazetteer;
201   Transducer transducer;
202   private java.net.URL transducerURL;
203   private String encoding;
204   private java.net.URL gazetteerListsURL;
205 
206 
207   public void setEncoding(String newEncoding) {
208     encoding = newEncoding;
209   }
210   public String getEncoding() {
211     return encoding;
212   }
213   public void setGazetteerListsURL(java.net.URL newGazetteerListsURL) {
214     gazetteerListsURL = newGazetteerListsURL;
215   }
216   public java.net.URL getGazetteerListsURL() {
217     return gazetteerListsURL;
218   }
219   public void setInputASName(String newInputASName) {
220     inputASName = newInputASName;
221   }
222 
223   public String getInputASName() {
224     return inputASName;
225   }
226   public void setOutputASName(String newOutputASName) {
227     outputASName = newOutputASName;
228   }
229   public String getOutputASName() {
230     return outputASName;
231   }
232 
233 
234 
235   private static final boolean DEBUG = false;
236   private String inputASName;
237   private String outputASName;
238 }//public class SentenceSplitter extends Nerc