|
SentenceSplitter |
|
1 /* 2 * Copyright (c) 1998-2001, The University of Sheffield. 3 * 4 * This file is part of GATE (see http://gate.ac.uk/), and is free 5 * software, licenced under the GNU Library General Public License, 6 * Version 2, June 1991 (in the distribution as file licence.html, 7 * and also available at http://gate.ac.uk/gate/licence.html). 8 * 9 * Valentin Tablan, 01 Feb 2000 10 * 11 * $Id: SentenceSplitter.java,v 1.24 2002/03/06 17:15:44 kalina Exp $ 12 */ 13 14 15 package gate.creole.splitter; 16 17 import gate.*; 18 import gate.util.*; 19 import gate.event.*; 20 import gate.creole.tokeniser.*; 21 import gate.creole.gazetteer.*; 22 import gate.creole.*; 23 24 import java.util.*; 25 /** 26 * A sentence splitter. This is module similar to a 27 * {@link gate.creole.nerc.Nerc} in the fact that it conatins a tokeniser, a 28 * gazetteer and a Jape grammar. This class is used so we can have a different 29 * entry in the creole.xml file describing the default resources and to add 30 * some minor processing after running the components in order to extract the 31 * results in a usable form. 32 */ 33 public class SentenceSplitter extends AbstractLanguageAnalyser{ 34 35 public static final String 36 SPLIT_DOCUMENT_PARAMETER_NAME = "document"; 37 38 public static final String 39 SPLIT_INPUT_AS_PARAMETER_NAME = "inputASName"; 40 41 public static final String 42 SPLIT_OUTPUT_AS_PARAMETER_NAME = "outputASName"; 43 44 public static final String 45 SPLIT_ENCODING_PARAMETER_NAME = "encoding"; 46 47 public static final String 48 SPLIT_GAZ_URL_PARAMETER_NAME = "gazetteerListsURL"; 49 50 public static final String 51 SPLIT_TRANSD_URL_PARAMETER_NAME = "transducerURL"; 52 53 public Resource init()throws ResourceInstantiationException{ 54 //create all the componets 55 FeatureMap params; 56 FeatureMap features; 57 58 //gazetteer 59 fireStatusChanged("Creating the gazetteer"); 60 params = Factory.newFeatureMap(); 61 if(gazetteerListsURL != null) 62 params.put(DefaultGazetteer.DEF_GAZ_LISTS_URL_PARAMETER_NAME, 63 gazetteerListsURL); 64 params.put(DefaultGazetteer.DEF_GAZ_ENCODING_PARAMETER_NAME, encoding); 65 features = Factory.newFeatureMap(); 66 Gate.setHiddenAttribute(features, true); 67 68 69 gazetteer = (DefaultGazetteer)Factory.createResource( 70 "gate.creole.gazetteer.DefaultGazetteer", 71 params, features); 72 gazetteer.setName("Gazetteer " + System.currentTimeMillis()); 73 fireProgressChanged(10); 74 75 //transducer 76 fireStatusChanged("Creating the JAPE transducer"); 77 78 params = Factory.newFeatureMap(); 79 if(transducerURL != null) 80 params.put(Transducer.TRANSD_GRAMMAR_URL_PARAMETER_NAME, transducerURL); 81 params.put(Transducer.TRANSD_ENCODING_PARAMETER_NAME, encoding); 82 features = Factory.newFeatureMap(); 83 Gate.setHiddenAttribute(features, true); 84 85 transducer = (Transducer)Factory.createResource( 86 "gate.creole.Transducer", 87 params, features); 88 transducer.setName("Transducer " + System.currentTimeMillis()); 89 90 fireProgressChanged(100); 91 fireProcessFinished(); 92 93 return this; 94 } 95 96 public void execute() throws ExecutionException{ 97 interrupted = false; 98 //set the runtime parameters 99 FeatureMap params; 100 if(inputASName != null && inputASName.equals("")) inputASName = null; 101 if(outputASName != null && outputASName.equals("")) outputASName = null; 102 try{ 103 fireProgressChanged(0); 104 params = Factory.newFeatureMap(); 105 params.put(DefaultGazetteer.DEF_GAZ_DOCUMENT_PARAMETER_NAME, document); 106 params.put(DefaultGazetteer.DEF_GAZ_ANNOT_SET_PARAMETER_NAME, inputASName); 107 gazetteer.setParameterValues(params); 108 109 params = Factory.newFeatureMap(); 110 params.put(Transducer.TRANSD_DOCUMENT_PARAMETER_NAME, document); 111 params.put(Transducer.TRANSD_INPUT_AS_PARAMETER_NAME, inputASName); 112 params.put(Transducer.TRANSD_OUTPUT_AS_PARAMETER_NAME, inputASName); 113 transducer.setParameterValues(params); 114 }catch(Exception e){ 115 throw new ExecutionException(e); 116 } 117 ProgressListener pListener = null; 118 StatusListener sListener = null; 119 fireProgressChanged(5); 120 121 //run the gazetteer 122 if(isInterrupted()) throw new ExecutionInterruptedException( 123 "The execution of the \"" + getName() + 124 "\" sentence splitter has been abruptly interrupted!"); 125 pListener = new IntervalProgressListener(5, 10); 126 sListener = new StatusListener(){ 127 public void statusChanged(String text){ 128 fireStatusChanged(text); 129 } 130 }; 131 gazetteer.addProgressListener(pListener); 132 gazetteer.addStatusListener(sListener); 133 gazetteer.execute(); 134 gazetteer.removeProgressListener(pListener); 135 gazetteer.removeStatusListener(sListener); 136 137 //run the transducer 138 if(isInterrupted()) throw new ExecutionInterruptedException( 139 "The execution of the \"" + getName() + 140 "\" sentence splitter has been abruptly interrupted!"); 141 pListener = new IntervalProgressListener(11, 90); 142 transducer.addProgressListener(pListener); 143 transducer.addStatusListener(sListener); 144 transducer.execute(); 145 transducer.removeProgressListener(pListener); 146 transducer.removeStatusListener(sListener); 147 148 //get pointers to the annotation sets 149 AnnotationSet inputAS = (inputASName == null) ? 150 document.getAnnotations() : 151 document.getAnnotations(inputASName); 152 153 AnnotationSet outputAS = (outputASName == null) ? 154 document.getAnnotations() : 155 document.getAnnotations(outputASName); 156 157 //copy the results to the output set if they are different 158 if(inputAS != outputAS){ 159 outputAS.addAll(inputAS.get(SENTENCE_ANNOTATION_TYPE)); 160 } 161 162 //create one big sentence if none were found 163 AnnotationSet sentences = outputAS.get(SENTENCE_ANNOTATION_TYPE); 164 if(sentences == null || sentences.isEmpty()){ 165 outputAS.add(outputAS.firstNode(), outputAS.lastNode(), 166 SENTENCE_ANNOTATION_TYPE, 167 Factory.newFeatureMap());; 168 }else{ 169 //add a sentence covering all the tokens after the last sentence 170 Long endSentences = sentences.lastNode().getOffset(); 171 AnnotationSet remainingTokens = inputAS.get(TOKEN_ANNOTATION_TYPE, endSentences, 172 inputAS.lastNode().getOffset()); 173 if(remainingTokens != null && !remainingTokens.isEmpty()){ 174 try{ 175 outputAS.add(remainingTokens.firstNode().getOffset(), 176 remainingTokens.lastNode().getOffset(), 177 SENTENCE_ANNOTATION_TYPE, 178 Factory.newFeatureMap()); 179 }catch(InvalidOffsetException ioe){ 180 throw new ExecutionException(ioe); 181 } 182 } 183 } 184 fireProcessFinished(); 185 }//execute() 186 187 /** 188 * Notifies all the PRs in this controller that they should stop their 189 * execution as soon as possible. 190 */ 191 public synchronized void interrupt(){ 192 interrupted = true; 193 gazetteer.interrupt(); 194 transducer.interrupt(); 195 } 196 197 public void setTransducerURL(java.net.URL newTransducerURL) { 198 transducerURL = newTransducerURL; 199 } 200 public java.net.URL getTransducerURL() { 201 return transducerURL; 202 } 203 DefaultGazetteer gazetteer; 204 Transducer transducer; 205 private java.net.URL transducerURL; 206 private String encoding; 207 private java.net.URL gazetteerListsURL; 208 209 210 public void setEncoding(String newEncoding) { 211 encoding = newEncoding; 212 } 213 public String getEncoding() { 214 return encoding; 215 } 216 public void setGazetteerListsURL(java.net.URL newGazetteerListsURL) { 217 gazetteerListsURL = newGazetteerListsURL; 218 } 219 public java.net.URL getGazetteerListsURL() { 220 return gazetteerListsURL; 221 } 222 public void setInputASName(String newInputASName) { 223 inputASName = newInputASName; 224 } 225 226 public String getInputASName() { 227 return inputASName; 228 } 229 public void setOutputASName(String newOutputASName) { 230 outputASName = newOutputASName; 231 } 232 public String getOutputASName() { 233 return outputASName; 234 } 235 236 237 238 private static final boolean DEBUG = false; 239 private String inputASName; 240 private String outputASName; 241 }//public class SentenceSplitter extends Nerc
|
SentenceSplitter |
|