1   package gate.creole.tokeniser.chinesetokeniser;
2   
3   /*
4    *  ChineseTokeniser.java
5    *
6    * Copyright (c) 1998-2004, The University of Sheffield.
7    *
8    * This file is part of GATE (see http://gate.ac.uk/), and is free
9    * software, licenced under the GNU Library General Public License,
10   * Version 2, June1991.
11   *
12   * A copy of this licence is included in the distribution in the file
13   * licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
14   *
15   *  m2na2, 13/10/2003
16   *
17   *  $Id: ChineseTokeniser.java,v 1.8 2004/07/21 17:10:06 akshay Exp $
18   */
19  
20  import java.util.*;
21  
22  import gate.*;
23  import gate.creole.*;
24  import gate.creole.tokeniser.SimpleTokeniser;
25  import gate.util.*;
26  
27  /**
28   * <p>Title: ChineseTokeniser.java </p>
29   * <p>Description: This class is a wrapper for segmenter.</p>
30   * <p> Tokenises a Chinese document using the Chinesse segmenter</p>
31   * @author Niraj Aswani
32   * @version 1.0
33   */
34  public class ChineseTokeniser
35      extends AbstractLanguageAnalyser
36      implements ProcessingResource {
37  
38    /** Instance of segmenter */
39    private Segmenter segmenter;
40  
41    /** The name of the encoding used */
42    private String encoding;
43  
44    /** The name of the sourceFile */
45    private gate.Document document;
46  
47    /** Temporary document */
48    private gate.Document tempDoc;
49  
50    /** Instance of Simple Tokenizer */
51    private SimpleTokeniser tokeniser;
52  
53    /** Boolean value which states if segmenter should run */
54    private Boolean runSegmenter;
55  
56    /** Boolean value which says if tokeniser has to generate the spack tokens */
57    private Boolean generateSpaceTokens;
58  
59    /** Rules for the simple tokeniser */
60    private java.net.URL rulesURL;
61  
62    private String annotationSetName;
63  
64    private int charform;
65  
66    /** Default Constructor */
67    public ChineseTokeniser() {
68  
69    }
70  
71    public Resource init() throws ResourceInstantiationException {
72      fireProgressChanged(0);
73      fireStatusChanged("Loading Data Files...");
74      // check the encoging parameters
75      if (encoding == null) {
76        // setting the default parameter for encoding
77        encoding = "UTF8";
78      }
79      else {
80        if (encoding.equals("BIG5")) {
81          charform = Segmenter.TRAD;
82        }
83        else if (encoding.equals("GBK")) {
84          charform = Segmenter.SIMP;
85        }
86        else if (encoding.equals("UTF8")) {
87          charform = Segmenter.BOTH;
88        }
89        else {
90          // setting the default parameter for encoding
91          encoding = "UTF8";
92          charform = Segmenter.BOTH;
93        }
94      }
95  
96      if (rulesURL == null) {
97        throw new ResourceInstantiationException(
98            "No URL provided for the tokeniser rules");
99      }
100     // creating instance of segmenter
101     segmenter = new Segmenter(charform, true);
102     fireProcessFinished();
103 
104     // returning the current resource
105     return this;
106   }
107 
108   /** This method reInitialises the segmenter */
109   public void reInit() throws ResourceInstantiationException {
110     segmenter = new Segmenter(charform, true);
111   }
112 
113   /**
114    * This method gets executed whenever user clicks on the Run button
115    * available in the GATE gui.  It runs the segmenter on the given document
116    * and segments the text by addting spaces or space tokens with 0-length
117    * character (depends on the value of generateSpaceTokens selected by the
118    * user at run time).
119    * @throws ExecutionException
120    */
121   public void execute() throws ExecutionException {
122     // lets start the progress and initialize the progress counter
123     fireProgressChanged(0);
124 
125     // If no document provided to process throw an exception
126     if (document == null) {
127       throw new GateRuntimeException("No document to process!");
128     }
129 
130     String segmentedData = null;
131 
132     // Segmenter should only run if runSegmenter value is true
133     if (runSegmenter.booleanValue()) {
134       // run the segmenter on this text
135       segmentedData = segmenter.segmentData(
136           document.getContent().toString(),
137           encoding);
138     } else {
139       segmentedData = document.getContent().toString();
140     }
141 
142     if(encoding.equals("UTF8")) {
143       encoding = "UTF-8";
144     }
145 
146     // now we need to create a temporary document
147     // and copy all the contents of sourceDoc to this temporary document
148     // so that we'll provide this document to the segmenter
149     try {
150       FeatureMap params = Factory.newFeatureMap();
151       params.put("stringContent", segmentedData);
152       FeatureMap features = Factory.newFeatureMap();
153 
154       // we need to hide the creation of new document on the GUI screen
155       Gate.setHiddenAttribute(features, true);
156 
157       tempDoc = (Document) Factory.createResource("gate.corpora.DocumentImpl",
158                                                   params, features);
159     }
160     catch (ResourceInstantiationException rie) {
161       throw new ExecutionException("Temporary document cannot be created");
162     }
163 
164     // and the get the marks where the spaces in the original document
165     // were added
166     ArrayList marks = null;
167 
168     // Segmenter should only run if runSegmenter value is true
169     if (runSegmenter.booleanValue()) {
170       marks = segmenter.getMarks();
171     }
172 
173     // we need to run the Simple Tokenizer on this
174     FeatureMap features = Factory.newFeatureMap();
175     Gate.setHiddenAttribute(features, true);
176 
177     // set the parameters
178     FeatureMap params = Factory.newFeatureMap();
179     params.put("rulesURL", rulesURL);
180     params.put("encoding", encoding);
181     params.put("document", tempDoc);
182     params.put("annotationSetName", annotationSetName);
183 
184     try {
185       tokeniser = (gate.creole.tokeniser.SimpleTokeniser) Factory.
186           createResource(
187           "gate.creole.tokeniser.SimpleTokeniser", params, features);
188     }
189     catch (ResourceInstantiationException rie) {
190       throw new ExecutionException(
191           "Instance of SimpleTokeniser cannot be created");
192     }
193 
194     // so now run the tokeniser
195     tokeniser.execute();
196 
197     // so space tokens have been added, now we need to map all the newly added
198     // features from tokeniser to the new document to the original one
199     AnnotationSet anns;
200     AnnotationSet original;
201 
202     if(annotationSetName == null || annotationSetName.length() == 0) {
203       anns = tempDoc.getAnnotations();
204       original = document.getAnnotations();
205     } else {
206       anns = tempDoc.getAnnotations(annotationSetName);
207       original = document.getAnnotations(annotationSetName);
208     }
209 
210     List tokens = new ArrayList(anns.get());
211     Comparator offsetComparator = new OffsetComparator();
212     Collections.sort(tokens, offsetComparator);
213     Iterator tokenIter = tokens.iterator();
214 
215 
216     // to make the process faster, lets copy all the marks into the long array
217     long[] markValues = (runSegmenter.booleanValue())? new long[marks.size()] : null;
218     if(markValues != null) {
219       for (int i = 0; i < marks.size(); i++) {
220         markValues[i] = ( (Long) marks.get(i)).longValue();
221       }
222       Arrays.sort(markValues);
223     }
224 
225     // and finally transfer the annotations
226     while (tokenIter.hasNext()) {
227       Annotation currentToken = ( (Annotation) tokenIter.next());
228       long startOffset =
229           currentToken.getStartNode().getOffset().longValue();
230       long endOffset =
231           currentToken.getEndNode().getOffset().longValue();
232 
233       // search how many chinese splits are before the current annotation
234       int index = (markValues == null) ? -1 : Arrays.binarySearch(markValues, startOffset);
235       if (index >= 0) {
236         // it is a chinese split
237         if (generateSpaceTokens.booleanValue()) {
238           try {
239             FeatureMap newFeatures = Factory.newFeatureMap();
240             newFeatures.put("kind", "ChineseSplit");
241             original.add(new Long(startOffset - index),
242                          new Long(startOffset - index),
243                          SPACE_TOKEN_ANNOTATION_TYPE, newFeatures);
244           }
245           catch (InvalidOffsetException ioe) {
246             throw new ExecutionException("Offset Error");
247           }
248         }
249 
250       }
251       else {
252         index = Math.abs(index) - 1;
253 
254         // it is not a chinese split but some other token
255         // lets add this annotation to the original document
256         String annotSetName = currentToken.getType();
257         FeatureMap newFeatureMap = currentToken.getFeatures();
258         try {
259           original.add(new Long(startOffset - index),
260                        new Long(endOffset - index), annotSetName,
261                        newFeatureMap);
262         }
263         catch (InvalidOffsetException ioe) {
264           throw new ExecutionException(
265               "Problem with the invalid offset while adding annotations" +
266               "to the original document");
267         }
268       }
269     }
270     // and finally remove the temporary Document
271     Factory.deleteResource(tempDoc);
272 
273     // process finished, acknowledge user about this.
274     fireProcessFinished();
275   }
276 
277   // getter and setter method
278 
279   /**
280    * Sets the boolean parameter which states if segmenter should run
281    * @param runSegmenter
282    */
283   public void setRunSegmenter(Boolean runSegmenter) {
284     this.runSegmenter = runSegmenter;
285   }
286 
287 
288   /** Gets the boolean parameter which states if segmenter should run
289    */
290   public Boolean getRunSegmenter() {
291     return this.runSegmenter;
292   }
293 
294   /**
295    * Sets the boolean parameter which states if segmenter should produce
296    * the space tokens
297    */
298   public void setGenerateSpaceTokens(Boolean value) {
299     this.generateSpaceTokens = value;
300   }
301 
302   /**
303    * Gets the boolean parameter which states if segmenter should produce
304    * the space tokens
305    */
306   public Boolean getGenerateSpaceTokens() {
307     return this.generateSpaceTokens;
308   }
309 
310   /**
311    * Sets the document to be processed
312    * @param document - document to be processed
313    */
314   public void setDocument(gate.Document document) {
315     this.document = document;
316   }
317 
318   /**
319    * Returns the document under process
320    */
321   public gate.Document getDocument() {
322     return this.document;
323   }
324 
325   /**
326    * Sets the encoding to be used.
327    * @param encoding the encoding.
328    */
329   public void setEncoding(String encoding) {
330     this.encoding = encoding;
331   }
332 
333   /**
334    * Returns the document under process
335    */
336   public String getEncoding() {
337     return this.encoding;
338   }
339 
340   /**
341    * URL for the file, which contains rules to be given to the tokeniser
342    * @param rules
343    */
344   public void setRulesURL(java.net.URL rules) {
345     this.rulesURL = rules;
346   }
347 
348   /**
349    * Returns the URL of the file, which contains rules for the tokeniser
350    * @return a {@link java.net.URL} value.
351    */
352   public java.net.URL getRulesURL() {
353     return rulesURL;
354   }
355 
356   /**
357    * AnnotationSet name
358    * @param name Name of the annotation
359    */
360   public void setAnnotationSetName(String name) {
361     this.annotationSetName = name;
362   }
363 
364   /**
365    * Returns the provided annotationset name
366    * @return a {@link String} value.
367    */
368   public String getAnnotationSetName() {
369     return this.annotationSetName;
370   }
371 }