1   /*
 *  SimpleTokeniser.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Valentin Tablan, 2000
12   *
13   *  $Id: SimpleTokeniser.java,v 1.14 2003/03/10 13:10:54 valyt Exp $
14   */
15  
16  package gate.creole.tokeniser;
17  
18  import java.util.*;
19  import java.io.*;
20  import java.net.*;
21  import java.lang.reflect.*;
22  
23  import gate.*;
24  import gate.creole.*;
25  import gate.event.*;
26  import gate.util.*;
27  
28  //import EDU.auburn.VGJ.graph.ParseError;
29  
30  /** Implementation of a Unicode rule based tokeniser.
 * The tokeniser gets its rules from a file, an {@link java.io.InputStream
 * InputStream} or a {@link java.io.Reader Reader} which should be sent to one
 * of the constructors.
 * The implementation is based on a finite state machine that is built from
 * the set of rules.
 * A rule has two sides, the left hand side (LHS) and the right hand side (RHS)
37   * that are separated by the ">" character. The LHS represents a
38   * regular expression that will be matched against the input while the RHS
39   * describes a Gate2 annotation in terms of annotation type and attribute-value
40   * pairs.
 * The matching is done using Unicode enumerated types as defined by the {@link
 * java.lang.Character Character} class. At the time of writing this class the
 * supported Unicode categories were:
44   * <ul>
45   * <li>UNASSIGNED
46   * <li>UPPERCASE_LETTER
47   * <li>LOWERCASE_LETTER
48   * <li>TITLECASE_LETTER
49   * <li>MODIFIER_LETTER
50   * <li>OTHER_LETTER
51   * <li>NON_SPACING_MARK
52   * <li>ENCLOSING_MARK
53   * <li>COMBINING_SPACING_MARK
54   * <li>DECIMAL_DIGIT_NUMBER
55   * <li>LETTER_NUMBER
56   * <li>OTHER_NUMBER
57   * <li>SPACE_SEPARATOR
58   * <li>LINE_SEPARATOR
59   * <li>PARAGRAPH_SEPARATOR
60   * <li>CONTROL
61   * <li>FORMAT
62   * <li>PRIVATE_USE
63   * <li>SURROGATE
64   * <li>DASH_PUNCTUATION
65   * <li>START_PUNCTUATION
66   * <li>END_PUNCTUATION
67   * <li>CONNECTOR_PUNCTUATION
68   * <li>OTHER_PUNCTUATION
69   * <li>MATH_SYMBOL
70   * <li>CURRENCY_SYMBOL
71   * <li>MODIFIER_SYMBOL
72   * <li>OTHER_SYMBOL
73   * </ul>
 * The accepted operators for the LHS are "+", "*" and "|" having the usual
 * interpretations of "1 to n occurrences", "0 to n occurrences" and
76   * "boolean OR".
77   * For instance this is a valid LHS:
78   * <br>"UPPERCASE_LETTER" "LOWERCASE_LETTER"+
79   * <br>meaning an uppercase letter followed by one or more lowercase letters.
80   *
81   * The RHS describes an annotation that is to be created and inserted in the
82   * annotation set provided in case of a match. The new annotation will span the
 * text that has been recognised. The RHS consists of the annotation type
84   * followed by pairs of attributes and associated values.
85   * E.g. for the LHS above a possible RHS can be:<br>
86   * Token;kind=upperInitial;<br>
87   * representing an annotation of type &quot;Token&quot; having one attribute
88   * named &quot;kind&quot; with the value &quot;upperInitial&quot;<br>
 * The entire rule will be:<br>
90   * <pre>"UPPERCASE_LETTER" "LOWERCASE_LETTER"+ > Token;kind=upperInitial;</pre>
91   * <br>
92   * The tokeniser ignores all the empty lines or the ones that start with # or
93   * //.
94   *
95   */
96  public class SimpleTokeniser extends AbstractLanguageAnalyser{
  /** Name of the parameter holding the document to be tokenised
   * (used in {@link #execute()}). */
  public static final String
    SIMP_TOK_DOCUMENT_PARAMETER_NAME = "document";

  /** Name of the parameter naming the annotation set that receives the
   * new Token annotations. */
  public static final String
    SIMP_TOK_ANNOT_SET_PARAMETER_NAME = "annotationSetName";

  /** Name of the parameter holding the URL of the rules file read by
   * {@link #init()}. */
  public static final String
    SIMP_TOK_RULES_URL_PARAMETER_NAME = "rulesURL";

  /** Name of the parameter holding the character encoding used when
   * reading the rules file. */
  public static final String
    SIMP_TOK_ENCODING_PARAMETER_NAME = "encoding";

  /** Debug flag
   */
  private static final boolean DEBUG = false;

  /**
   * Creates a tokeniser. All real initialisation happens in {@link #init()}.
   */
  public SimpleTokeniser(){
  }
118 
119   /**
120    * Initialises this tokeniser by reading the rules from an external source (provided through an URL) and building
121    * the finite state machine at the core of the tokeniser.
122    *
123    * @exception ResourceInstantiationException
124    */
125   public Resource init() throws ResourceInstantiationException{
126     Reader rulesReader;
127     try{
128       if(rulesURL != null){
129         rulesReader = new InputStreamReader(rulesURL.openStream(), encoding);
130       }else{
131         //no init data, Scream!
132         throw new ResourceInstantiationException(
133           "No URL provided for the rules!");
134       }
135       initialState = new FSMState(this);
136       BufferedReader bRulesReader = new BufferedReader(rulesReader);
137       String line = bRulesReader.readLine();
138       ///String toParse = "";
139       StringBuffer toParse = new StringBuffer(Gate.STRINGBUFFER_SIZE);
140 
141       while (line != null){
142         if(line.endsWith("\\")){
143           ///toParse += line.substring(0,line.length()-1);
144           toParse.append(line.substring(0,line.length()-1));
145         }else{
146           /*toParse += line;
147           parseRule(toParse);
148           toParse = "";
149           */
150           toParse.append(line);
151           parseRule(toParse.toString());
152           toParse.delete(0,toParse.length());
153         }
154         line = bRulesReader.readLine();
155       }
156       eliminateVoidTransitions();
157     }catch(java.io.IOException ioe){
158       throw new ResourceInstantiationException(ioe);
159     }catch(TokeniserException te){
160       throw new ResourceInstantiationException(te);
161     }
162     return this;
163   }
164 
165   /**
166    * Prepares this Processing resource for a new run.
167    */
168   public void reset(){
169     document = null;
170   }
171 
172   /** Parses one input line containing a tokeniser rule.
173    * This will create the necessary FSMState objects and the links
174    * between them.
175    *
176    * @param line the string containing the rule
177    */
178   void parseRule(String line)throws TokeniserException{
179     //ignore comments
180     if(line.startsWith("#")) return;
181 
182     if(line.startsWith("//")) return;
183 
184     StringTokenizer st = new StringTokenizer(line, "()+*|\" \t\f>", true);
185     FSMState newState = new FSMState(this);
186 
187     initialState.put(null, newState);
188     FSMState finalState = parseLHS(newState, st, LHStoRHS);
189     String rhs = "";
190 
191     if(st.hasMoreTokens()) rhs = st.nextToken("\f");
192 
193     if(rhs.length() > 0)finalState.setRhs(rhs);
194   } // parseRule
195 
  /** Parses a part or the entire LHS.
   * Builds a small FSM fragment recognising the parsed expression, hanging
   * off <code>startState</code>, and returns the fragment's final state.
   *
   * @param startState a FSMState object representing the initial state for
   *     the small FSM that will recognise the (part of) the rule parsed by this
   *     method.
   * @param st a {@link java.util.StringTokenizer StringTokenizer} that
   *     provides the input
   * @param until the string that marks the end of the section to be
   *     recognised. This method will first be called by {@link
   *     #parseRule(String)} with &quot; &gt;&quot; in order to parse the entire
   *     LHS. when necessary it will make itself another call to {@link #parseLHS
   *     parseLHS} to parse a region of the LHS (e.g. a
   *     &quot;(&quot;,&quot;)&quot; enclosed part.
   * @return the final state of the FSM fragment built for the parsed section
   * @throws TokeniserException if the rule ends unexpectedly or uses an
   *     unknown Unicode category name
   */
  FSMState parseLHS(FSMState startState, StringTokenizer st, String until)
       throws TokeniserException{

    FSMState currentState = startState;
    //orFound/orList track the alternatives of a "|" chain until it ends
    boolean orFound = false;
    List orList = new LinkedList();
    String token;
    token = skipIgnoreTokens(st);

    if(null == token) return currentState;

    FSMState newState;
    Integer typeId;
    UnicodeType uType;

    //each iteration consumes one atom (group, quoted or bare category name)
    //plus any operator that follows it
    bigwhile: while(!token.equals(until)){
      if(token.equals("(")){//(..)
        //recursively parse the parenthesised sub-expression
        newState = parseLHS(currentState, st,")");
      } else if(token.equals("\"")){//"unicode_type"
        String sType = parseQuotedString(st, "\"");
        newState = new FSMState(this);
        //map the category name to the tokeniser's internal id
        typeId = (Integer)stringTypeIds.get(sType);

        if(null == typeId)
          throw new InvalidRuleException("Invalid type: \"" + sType + "\"");
        else uType = new UnicodeType(typeId.intValue());

        currentState.put(uType ,newState);
      } else {// a type with no quotes
        String sType = token;
        newState = new FSMState(this);
        typeId = (Integer)stringTypeIds.get(sType);

        if(null == typeId)
          throw new InvalidRuleException("Invalid type: \"" + sType + "\"");
        else uType = new UnicodeType(typeId.intValue());

        currentState.put(uType ,newState);
      }
      //treat the operators
      token = skipIgnoreTokens(st);
      if(null == token) throw
        new InvalidRuleException("Tokeniser rule ended too soon!");

      if(token.equals("|")) {

        //remember this alternative's end state and parse the next atom
        orFound = true;
        orList.add(newState);
        token = skipIgnoreTokens(st);
        if(null == token) throw
          new InvalidRuleException("Tokeniser rule ended too soon!");

        continue bigwhile;
      } else if(orFound) {//done parsing the "|"
        //join all the alternatives' ends into a single new state via
        //unrestricted transitions
        orFound = false;
        orList.add(newState);
        newState = new FSMState(this);
        Iterator orListIter = orList.iterator();

        while(orListIter.hasNext())
          ((FSMState)orListIter.next()).put(null, newState);
        orList.clear();
      }

      if(token.equals("+")) {

        //"1 to n": loop back from the atom's end to its start
        newState.put(null,currentState);
        currentState = newState;
        newState = new FSMState(this);
        currentState.put(null,newState);
        token = skipIgnoreTokens(st);

        if(null == token) throw
          new InvalidRuleException("Tokeniser rule ended too soon!");
      } else if(token.equals("*")) {

        //"0 to n": also allow skipping the atom entirely
        currentState.put(null,newState);
        newState.put(null,currentState);
        currentState = newState;
        newState = new FSMState(this);
        currentState.put(null,newState);
        token = skipIgnoreTokens(st);

        if(null == token) throw
          new InvalidRuleException("Tokeniser rule ended too soon!");
      }
      currentState = newState;
    }
    return currentState;
  } // parseLHS
300 
301   /** Parses from the given string tokeniser until it finds a specific
302    * delimiter.
303    * One use for this method is to read everything until the first quote.
304    *
305    * @param st a {@link java.util.StringTokenizer StringTokenizer} that
306    *     provides the input
307    * @param until a String representing the end delimiter.
308    */
309   String parseQuotedString(StringTokenizer st, String until)
310     throws TokeniserException {
311 
312     String token;
313 
314     if(st.hasMoreElements()) token = st.nextToken();
315     else return null;
316 
317     ///String type = "";
318     StringBuffer type = new StringBuffer(Gate.STRINGBUFFER_SIZE);
319 
320     while(!token.equals(until)){
321       //type += token;
322       type.append(token);
323       if(st.hasMoreElements())token = st.nextToken();
324       else throw new InvalidRuleException("Tokeniser rule ended too soon!");
325     }
326     return type.toString();
327   } // parseQuotedString
328 
329   /** Skips the ignorable tokens from the input returning the first significant
330    * token.
331    * The ignorable tokens are defined by {@link #ignoreTokens a set}
332    */
333   protected static String skipIgnoreTokens(StringTokenizer st){
334     Iterator ignorables;
335     boolean ignorableFound = false;
336     String currentToken;
337 
338     while(true){
339       if(st.hasMoreTokens()){
340         currentToken = st.nextToken();
341         ignorables = ignoreTokens.iterator();
342         ignorableFound = false;
343 
344         while(!ignorableFound && ignorables.hasNext()){
345           if(currentToken.equals((String)ignorables.next()))
346             ignorableFound = true;
347         }
348 
349         if(!ignorableFound) return currentToken;
350       } else return null;
351     }
352   }//skipIgnoreTokens
353 
  /** Computes the lambda-closure (aka epsilon closure) of the given set of
   * states, that is the set of states that are accessible from any of the
   * states in the given set using only unrestricted transitions (transitions
   * whose restriction is <code>null</code>).
   * Used by {@link #eliminateVoidTransitions()} while converting the
   * non-deterministic machine into a deterministic one.
   *
   * @param s the set of states to close over
   * @return a set containing all the states accessible from this set via
   * transitions that bear no restrictions.
   */
  private AbstractSet lambdaClosure(Set s){

    //the stack/queue used by the algorithm
    LinkedList list = new LinkedList(s);

    //the set to be returned
    AbstractSet lambdaClosure = new HashSet(s);

    FSMState top;
    FSMState currentState;
    Set nextStates;
    Iterator statesIter;

    while(!list.isEmpty()) {
      top = (FSMState)list.removeFirst();
      //nextSet(null) yields the targets of the unrestricted transitions
      nextStates = top.nextSet(null);

      if(null != nextStates){
        statesIter = nextStates.iterator();

        while(statesIter.hasNext()) {
          currentState = (FSMState)statesIter.next();
          if(!lambdaClosure.contains(currentState)){
            //newly discovered state: keep it and explore it in turn
            lambdaClosure.add(currentState);
            list.addFirst(currentState);
          }//if(!lambdaClosure.contains(currentState))
        }//while(statesIter.hasNext())

      }//if(null != nextStates)
    }
    return lambdaClosure;
  } // lambdaClosure
397 
398   /** Converts the FSM from a non-deterministic to a deterministic one by
399    * eliminating all the unrestricted transitions.
400    */
401   void eliminateVoidTransitions() throws TokeniserException {
402 
403     //kalina:clear() faster than init() which is called with init()
404     newStates.clear();
405     Set sdStates = new HashSet();
406     LinkedList unmarkedDStates = new LinkedList();
407     DFSMState dCurrentState = new DFSMState(this);
408     Set sdCurrentState = new HashSet();
409 
410     sdCurrentState.add(initialState);
411     sdCurrentState = lambdaClosure(sdCurrentState);
412     newStates.put(sdCurrentState, dCurrentState);
413     sdStates.add(sdCurrentState);
414 
415     //find out if the new state is a final one
416     Iterator innerStatesIter = sdCurrentState.iterator();
417     String rhs;
418     FSMState currentInnerState;
419     Set rhsClashSet = new HashSet();
420     boolean newRhs = false;
421 
422     while(innerStatesIter.hasNext()){
423       currentInnerState = (FSMState)innerStatesIter.next();
424       if(currentInnerState.isFinal()){
425         rhs = currentInnerState.getRhs();
426         rhsClashSet.add(rhs);
427         dCurrentState.rhs = rhs;
428         newRhs = true;
429       }
430     }
431 
432     if(rhsClashSet.size() > 1){
433       Err.println("Warning, rule clash: " +  rhsClashSet +
434                          "\nSelected last definition: " + dCurrentState.rhs);
435     }
436 
437     if(newRhs)dCurrentState.buildTokenDesc();
438     rhsClashSet.clear();
439     unmarkedDStates.addFirst(sdCurrentState);
440     dInitialState = dCurrentState;
441     Set nextSet;
442 
443     while(!unmarkedDStates.isEmpty()){
444       //Out.println("\n\n=====================" + unmarkedDStates.size());
445       sdCurrentState = (Set)unmarkedDStates.removeFirst();
446       for(int type = 0; type < maxTypeId; type++){
447       //Out.print(type);
448         nextSet = new HashSet();
449         innerStatesIter = sdCurrentState.iterator();
450 
451         while(innerStatesIter.hasNext()){
452           currentInnerState = (FSMState)innerStatesIter.next();
453           Set tempSet = currentInnerState.nextSet(type);
454           if(null != tempSet) nextSet.addAll(tempSet);
455         }//while(innerStatesIter.hasNext())
456 
457         if(!nextSet.isEmpty()){
458           nextSet = lambdaClosure(nextSet);
459           dCurrentState = (DFSMState)newStates.get(nextSet);
460 
461           if(dCurrentState == null){
462 
463             //we have a new DFSMState
464             dCurrentState = new DFSMState(this);
465             sdStates.add(nextSet);
466             unmarkedDStates.add(nextSet);
467 
468             //check to see whether the new state is a final one
469             innerStatesIter = nextSet.iterator();
470             newRhs =false;
471 
472             while(innerStatesIter.hasNext()){
473               currentInnerState = (FSMState)innerStatesIter.next();
474               if(currentInnerState.isFinal()){
475                 rhs = currentInnerState.getRhs();
476                 rhsClashSet.add(rhs);
477                 dCurrentState.rhs = rhs;
478                 newRhs = true;
479               }
480             }
481 
482             if(rhsClashSet.size() > 1){
483               Err.println("Warning, rule clash: " +  rhsClashSet +
484                             "\nSelected last definition: " + dCurrentState.rhs);
485             }
486 
487             if(newRhs)dCurrentState.buildTokenDesc();
488             rhsClashSet.clear();
489             newStates.put(nextSet, dCurrentState);
490           }
491           ((DFSMState)newStates.get(sdCurrentState)).put(type,dCurrentState);
492         } // if(!nextSet.isEmpty())
493 
494       } // for(byte type = 0; type < 256; type++)
495 
496     } // while(!unmarkedDStates.isEmpty())
497 
498   } // eliminateVoidTransitions
499 
500   /** Returns a string representation of the non-deterministic FSM graph using
501    * GML (Graph modelling language).
502    */
503   public String getFSMgml(){
504     String res = "graph[ \ndirected 1\n";
505     ///String nodes = "", edges = "";
506     StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE),
507                  edges = new StringBuffer(Gate.STRINGBUFFER_SIZE);
508 
509     Iterator fsmStatesIter = fsmStates.iterator();
510     while (fsmStatesIter.hasNext()){
511       FSMState currentState = (FSMState)fsmStatesIter.next();
512       int stateIndex = currentState.getIndex();
513       /*nodes += "node[ id " + stateIndex +
514                " label \"" + stateIndex;
515         */
516         nodes.append("node[ id ");
517         nodes.append(stateIndex);
518         nodes.append(" label \"");
519         nodes.append(stateIndex);
520 
521              if(currentState.isFinal()){
522               ///nodes += ",F\\n" + currentState.getRhs();
523               nodes.append(",F\\n" + currentState.getRhs());
524              }
525              ///nodes +=  "\"  ]\n";
526              nodes.append("\"  ]\n");
527       ///edges += currentState.getEdgesGML();
528       edges.append(currentState.getEdgesGML());
529     }
530     res += nodes.toString() + edges.toString() + "]\n";
531     return res;
532   } // getFSMgml
533 
534   /** Returns a string representation of the deterministic FSM graph using
535    * GML.
536    */
537   public String getDFSMgml() {
538     String res = "graph[ \ndirected 1\n";
539     ///String nodes = "", edges = "";
540     StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE),
541                  edges = new StringBuffer(Gate.STRINGBUFFER_SIZE);
542 
543     Iterator dfsmStatesIter = dfsmStates.iterator();
544     while (dfsmStatesIter.hasNext()) {
545       DFSMState currentState = (DFSMState)dfsmStatesIter.next();
546       int stateIndex = currentState.getIndex();
547 /*      nodes += "node[ id " + stateIndex +
548                " label \"" + stateIndex;
549 */
550         nodes.append("node[ id ");
551         nodes.append(stateIndex);
552         nodes.append(" label \"");
553         nodes.append(stateIndex);
554 
555              if(currentState.isFinal()){
556 ///              nodes += ",F\\n" + currentState.getRhs();
557               nodes.append(",F\\n" + currentState.getRhs());
558              }
559 ///             nodes +=  "\"  ]\n";
560              nodes.append("\"  ]\n");
561 ///      edges += currentState.getEdgesGML();
562         edges.append(currentState.getEdgesGML());
563     }
564     res += nodes.toString() + edges.toString() + "]\n";
565     return res;
566   } // getDFSMgml
567 
  /** Returns the feature map associated with this resource. */
  public FeatureMap getFeatures(){
    return features;
  } // getFeatures

  /** Sets the feature map associated with this resource. */
  public void setFeatures(FeatureMap features){
    this.features = features;
  } // setFeatures
578 
579   /**
580    * The method that does the actual tokenisation.
581    */
582   public void execute() throws ExecutionException {
583     interrupted = false;
584     AnnotationSet annotationSet;
585     //check the input
586     if(document == null) {
587       throw new ExecutionException(
588         "No document to tokenise!"
589       );
590     }
591 
592     if(annotationSetName == null ||
593        annotationSetName.equals("")) annotationSet = document.getAnnotations();
594     else annotationSet = document.getAnnotations(annotationSetName);
595 
596     fireStatusChanged(
597         "Tokenising " + document.getName() + "...");
598 
599     String content = document.getContent().toString();
600     int length = content.length();
601     char currentChar;
602 
603     DFSMState graphPosition = dInitialState;
604 
605     //the index of the first character of the token trying to be recognised
606     int tokenStart = 0;
607 
608     //the index of the last character of the last token recognised
609     int lastMatch = -1;
610 
611     DFSMState lastMatchingState = null;
612     DFSMState nextState;
613     String tokenString;
614     int charIdx = 0;
615     int oldCharIdx = 0;
616     FeatureMap newTokenFm;
617 
618     while(charIdx < length){
619       currentChar = content.charAt(charIdx);
620 //      Out.println(
621 //      currentChar + typesMnemonics[Character.getType(currentChar)+128]);
622       nextState = graphPosition.next(((Integer)typeIds.get(
623                   new Integer(Character.getType(currentChar)))).intValue());
624 
625       if( null != nextState ) {
626         graphPosition = nextState;
627         if(graphPosition.isFinal()) {
628           lastMatch = charIdx;
629           lastMatchingState = graphPosition;
630         }
631         charIdx ++;
632       } else {//we have a match!
633         newTokenFm = Factory.newFeatureMap();
634 
635         if (null == lastMatchingState) {
636           tokenString = content.substring(tokenStart, tokenStart +1);
637           newTokenFm.put("type","UNKNOWN");
638           newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
639           newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
640                          Integer.toString(tokenString.length()));
641 
642           try {
643             annotationSet.add(new Long(tokenStart),
644                               new Long(tokenStart + 1),
645                               "DEFAULT_TOKEN", newTokenFm);
646           } catch (InvalidOffsetException ioe) {
647             //This REALLY shouldn't happen!
648             ioe.printStackTrace(Err.getPrintWriter());
649           }
650           // Out.println("Default token: " + tokenStart +
651           //             "->" + tokenStart + " :" + tokenString + ";");
652           charIdx  = tokenStart + 1;
653         } else {
654           tokenString = content.substring(tokenStart, lastMatch + 1);
655           newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
656           newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
657                          Integer.toString(tokenString.length()));
658 
659           for(int i = 1; i < lastMatchingState.getTokenDesc().length; i++){
660             newTokenFm.put(lastMatchingState.getTokenDesc()[i][0],
661                            lastMatchingState.getTokenDesc()[i][1]);
662           //Out.println(lastMatchingState.getTokenDesc()[i][0] + "=" +
663           //                       lastMatchingState.getTokenDesc()[i][1]);
664           }
665 
666 
667           try {
668             annotationSet.add(new Long(tokenStart),
669                             new Long(lastMatch + 1),
670                             lastMatchingState.getTokenDesc()[0][0], newTokenFm);
671           } catch(InvalidOffsetException ioe) {
672             //This REALLY shouldn't happen!
673             throw new GateRuntimeException(ioe.toString());
674           }
675 
676           // Out.println(lastMatchingState.getTokenDesc()[0][0] +
677           //              ": " + tokenStart + "->" + lastMatch +
678           //              " :" + tokenString + ";");
679           charIdx = lastMatch + 1;
680         }
681 
682         lastMatchingState = null;
683         graphPosition = dInitialState;
684         tokenStart = charIdx;
685       }
686 
687       if((charIdx - oldCharIdx > 256)){
688         fireProgressChanged((100 * charIdx )/ length );
689         oldCharIdx = charIdx;
690         if(isInterrupted()) throw new ExecutionInterruptedException();
691       }
692 
693     } // while(charIdx < length)
694 
695     if (null != lastMatchingState) {
696       tokenString = content.substring(tokenStart, lastMatch + 1);
697       newTokenFm = Factory.newFeatureMap();
698       newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
699       newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
700                      Integer.toString(tokenString.length()));
701 
702       for(int i = 1; i < lastMatchingState.getTokenDesc().length; i++){
703         newTokenFm.put(lastMatchingState.getTokenDesc()[i][0],
704                        lastMatchingState.getTokenDesc()[i][1]);
705       }
706 
707 
708       try {
709         annotationSet.add(new Long(tokenStart),
710                           new Long(lastMatch + 1),
711                           lastMatchingState.getTokenDesc()[0][0], newTokenFm);
712       } catch(InvalidOffsetException ioe) {
713         //This REALLY shouldn't happen!
714         throw new GateRuntimeException(ioe.toString());
715       }
716 
717     }
718 
719     reset();
720     fireProcessFinished();
721     fireStatusChanged("Tokenisation complete!");
722   } // run
723 
724   /**
725    * Sets the value of the <code>rulesURL</code> property which holds an URL
726    * to the file containing the rules for this tokeniser.
727    * @param newRulesURL
728    */
729   public void setRulesURL(java.net.URL newRulesURL) {
730     rulesURL = newRulesURL;
731   }
732   /**
733    * Gets the value of the <code>rulesURL</code> property hich holds an
734    * URL to the file containing the rules for this tokeniser.
735    */
736   public java.net.URL getRulesURL() {
737     return rulesURL;
738   }
739   /**    */
740   public void setAnnotationSetName(String newAnnotationSetName) {
741     annotationSetName = newAnnotationSetName;
742   }
743   /**    */
744   public String getAnnotationSetName() {
745     return annotationSetName;
746   }
747   public void setRulesResourceName(String newRulesResourceName) {
748     rulesResourceName = newRulesResourceName;
749   }
750   public String getRulesResourceName() {
751     return rulesResourceName;
752   }
753   public void setEncoding(String newEncoding) {
754     encoding = newEncoding;
755   }
756   public String getEncoding() {
757     return encoding;
758   }
759 
760   /**    */
761   protected FeatureMap features  = null;
762 
763   /** the annotations et where the new annotations will be adde
764    */
765   protected String annotationSetName;
766 
767   /** The initial state of the non deterministic machin
768    */
769   protected FSMState initialState;
770 
771   /** A set containng all the states of the non deterministic machin
772    */
773   protected Set fsmStates = new HashSet();
774 
775   /** The initial state of the deterministic machin
776    */
777   protected DFSMState dInitialState;
778 
779   /** A set containng all the states of the deterministic machin
780    */
781   protected Set dfsmStates = new HashSet();
782 
783   /** The separator from LHS to RH
784    */
785   static String LHStoRHS = ">";
786 
787   /** A set of string representing tokens to be ignored (e.g. blanks
788    */
789   static Set ignoreTokens;
790 
791   /** maps from int (the static value on {@link java.lang.Character} to int
792    * the internal value used by the tokeniser. The ins values used by the
793    * tokeniser are consecutive values, starting from 0 and going as high as
794    * necessary.
795    * They map all the public static int members on{@link java.lang.Character}
796    */
797   public static Map typeIds;
798 
799   /** The maximum int value used internally as a type i
800    */
801   public static int maxTypeId;
802 
803   /** Maps the internal type ids to the type name
804    */
805   public static String[] typeMnemonics;
806 
807   /** Maps from type names to type internal id
808    */
809   public static Map stringTypeIds;
810 
811   /**
812    * This property holds an URL to the file containing the rules for this tokeniser
813    *
814    */
815 
816   /**    */
817   static protected String defaultResourceName =
818                             "creole/tokeniser/DefaultTokeniser.rules";
819 
820   private String rulesResourceName;
821   private java.net.URL rulesURL;
822   private String encoding;
823   private transient Vector progressListeners;
824   //kalina: added this as method to minimise too many init() calls
825   protected transient Map newStates = new HashMap();
826 
827 
  /** The static initialiser will inspect the class {@link java.lang.Character}
    * using reflection to find all the public static members and will map them
    * to ids starting from 0.
    * After that it will build all the static data: {@link #typeIds}, {@link
    * #maxTypeId}, {@link #typeMnemonics}, {@link #stringTypeIds}
    */
  static{
    Field[] characterClassFields;

    try{
      characterClassFields = Class.forName("java.lang.Character").getFields();
    }catch(ClassNotFoundException cnfe){
      throw new LuckyException("Could not find the java.lang.Character class!");
    }

    Collection staticFields = new LinkedList();
    // JDK 1.4 introduced directionality constants that have the same values as
    //character types; we need to skip those as well
    for(int i = 0; i< characterClassFields.length; i++)
      if(Modifier.isStatic(characterClassFields[i].getModifiers()) &&
         characterClassFields[i].getName().indexOf("DIRECTIONALITY") == -1)
        staticFields.add(characterClassFields[i]);

    typeIds = new HashMap();
    maxTypeId = staticFields.size() -1;
    typeMnemonics = new String[maxTypeId + 1];
    stringTypeIds = new HashMap();

    Iterator staticFieldsIter = staticFields.iterator();
    Field currentField;
    int currentId = 0;
    String fieldName;

    try {
      while(staticFieldsIter.hasNext()){
        currentField = (Field)staticFieldsIter.next();
        //only the character-category constants are declared as byte
        if(currentField.getType().toString().equals("byte")){
          fieldName = currentField.getName();
          typeIds.put(new Integer(currentField.getInt(null)),
                                    new Integer(currentId));
          typeMnemonics[currentId] = fieldName;
          stringTypeIds.put(fieldName, new Integer(currentId));
          currentId++;
        }
      }
    } catch(Exception e) {
      //NOTE(review): this discards the original stack trace; consider
      //chaining the cause if LuckyException supports it
      throw new LuckyException(e.toString());
    }

    //the tokens skipIgnoreTokens() discards between significant tokens
    ignoreTokens = new HashSet();
    ignoreTokens.add(" ");
    ignoreTokens.add("\t");
    ignoreTokens.add("\f");
  }
882 
} // class SimpleTokeniser