|
SimpleTokeniser |
|
1 /* 2 * DefaultTokeniser.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Valentin Tablan, 2000 12 * 13 * $Id: SimpleTokeniser.java,v 1.14 2003/03/10 13:10:54 valyt Exp $ 14 */ 15 16 package gate.creole.tokeniser; 17 18 import java.util.*; 19 import java.io.*; 20 import java.net.*; 21 import java.lang.reflect.*; 22 23 import gate.*; 24 import gate.creole.*; 25 import gate.event.*; 26 import gate.util.*; 27 28 //import EDU.auburn.VGJ.graph.ParseError; 29 30 /** Implementation of a Unicode rule based tokeniser. 31 * The tokeniser gets its rules from a file an {@link java.io.InputStream 32 * InputStream} or a {@link java.io.Reader Reader} which should be sent to one 33 * of the constructors. 34 * The implementations is based on a finite state machine that is built based 35 * on the set of rules. 36 * A rule has two sides, the left hand side (LHS)and the right hand side (RHS) 37 * that are separated by the ">" character. The LHS represents a 38 * regular expression that will be matched against the input while the RHS 39 * describes a Gate2 annotation in terms of annotation type and attribute-value 40 * pairs. 41 * The matching is done using Unicode enumarated types as defined by the {@link 42 * java.lang.Character Character} class. 
 * At the time of writing this class the
 * supported Unicode categories were:
 * <ul>
 * <li>UNASSIGNED
 * <li>UPPERCASE_LETTER
 * <li>LOWERCASE_LETTER
 * <li>TITLECASE_LETTER
 * <li>MODIFIER_LETTER
 * <li>OTHER_LETTER
 * <li>NON_SPACING_MARK
 * <li>ENCLOSING_MARK
 * <li>COMBINING_SPACING_MARK
 * <li>DECIMAL_DIGIT_NUMBER
 * <li>LETTER_NUMBER
 * <li>OTHER_NUMBER
 * <li>SPACE_SEPARATOR
 * <li>LINE_SEPARATOR
 * <li>PARAGRAPH_SEPARATOR
 * <li>CONTROL
 * <li>FORMAT
 * <li>PRIVATE_USE
 * <li>SURROGATE
 * <li>DASH_PUNCTUATION
 * <li>START_PUNCTUATION
 * <li>END_PUNCTUATION
 * <li>CONNECTOR_PUNCTUATION
 * <li>OTHER_PUNCTUATION
 * <li>MATH_SYMBOL
 * <li>CURRENCY_SYMBOL
 * <li>MODIFIER_SYMBOL
 * <li>OTHER_SYMBOL
 * </ul>
 * The accepted operators for the LHS are "+", "*" and "|" having the usual
 * interpretations of "1 to n occurrences", "0 to n occurrences" and
 * "boolean OR".
 * For instance this is a valid LHS:
 * <br>"UPPERCASE_LETTER" "LOWERCASE_LETTER"+
 * <br>meaning an uppercase letter followed by one or more lowercase letters.
 *
 * The RHS describes an annotation that is to be created and inserted in the
 * annotation set provided in case of a match. The new annotation will span the
 * text that has been recognised. The RHS consists of the annotation type
 * followed by pairs of attributes and associated values.
 * E.g. for the LHS above a possible RHS can be:<br>
 * Token;kind=upperInitial;<br>
 * representing an annotation of type "Token" having one attribute
 * named "kind" with the value "upperInitial"<br>
 * The entire rule will be:<br>
 * <pre>"UPPERCASE_LETTER" "LOWERCASE_LETTER"+ > Token;kind=upperInitial;</pre>
 * <br>
 * The tokeniser ignores all the empty lines or the ones that start with # or
 * //.
94 * 95 */ 96 public class SimpleTokeniser extends AbstractLanguageAnalyser{ 97 public static final String 98 SIMP_TOK_DOCUMENT_PARAMETER_NAME = "document"; 99 100 public static final String 101 SIMP_TOK_ANNOT_SET_PARAMETER_NAME = "annotationSetName"; 102 103 public static final String 104 SIMP_TOK_RULES_URL_PARAMETER_NAME = "rulesURL"; 105 106 public static final String 107 SIMP_TOK_ENCODING_PARAMETER_NAME = "encoding"; 108 109 /** Debug flag 110 */ 111 private static final boolean DEBUG = false; 112 113 /** 114 * Creates a tokeniser 115 */ 116 public SimpleTokeniser(){ 117 } 118 119 /** 120 * Initialises this tokeniser by reading the rules from an external source (provided through an URL) and building 121 * the finite state machine at the core of the tokeniser. 122 * 123 * @exception ResourceInstantiationException 124 */ 125 public Resource init() throws ResourceInstantiationException{ 126 Reader rulesReader; 127 try{ 128 if(rulesURL != null){ 129 rulesReader = new InputStreamReader(rulesURL.openStream(), encoding); 130 }else{ 131 //no init data, Scream! 
132 throw new ResourceInstantiationException( 133 "No URL provided for the rules!"); 134 } 135 initialState = new FSMState(this); 136 BufferedReader bRulesReader = new BufferedReader(rulesReader); 137 String line = bRulesReader.readLine(); 138 ///String toParse = ""; 139 StringBuffer toParse = new StringBuffer(Gate.STRINGBUFFER_SIZE); 140 141 while (line != null){ 142 if(line.endsWith("\\")){ 143 ///toParse += line.substring(0,line.length()-1); 144 toParse.append(line.substring(0,line.length()-1)); 145 }else{ 146 /*toParse += line; 147 parseRule(toParse); 148 toParse = ""; 149 */ 150 toParse.append(line); 151 parseRule(toParse.toString()); 152 toParse.delete(0,toParse.length()); 153 } 154 line = bRulesReader.readLine(); 155 } 156 eliminateVoidTransitions(); 157 }catch(java.io.IOException ioe){ 158 throw new ResourceInstantiationException(ioe); 159 }catch(TokeniserException te){ 160 throw new ResourceInstantiationException(te); 161 } 162 return this; 163 } 164 165 /** 166 * Prepares this Processing resource for a new run. 167 */ 168 public void reset(){ 169 document = null; 170 } 171 172 /** Parses one input line containing a tokeniser rule. 173 * This will create the necessary FSMState objects and the links 174 * between them. 175 * 176 * @param line the string containing the rule 177 */ 178 void parseRule(String line)throws TokeniserException{ 179 //ignore comments 180 if(line.startsWith("#")) return; 181 182 if(line.startsWith("//")) return; 183 184 StringTokenizer st = new StringTokenizer(line, "()+*|\" \t\f>", true); 185 FSMState newState = new FSMState(this); 186 187 initialState.put(null, newState); 188 FSMState finalState = parseLHS(newState, st, LHStoRHS); 189 String rhs = ""; 190 191 if(st.hasMoreTokens()) rhs = st.nextToken("\f"); 192 193 if(rhs.length() > 0)finalState.setRhs(rhs); 194 } // parseRule 195 196 /** Parses a part or the entire LHS. 
197 * 198 * @param startState a FSMState object representing the initial state for 199 * the small FSM that will recognise the (part of) the rule parsed by this 200 * method. 201 * @param st a {@link java.util.StringTokenizer StringTokenizer} that 202 * provides the input 203 * @param until the string that marks the end of the section to be 204 * recognised. This method will first be called by {@link 205 * #parseRule(String)} with " >" in order to parse the entire 206 * LHS. when necessary it will make itself another call to {@link #parseLHS 207 * parseLHS} to parse a region of the LHS (e.g. a 208 * "(",")" enclosed part. 209 */ 210 FSMState parseLHS(FSMState startState, StringTokenizer st, String until) 211 throws TokeniserException{ 212 213 FSMState currentState = startState; 214 boolean orFound = false; 215 List orList = new LinkedList(); 216 String token; 217 token = skipIgnoreTokens(st); 218 219 if(null == token) return currentState; 220 221 FSMState newState; 222 Integer typeId; 223 UnicodeType uType; 224 225 bigwhile: while(!token.equals(until)){ 226 if(token.equals("(")){//(..) 
227 newState = parseLHS(currentState, st,")"); 228 } else if(token.equals("\"")){//"unicode_type" 229 String sType = parseQuotedString(st, "\""); 230 newState = new FSMState(this); 231 typeId = (Integer)stringTypeIds.get(sType); 232 233 if(null == typeId) 234 throw new InvalidRuleException("Invalid type: \"" + sType + "\""); 235 else uType = new UnicodeType(typeId.intValue()); 236 237 currentState.put(uType ,newState); 238 } else {// a type with no quotes 239 String sType = token; 240 newState = new FSMState(this); 241 typeId = (Integer)stringTypeIds.get(sType); 242 243 if(null == typeId) 244 throw new InvalidRuleException("Invalid type: \"" + sType + "\""); 245 else uType = new UnicodeType(typeId.intValue()); 246 247 currentState.put(uType ,newState); 248 } 249 //treat the operators 250 token = skipIgnoreTokens(st); 251 if(null == token) throw 252 new InvalidRuleException("Tokeniser rule ended too soon!"); 253 254 if(token.equals("|")) { 255 256 orFound = true; 257 orList.add(newState); 258 token = skipIgnoreTokens(st); 259 if(null == token) throw 260 new InvalidRuleException("Tokeniser rule ended too soon!"); 261 262 continue bigwhile; 263 } else if(orFound) {//done parsing the "|" 264 orFound = false; 265 orList.add(newState); 266 newState = new FSMState(this); 267 Iterator orListIter = orList.iterator(); 268 269 while(orListIter.hasNext()) 270 ((FSMState)orListIter.next()).put(null, newState); 271 orList.clear(); 272 } 273 274 if(token.equals("+")) { 275 276 newState.put(null,currentState); 277 currentState = newState; 278 newState = new FSMState(this); 279 currentState.put(null,newState); 280 token = skipIgnoreTokens(st); 281 282 if(null == token) throw 283 new InvalidRuleException("Tokeniser rule ended too soon!"); 284 } else if(token.equals("*")) { 285 286 currentState.put(null,newState); 287 newState.put(null,currentState); 288 currentState = newState; 289 newState = new FSMState(this); 290 currentState.put(null,newState); 291 token = skipIgnoreTokens(st); 
292 293 if(null == token) throw 294 new InvalidRuleException("Tokeniser rule ended too soon!"); 295 } 296 currentState = newState; 297 } 298 return currentState; 299 } // parseLHS 300 301 /** Parses from the given string tokeniser until it finds a specific 302 * delimiter. 303 * One use for this method is to read everything until the first quote. 304 * 305 * @param st a {@link java.util.StringTokenizer StringTokenizer} that 306 * provides the input 307 * @param until a String representing the end delimiter. 308 */ 309 String parseQuotedString(StringTokenizer st, String until) 310 throws TokeniserException { 311 312 String token; 313 314 if(st.hasMoreElements()) token = st.nextToken(); 315 else return null; 316 317 ///String type = ""; 318 StringBuffer type = new StringBuffer(Gate.STRINGBUFFER_SIZE); 319 320 while(!token.equals(until)){ 321 //type += token; 322 type.append(token); 323 if(st.hasMoreElements())token = st.nextToken(); 324 else throw new InvalidRuleException("Tokeniser rule ended too soon!"); 325 } 326 return type.toString(); 327 } // parseQuotedString 328 329 /** Skips the ignorable tokens from the input returning the first significant 330 * token. 
331 * The ignorable tokens are defined by {@link #ignoreTokens a set} 332 */ 333 protected static String skipIgnoreTokens(StringTokenizer st){ 334 Iterator ignorables; 335 boolean ignorableFound = false; 336 String currentToken; 337 338 while(true){ 339 if(st.hasMoreTokens()){ 340 currentToken = st.nextToken(); 341 ignorables = ignoreTokens.iterator(); 342 ignorableFound = false; 343 344 while(!ignorableFound && ignorables.hasNext()){ 345 if(currentToken.equals((String)ignorables.next())) 346 ignorableFound = true; 347 } 348 349 if(!ignorableFound) return currentToken; 350 } else return null; 351 } 352 }//skipIgnoreTokens 353 354 /* Computes the lambda-closure (aka epsilon closure) of the given set of 355 * states, that is the set of states that are accessible from any of the 356 * states in the given set using only unrestricted transitions. 357 * @return a set containing all the states accessible from this state via 358 * transitions that bear no restrictions. 359 */ 360 /** 361 * Converts the finite state machine to a deterministic one. 
362 * 363 * @param s 364 */ 365 private AbstractSet lambdaClosure(Set s){ 366 367 //the stack/queue used by the algorithm 368 LinkedList list = new LinkedList(s); 369 370 //the set to be returned 371 AbstractSet lambdaClosure = new HashSet(s); 372 373 FSMState top; 374 FSMState currentState; 375 Set nextStates; 376 Iterator statesIter; 377 378 while(!list.isEmpty()) { 379 top = (FSMState)list.removeFirst(); 380 nextStates = top.nextSet(null); 381 382 if(null != nextStates){ 383 statesIter = nextStates.iterator(); 384 385 while(statesIter.hasNext()) { 386 currentState = (FSMState)statesIter.next(); 387 if(!lambdaClosure.contains(currentState)){ 388 lambdaClosure.add(currentState); 389 list.addFirst(currentState); 390 }//if(!lambdaClosure.contains(currentState)) 391 }//while(statesIter.hasNext()) 392 393 }//if(null != nextStates) 394 } 395 return lambdaClosure; 396 } // lambdaClosure 397 398 /** Converts the FSM from a non-deterministic to a deterministic one by 399 * eliminating all the unrestricted transitions. 
400 */ 401 void eliminateVoidTransitions() throws TokeniserException { 402 403 //kalina:clear() faster than init() which is called with init() 404 newStates.clear(); 405 Set sdStates = new HashSet(); 406 LinkedList unmarkedDStates = new LinkedList(); 407 DFSMState dCurrentState = new DFSMState(this); 408 Set sdCurrentState = new HashSet(); 409 410 sdCurrentState.add(initialState); 411 sdCurrentState = lambdaClosure(sdCurrentState); 412 newStates.put(sdCurrentState, dCurrentState); 413 sdStates.add(sdCurrentState); 414 415 //find out if the new state is a final one 416 Iterator innerStatesIter = sdCurrentState.iterator(); 417 String rhs; 418 FSMState currentInnerState; 419 Set rhsClashSet = new HashSet(); 420 boolean newRhs = false; 421 422 while(innerStatesIter.hasNext()){ 423 currentInnerState = (FSMState)innerStatesIter.next(); 424 if(currentInnerState.isFinal()){ 425 rhs = currentInnerState.getRhs(); 426 rhsClashSet.add(rhs); 427 dCurrentState.rhs = rhs; 428 newRhs = true; 429 } 430 } 431 432 if(rhsClashSet.size() > 1){ 433 Err.println("Warning, rule clash: " + rhsClashSet + 434 "\nSelected last definition: " + dCurrentState.rhs); 435 } 436 437 if(newRhs)dCurrentState.buildTokenDesc(); 438 rhsClashSet.clear(); 439 unmarkedDStates.addFirst(sdCurrentState); 440 dInitialState = dCurrentState; 441 Set nextSet; 442 443 while(!unmarkedDStates.isEmpty()){ 444 //Out.println("\n\n=====================" + unmarkedDStates.size()); 445 sdCurrentState = (Set)unmarkedDStates.removeFirst(); 446 for(int type = 0; type < maxTypeId; type++){ 447 //Out.print(type); 448 nextSet = new HashSet(); 449 innerStatesIter = sdCurrentState.iterator(); 450 451 while(innerStatesIter.hasNext()){ 452 currentInnerState = (FSMState)innerStatesIter.next(); 453 Set tempSet = currentInnerState.nextSet(type); 454 if(null != tempSet) nextSet.addAll(tempSet); 455 }//while(innerStatesIter.hasNext()) 456 457 if(!nextSet.isEmpty()){ 458 nextSet = lambdaClosure(nextSet); 459 dCurrentState = 
(DFSMState)newStates.get(nextSet); 460 461 if(dCurrentState == null){ 462 463 //we have a new DFSMState 464 dCurrentState = new DFSMState(this); 465 sdStates.add(nextSet); 466 unmarkedDStates.add(nextSet); 467 468 //check to see whether the new state is a final one 469 innerStatesIter = nextSet.iterator(); 470 newRhs =false; 471 472 while(innerStatesIter.hasNext()){ 473 currentInnerState = (FSMState)innerStatesIter.next(); 474 if(currentInnerState.isFinal()){ 475 rhs = currentInnerState.getRhs(); 476 rhsClashSet.add(rhs); 477 dCurrentState.rhs = rhs; 478 newRhs = true; 479 } 480 } 481 482 if(rhsClashSet.size() > 1){ 483 Err.println("Warning, rule clash: " + rhsClashSet + 484 "\nSelected last definition: " + dCurrentState.rhs); 485 } 486 487 if(newRhs)dCurrentState.buildTokenDesc(); 488 rhsClashSet.clear(); 489 newStates.put(nextSet, dCurrentState); 490 } 491 ((DFSMState)newStates.get(sdCurrentState)).put(type,dCurrentState); 492 } // if(!nextSet.isEmpty()) 493 494 } // for(byte type = 0; type < 256; type++) 495 496 } // while(!unmarkedDStates.isEmpty()) 497 498 } // eliminateVoidTransitions 499 500 /** Returns a string representation of the non-deterministic FSM graph using 501 * GML (Graph modelling language). 
502 */ 503 public String getFSMgml(){ 504 String res = "graph[ \ndirected 1\n"; 505 ///String nodes = "", edges = ""; 506 StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE), 507 edges = new StringBuffer(Gate.STRINGBUFFER_SIZE); 508 509 Iterator fsmStatesIter = fsmStates.iterator(); 510 while (fsmStatesIter.hasNext()){ 511 FSMState currentState = (FSMState)fsmStatesIter.next(); 512 int stateIndex = currentState.getIndex(); 513 /*nodes += "node[ id " + stateIndex + 514 " label \"" + stateIndex; 515 */ 516 nodes.append("node[ id "); 517 nodes.append(stateIndex); 518 nodes.append(" label \""); 519 nodes.append(stateIndex); 520 521 if(currentState.isFinal()){ 522 ///nodes += ",F\\n" + currentState.getRhs(); 523 nodes.append(",F\\n" + currentState.getRhs()); 524 } 525 ///nodes += "\" ]\n"; 526 nodes.append("\" ]\n"); 527 ///edges += currentState.getEdgesGML(); 528 edges.append(currentState.getEdgesGML()); 529 } 530 res += nodes.toString() + edges.toString() + "]\n"; 531 return res; 532 } // getFSMgml 533 534 /** Returns a string representation of the deterministic FSM graph using 535 * GML. 
536 */ 537 public String getDFSMgml() { 538 String res = "graph[ \ndirected 1\n"; 539 ///String nodes = "", edges = ""; 540 StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE), 541 edges = new StringBuffer(Gate.STRINGBUFFER_SIZE); 542 543 Iterator dfsmStatesIter = dfsmStates.iterator(); 544 while (dfsmStatesIter.hasNext()) { 545 DFSMState currentState = (DFSMState)dfsmStatesIter.next(); 546 int stateIndex = currentState.getIndex(); 547 /* nodes += "node[ id " + stateIndex + 548 " label \"" + stateIndex; 549 */ 550 nodes.append("node[ id "); 551 nodes.append(stateIndex); 552 nodes.append(" label \""); 553 nodes.append(stateIndex); 554 555 if(currentState.isFinal()){ 556 /// nodes += ",F\\n" + currentState.getRhs(); 557 nodes.append(",F\\n" + currentState.getRhs()); 558 } 559 /// nodes += "\" ]\n"; 560 nodes.append("\" ]\n"); 561 /// edges += currentState.getEdgesGML(); 562 edges.append(currentState.getEdgesGML()); 563 } 564 res += nodes.toString() + edges.toString() + "]\n"; 565 return res; 566 } // getDFSMgml 567 568 //no doc required: javadoc will copy it from the interface 569 /** */ 570 public FeatureMap getFeatures(){ 571 return features; 572 } // getFeatures 573 574 /** */ 575 public void setFeatures(FeatureMap features){ 576 this.features = features; 577 } // setFeatures 578 579 /** 580 * The method that does the actual tokenisation. 581 */ 582 public void execute() throws ExecutionException { 583 interrupted = false; 584 AnnotationSet annotationSet; 585 //check the input 586 if(document == null) { 587 throw new ExecutionException( 588 "No document to tokenise!" 
589 ); 590 } 591 592 if(annotationSetName == null || 593 annotationSetName.equals("")) annotationSet = document.getAnnotations(); 594 else annotationSet = document.getAnnotations(annotationSetName); 595 596 fireStatusChanged( 597 "Tokenising " + document.getName() + "..."); 598 599 String content = document.getContent().toString(); 600 int length = content.length(); 601 char currentChar; 602 603 DFSMState graphPosition = dInitialState; 604 605 //the index of the first character of the token trying to be recognised 606 int tokenStart = 0; 607 608 //the index of the last character of the last token recognised 609 int lastMatch = -1; 610 611 DFSMState lastMatchingState = null; 612 DFSMState nextState; 613 String tokenString; 614 int charIdx = 0; 615 int oldCharIdx = 0; 616 FeatureMap newTokenFm; 617 618 while(charIdx < length){ 619 currentChar = content.charAt(charIdx); 620 // Out.println( 621 // currentChar + typesMnemonics[Character.getType(currentChar)+128]); 622 nextState = graphPosition.next(((Integer)typeIds.get( 623 new Integer(Character.getType(currentChar)))).intValue()); 624 625 if( null != nextState ) { 626 graphPosition = nextState; 627 if(graphPosition.isFinal()) { 628 lastMatch = charIdx; 629 lastMatchingState = graphPosition; 630 } 631 charIdx ++; 632 } else {//we have a match! 633 newTokenFm = Factory.newFeatureMap(); 634 635 if (null == lastMatchingState) { 636 tokenString = content.substring(tokenStart, tokenStart +1); 637 newTokenFm.put("type","UNKNOWN"); 638 newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString); 639 newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, 640 Integer.toString(tokenString.length())); 641 642 try { 643 annotationSet.add(new Long(tokenStart), 644 new Long(tokenStart + 1), 645 "DEFAULT_TOKEN", newTokenFm); 646 } catch (InvalidOffsetException ioe) { 647 //This REALLY shouldn't happen! 
648 ioe.printStackTrace(Err.getPrintWriter()); 649 } 650 // Out.println("Default token: " + tokenStart + 651 // "->" + tokenStart + " :" + tokenString + ";"); 652 charIdx = tokenStart + 1; 653 } else { 654 tokenString = content.substring(tokenStart, lastMatch + 1); 655 newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString); 656 newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, 657 Integer.toString(tokenString.length())); 658 659 for(int i = 1; i < lastMatchingState.getTokenDesc().length; i++){ 660 newTokenFm.put(lastMatchingState.getTokenDesc()[i][0], 661 lastMatchingState.getTokenDesc()[i][1]); 662 //Out.println(lastMatchingState.getTokenDesc()[i][0] + "=" + 663 // lastMatchingState.getTokenDesc()[i][1]); 664 } 665 666 667 try { 668 annotationSet.add(new Long(tokenStart), 669 new Long(lastMatch + 1), 670 lastMatchingState.getTokenDesc()[0][0], newTokenFm); 671 } catch(InvalidOffsetException ioe) { 672 //This REALLY shouldn't happen! 673 throw new GateRuntimeException(ioe.toString()); 674 } 675 676 // Out.println(lastMatchingState.getTokenDesc()[0][0] + 677 // ": " + tokenStart + "->" + lastMatch + 678 // " :" + tokenString + ";"); 679 charIdx = lastMatch + 1; 680 } 681 682 lastMatchingState = null; 683 graphPosition = dInitialState; 684 tokenStart = charIdx; 685 } 686 687 if((charIdx - oldCharIdx > 256)){ 688 fireProgressChanged((100 * charIdx )/ length ); 689 oldCharIdx = charIdx; 690 if(isInterrupted()) throw new ExecutionInterruptedException(); 691 } 692 693 } // while(charIdx < length) 694 695 if (null != lastMatchingState) { 696 tokenString = content.substring(tokenStart, lastMatch + 1); 697 newTokenFm = Factory.newFeatureMap(); 698 newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString); 699 newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, 700 Integer.toString(tokenString.length())); 701 702 for(int i = 1; i < lastMatchingState.getTokenDesc().length; i++){ 703 newTokenFm.put(lastMatchingState.getTokenDesc()[i][0], 704 lastMatchingState.getTokenDesc()[i][1]); 705 } 706 707 
708 try { 709 annotationSet.add(new Long(tokenStart), 710 new Long(lastMatch + 1), 711 lastMatchingState.getTokenDesc()[0][0], newTokenFm); 712 } catch(InvalidOffsetException ioe) { 713 //This REALLY shouldn't happen! 714 throw new GateRuntimeException(ioe.toString()); 715 } 716 717 } 718 719 reset(); 720 fireProcessFinished(); 721 fireStatusChanged("Tokenisation complete!"); 722 } // run 723 724 /** 725 * Sets the value of the <code>rulesURL</code> property which holds an URL 726 * to the file containing the rules for this tokeniser. 727 * @param newRulesURL 728 */ 729 public void setRulesURL(java.net.URL newRulesURL) { 730 rulesURL = newRulesURL; 731 } 732 /** 733 * Gets the value of the <code>rulesURL</code> property hich holds an 734 * URL to the file containing the rules for this tokeniser. 735 */ 736 public java.net.URL getRulesURL() { 737 return rulesURL; 738 } 739 /** */ 740 public void setAnnotationSetName(String newAnnotationSetName) { 741 annotationSetName = newAnnotationSetName; 742 } 743 /** */ 744 public String getAnnotationSetName() { 745 return annotationSetName; 746 } 747 public void setRulesResourceName(String newRulesResourceName) { 748 rulesResourceName = newRulesResourceName; 749 } 750 public String getRulesResourceName() { 751 return rulesResourceName; 752 } 753 public void setEncoding(String newEncoding) { 754 encoding = newEncoding; 755 } 756 public String getEncoding() { 757 return encoding; 758 } 759 760 /** */ 761 protected FeatureMap features = null; 762 763 /** the annotations et where the new annotations will be adde 764 */ 765 protected String annotationSetName; 766 767 /** The initial state of the non deterministic machin 768 */ 769 protected FSMState initialState; 770 771 /** A set containng all the states of the non deterministic machin 772 */ 773 protected Set fsmStates = new HashSet(); 774 775 /** The initial state of the deterministic machin 776 */ 777 protected DFSMState dInitialState; 778 779 /** A set containng all the states 
of the deterministic machin 780 */ 781 protected Set dfsmStates = new HashSet(); 782 783 /** The separator from LHS to RH 784 */ 785 static String LHStoRHS = ">"; 786 787 /** A set of string representing tokens to be ignored (e.g. blanks 788 */ 789 static Set ignoreTokens; 790 791 /** maps from int (the static value on {@link java.lang.Character} to int 792 * the internal value used by the tokeniser. The ins values used by the 793 * tokeniser are consecutive values, starting from 0 and going as high as 794 * necessary. 795 * They map all the public static int members on{@link java.lang.Character} 796 */ 797 public static Map typeIds; 798 799 /** The maximum int value used internally as a type i 800 */ 801 public static int maxTypeId; 802 803 /** Maps the internal type ids to the type name 804 */ 805 public static String[] typeMnemonics; 806 807 /** Maps from type names to type internal id 808 */ 809 public static Map stringTypeIds; 810 811 /** 812 * This property holds an URL to the file containing the rules for this tokeniser 813 * 814 */ 815 816 /** */ 817 static protected String defaultResourceName = 818 "creole/tokeniser/DefaultTokeniser.rules"; 819 820 private String rulesResourceName; 821 private java.net.URL rulesURL; 822 private String encoding; 823 private transient Vector progressListeners; 824 //kalina: added this as method to minimise too many init() calls 825 protected transient Map newStates = new HashMap(); 826 827 828 /** The static initialiser will inspect the class {@link java.lang.Character} 829 * using reflection to find all the public static members and will map them 830 * to ids starting from 0. 
831 * After that it will build all the static data: {@link #typeIds}, {@link 832 * #maxTypeId}, {@link #typeMnemonics}, {@link #stringTypeIds} 833 */ 834 static{ 835 Field[] characterClassFields; 836 837 try{ 838 characterClassFields = Class.forName("java.lang.Character").getFields(); 839 }catch(ClassNotFoundException cnfe){ 840 throw new LuckyException("Could not find the java.lang.Character class!"); 841 } 842 843 Collection staticFields = new LinkedList(); 844 // JDK 1.4 introduced directionality constants that have the same values as 845 //character types; we need to skip those as well 846 for(int i = 0; i< characterClassFields.length; i++) 847 if(Modifier.isStatic(characterClassFields[i].getModifiers()) && 848 characterClassFields[i].getName().indexOf("DIRECTIONALITY") == -1) 849 staticFields.add(characterClassFields[i]); 850 851 typeIds = new HashMap(); 852 maxTypeId = staticFields.size() -1; 853 typeMnemonics = new String[maxTypeId + 1]; 854 stringTypeIds = new HashMap(); 855 856 Iterator staticFieldsIter = staticFields.iterator(); 857 Field currentField; 858 int currentId = 0; 859 String fieldName; 860 861 try { 862 while(staticFieldsIter.hasNext()){ 863 currentField = (Field)staticFieldsIter.next(); 864 if(currentField.getType().toString().equals("byte")){ 865 fieldName = currentField.getName(); 866 typeIds.put(new Integer(currentField.getInt(null)), 867 new Integer(currentId)); 868 typeMnemonics[currentId] = fieldName; 869 stringTypeIds.put(fieldName, new Integer(currentId)); 870 currentId++; 871 } 872 } 873 } catch(Exception e) { 874 throw new LuckyException(e.toString()); 875 } 876 877 ignoreTokens = new HashSet(); 878 ignoreTokens.add(" "); 879 ignoreTokens.add("\t"); 880 ignoreTokens.add("\f"); 881 } 882 883 } // class DefaultTokeniser
|
SimpleTokeniser |
|