|
DefaultGazetteer |
|
/*
 *  DefaultGazeteer.java
 *
 *  Copyright (c) 2000-2001, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June1991.
 *
 *  A copy of this licence is included in the distribution in the file
 *  licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
 *
 *  Valentin Tablan, 03/07/2000
 *  borislav popov 24/03/2002
 *
 *  $Id: DefaultGazetteer.java,v 1.44 2003/07/16 14:36:59 valyt Exp $
 */
package gate.creole.gazetteer;

import java.io.*;
import java.util.*;
import java.net.*;

import gate.util.*;
import gate.creole.*;
import gate.event.*;
import gate.*;

/**
 * This component is responsible for doing list lookup. The implementation is
 * based on a finite state machine (FSM).
 * <p>
 * The phrases to be recognised are listed in a set of files, one for each
 * type of occurrence. The gazetteer is built from a definition file that
 * names those list files and gives the type associated with each of them.
 * Each list definition is written on its own line and contains:
 * <ol>
 * <li>the file name (required)</li>
 * <li>the major type (required)</li>
 * <li>the minor type (optional)</li>
 * <li>the language(s) (optional)</li>
 * </ol>
 * The elements of each definition are separated by ":".
 * The following is an example of a valid definition: <br>
 * <code>personmale.lst:person:male:english</code>
 * <p>
 * Each list file named in the definition file contains one entry per line.
 * When the gazetteer is run over a GATE document it generates annotations of
 * type Lookup carrying the attributes specified in the definition file.
 */
public class DefaultGazetteer extends AbstractGazetteer {

  /** Debug flag. */
  private static final boolean DEBUG = false;

  public static final String
    DEF_GAZ_DOCUMENT_PARAMETER_NAME = "document";

  public static final String
    DEF_GAZ_ANNOT_SET_PARAMETER_NAME = "annotationSetName";

  public static final String
    DEF_GAZ_LISTS_URL_PARAMETER_NAME = "listsURL";

  public static final String
    DEF_GAZ_ENCODING_PARAMETER_NAME = "encoding";

  public static final String
    DEF_GAZ_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";


  /** A map from definition nodes to the gazetteer lists loaded for them. */
  private Map listsByNode;

  /**
   * Builds an empty gazetteer. {@link #init()} must be called before the
   * gazetteer can be used.
   */
  public DefaultGazetteer(){
  }

  /**
   * Does the actual loading and parsing of the lists. This method must be
   * called before the gazetteer can be used.
   *
   * @return this gazetteer
   * @throws ResourceInstantiationException if no lists URL has been set, or
   * if one of the lists cannot be read into the FSM
   */
  public Resource init() throws ResourceInstantiationException {
    fsmStates = new HashSet();
    initialState = new FSMState(this);
    if(listsURL == null){
      throw new ResourceInstantiationException (
            "No URL provided for gazetteer creation!");
    }
    definition = new LinearDefinition();
    definition.setURL(listsURL);
    definition.load();
    int linesCnt = definition.size();
    listsByNode = definition.loadLists();
    Iterator inodes = definition.iterator();

    int nodeIdx = 0;
    while (inodes.hasNext()) {
      LinearNode node = (LinearNode) inodes.next();
      fireStatusChanged("Reading " + node.toString());
      // linesCnt cannot be zero here: the loop body only runs when the
      // definition yields at least one node
      fireProgressChanged(++nodeIdx * 100 / linesCnt);
      readList(node, true);
    } // while inodes
    fireProcessFinished();
    return this;
  }


  /**
   * Reads one list (one file) of phrases.
   *
   * @param node the definition node describing the list to be read
   * @param add if <b>true</b> the phrases found in the list are added to the
   * ones recognised by this gazetteer, if <b>false</b> the phrases found in
   * the list are removed from the phrases recognised by this gazetteer.
   * @throws ResourceInstantiationException if <code>node</code> is null or no
   * list has been loaded for it
   */
  void readList(LinearNode node, boolean add)
      throws ResourceInstantiationException {
    if (null == node) {
      throw new ResourceInstantiationException(" LinearNode node is null ");
    }

    String listName = node.getList();
    String majorType = node.getMajorType();
    String minorType = node.getMinorType();
    String languages = node.getLanguage();
    GazetteerList gazList = (GazetteerList)listsByNode.get(node);
    if (null == gazList) {
      throw new ResourceInstantiationException(
          "gazetteer list not found by node");
    }

    Iterator iline = gazList.iterator();

    Lookup lookup = new Lookup(listName, majorType, minorType, languages);
    lookup.list = node.getList();
    if (null != mappingDefinition) {
      // attach the ontology class mapped to this list, if there is one
      MappingNode mnode = mappingDefinition.getNodeByList(lookup.list);
      if (null != mnode) {
        lookup.oClass = mnode.getClassID();
        lookup.ontology = mnode.getOntologyID();
      }
    } // if mapping def

    while (iline.hasNext()) {
      String line = iline.next().toString();
      if (add) {
        addLookup(line, lookup);
      } else {
        removeLookup(line, lookup);
      }
    }
  } // void readList(LinearNode node, boolean add)

  /**
   * Maps a raw character onto the character used for FSM transitions: every
   * whitespace character becomes a plain space and, unless the gazetteer is
   * case sensitive, every other character is converted to upper case.
   * <p>
   * All methods that build or walk the FSM go through this method so that
   * phrases are stored and searched for in the same form. A null
   * <code>caseSensitive</code> value is treated as case sensitive, i.e. no
   * case conversion is applied.
   *
   * @param ch the raw character
   * @return the character to use as the transition key
   */
  private char toFsmChar(char ch) {
    if (Character.isWhitespace(ch)) {
      return ' ';
    }
    boolean keepCase = caseSensitive == null || caseSensitive.booleanValue();
    return keepCase ? ch : Character.toUpperCase(ch);
  }

  /**
   * Follows the transitions for <code>text</code> starting from the initial
   * state, without creating any new state.
   *
   * @param text the phrase to follow
   * @return the state reached after consuming the whole phrase, or
   * <code>null</code> if some transition is missing
   */
  private FSMState findState(String text) {
    FSMState currentState = initialState;
    for (int i = 0; i < text.length(); i++) {
      currentState = currentState.next(toFsmChar(text.charAt(i)));
      if (currentState == null) {
        return null;
      }
    }
    return currentState;
  }

  /**
   * Adds one phrase to the list of phrases recognised by this gazetteer.
   * Transitions are keyed by primitive <code>char</code> values.
   *
   * @param text the phrase to be added
   * @param lookup the description of the annotation to be added when this
   * phrase is recognised
   */
  public void addLookup(String text, Lookup lookup) {
    FSMState currentState = initialState;

    for (int i = 0; i < text.length(); i++) {
      char currentChar = toFsmChar(text.charAt(i));
      FSMState nextState = currentState.next(currentChar);
      if (nextState == null) {
        nextState = new FSMState(this);
        currentState.put(currentChar, nextState);
        // a state entered on a space loops on further spaces, so that a run
        // of whitespace is matched like a single space
        if (currentChar == ' ') nextState.put(' ', nextState);
      }
      currentState = nextState;
    } // for(int i = 0; i< text.length(); i++)

    currentState.addLookup(lookup);
  } // addLookup

  /**
   * Removes one phrase from the list of phrases recognised by this
   * gazetteer. The phrase is normalised in the same way as in
   * {@link #addLookup(String, Lookup)}, so a phrase can be removed from a
   * case-insensitive gazetteer regardless of the case it is given in.
   *
   * @param text the phrase to be removed
   * @param lookup the description of the annotation associated to this phrase
   */
  public void removeLookup(String text, Lookup lookup) {
    FSMState state = findState(text);
    if (state == null) return; // nothing to remove
    state.removeLookup(lookup);
  } // removeLookup

  /**
   * Returns a string representation of the deterministic FSM graph using
   * GML.
   *
   * @return the GML text describing all the states and their edges
   */
  public String getFSMgml() {
    StringBuffer nodes = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE);
    StringBuffer edges = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE);
    Iterator fsmStatesIter = fsmStates.iterator();
    while (fsmStatesIter.hasNext()) {
      FSMState currentState = (FSMState)fsmStatesIter.next();
      int stateIndex = currentState.getIndex();
      nodes.append("node[ id ");
      nodes.append(stateIndex);
      nodes.append(" label \"");
      nodes.append(stateIndex);

      if (currentState.isFinal()) {
        nodes.append(",F\\n");
        nodes.append(currentState.getLookupSet());
      }
      nodes.append("\" ]\n");
      edges.append(currentState.getEdgesGML());
    }
    return "graph[ \ndirected 1\n" + nodes.toString() + edges.toString()
        + "]\n";
  } // getFSMgml


  /**
   * Tests whether a character is internal to a word (i.e. if it's a letter or
   * a combining mark (spacing or not)).
   *
   * @param ch the character to be tested
   * @return a boolean value
   */
  public static boolean isWordInternal(char ch){
    return Character.isLetter(ch) ||
           Character.getType(ch) == Character.COMBINING_SPACING_MARK ||
           Character.getType(ch) == Character.NON_SPACING_MARK;
  }

  /**
   * Adds one Lookup annotation to <code>annotationSet</code> for every lookup
   * attached to <code>matchingState</code>.
   * <p>
   * The "language" feature is set whenever the lookup has a language,
   * independently of whether it also has a minor type.
   *
   * @param matchingState the final state that was matched
   * @param matchedRegionStart offset of the first matched character
   * @param matchedRegionEnd offset of the last matched character (inclusive)
   * @param annotationSet the set receiving the new annotations
   */
  private void annotateMatch(FSMState matchingState,
                             int matchedRegionStart,
                             int matchedRegionEnd,
                             AnnotationSet annotationSet) {
    Iterator lookupIter = matchingState.getLookupSet().iterator();
    while (lookupIter.hasNext()) {
      Lookup currentLookup = (Lookup)lookupIter.next();
      FeatureMap fm = Factory.newFeatureMap();
      fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
      if (null != currentLookup.oClass && null != currentLookup.ontology) {
        fm.put(LOOKUP_CLASS_FEATURE_NAME, currentLookup.oClass);
        fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME, currentLookup.ontology);
      }
      if (null != currentLookup.minorType) {
        fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
      }
      if (null != currentLookup.languages) {
        fm.put("language", currentLookup.languages);
      }
      try {
        // the end offset handed to the annotation set is exclusive
        annotationSet.add(new Long(matchedRegionStart),
                          new Long(matchedRegionEnd + 1),
                          LOOKUP_ANNOTATION_TYPE,
                          fm);
      } catch(InvalidOffsetException ioe) {
        throw new LuckyException(ioe.toString());
      }
    } // while(lookupIter.hasNext())
  }

  /**
   * This method runs the gazetteer. It assumes that all the needed parameters
   * are set. If they are not, an exception will be fired.
   * <p>
   * Starting at each position of the document content the FSM is advanced
   * for as long as a transition exists, remembering the last final state
   * seen. When the FSM gets stuck, or the end of the content is reached, the
   * remembered match (if any) is annotated and matching restarts one
   * character after the start of the previous attempt.
   *
   * @throws ExecutionException if no document has been set, or if the
   * execution has been interrupted
   */
  public void execute() throws ExecutionException {
    interrupted = false;
    // check the input
    if (document == null) {
      throw new ExecutionException(
        "No document to process!"
      );
    }

    AnnotationSet annotationSet;
    if (annotationSetName == null || annotationSetName.equals("")) {
      annotationSet = document.getAnnotations();
    } else {
      annotationSet = document.getAnnotations(annotationSetName);
    }

    fireStatusChanged("Doing lookup in " +
                      document.getName() + "...");
    String content = document.getContent().toString();
    int length = content.length();
    FSMState currentState = initialState;
    FSMState lastMatchingState = null;
    int matchedRegionEnd = 0;
    int matchedRegionStart = 0;
    int charIdx = 0;
    int oldCharIdx = 0;

    while (charIdx < length) {
      char currentChar = toFsmChar(content.charAt(charIdx));
      FSMState nextState = currentState.next(currentChar);
      if (nextState == null) {
        // the matching stopped; if we had a successful match then act on it
        if (lastMatchingState != null) {
          annotateMatch(lastMatchingState, matchedRegionStart,
                        matchedRegionEnd, annotationSet);
          lastMatchingState = null;
        }

        // reset the FSM and restart from the next character
        charIdx = matchedRegionStart + 1;
        matchedRegionStart = charIdx;
        currentState = initialState;

      } else { // go on with the matching
        currentState = nextState;
        // if we have a successful state then store it; with wholeWordsOnly
        // the match must not be preceded or followed by a word character
        if (currentState.isFinal() &&
            (
              (!wholeWordsOnly.booleanValue())
              ||
              ((matchedRegionStart == 0 ||
                !isWordInternal(content.charAt(matchedRegionStart - 1)))
               &&
               (charIdx + 1 >= length ||
                !isWordInternal(content.charAt(charIdx + 1)))
              )
            )
           ) {
          matchedRegionEnd = charIdx;
          lastMatchingState = currentState;
        }
        charIdx++;
        if (charIdx == length) {
          // we can't go on, use the last matching state and restart matching
          // from the next char
          if (lastMatchingState != null) {
            annotateMatch(lastMatchingState, matchedRegionStart,
                          matchedRegionEnd, annotationSet);
            lastMatchingState = null;
          }

          // reset the FSM
          charIdx = matchedRegionStart + 1;
          matchedRegionStart = charIdx;
          currentState = initialState;
        }
      }
      if (charIdx - oldCharIdx > 256) {
        fireProgressChanged((100 * charIdx) / length);
        oldCharIdx = charIdx;
        if (isInterrupted()) throw new ExecutionInterruptedException(
            "The execution of the " + getName() +
            " gazetteer has been abruptly interrupted!");
      }
    } // while(charIdx < length)

    // Safety net only: both branches of the loop above flush and clear any
    // pending match before the loop can terminate.
    if (lastMatchingState != null) {
      annotateMatch(lastMatchingState, matchedRegionStart,
                    matchedRegionEnd, annotationSet);
    }
    fireProcessFinished();
    fireStatusChanged("Lookup complete!");
  } // execute


  /** The initial state of the FSM that backs this gazetteer. */
  FSMState initialState;

  /** A set containing all the states of the FSM backing the gazetteer. */
  Set fsmStates;

  /**
   * Looks up a single phrase. The phrase is normalised in the same way as in
   * {@link #addLookup(String, Lookup)}.
   *
   * @param singleItem a single string to be looked up by the gazetteer
   * @return the lookup set of the state reached by the phrase, or an empty
   * set if the phrase does not lead to any state
   */
  public Set lookup(String singleItem) {
    FSMState state = findState(singleItem);
    if (state == null) {
      return new HashSet();
    }
    return state.getLookupSet();
  }

  /**
   * Removes all the lookups associated with a phrase. The phrase is
   * normalised in the same way as in {@link #addLookup(String, Lookup)}.
   *
   * @param singleItem the phrase whose lookups are to be discarded
   * @return <code>true</code> if the phrase leads to a state of the FSM
   * (whose lookup set is then replaced by an empty one), <code>false</code>
   * otherwise
   */
  public boolean remove(String singleItem) {
    FSMState state = findState(singleItem);
    if (state == null) {
      return false; // nothing to remove
    }
    state.lookupSet = new HashSet();
    return true;
  }

  /**
   * Adds a phrase together with its lookup; see
   * {@link #addLookup(String, Lookup)}.
   *
   * @param singleItem the phrase to be added
   * @param lookup the lookup to associate with the phrase
   * @return always <code>true</code>
   */
  public boolean add(String singleItem, Lookup lookup) {
    addLookup(singleItem, lookup);
    return true;
  }


} // DefaultGazetteer

/**
 * Iterator over primitive <code>char</code> values. Not used within this
 * file.
 */
interface Iter
{
    public boolean hasNext();
    public char next();
} // iter class

/**
 * A map from <code>char</code> keys to objects, kept as two parallel arrays
 * sorted by key and searched with binary search. It is used instead of a
 * general purpose Map for the transition function of the FSM states.
 */
class charMap
{
    /** The keys, in ascending order; <code>null</code> until the first put. */
    char[] itemsKeys = null;

    /** The values; itemsObjs[i] is the value stored for itemsKeys[i]. */
    Object[] itemsObjs = null;

    /**
     * Grows both arrays by one element, leaving an empty slot at position
     * 'index'; the elements from 'index' onwards are shifted one position up.
     */
    void resize(int index)
    {
        int oldsz = itemsKeys.length;
        char[] tempKeys = new char[oldsz + 1];
        Object[] tempObjs = new Object[oldsz + 1];

        // elements before the gap keep their position
        System.arraycopy(itemsKeys, 0, tempKeys, 0, index);
        System.arraycopy(itemsObjs, 0, tempObjs, 0, index);
        // elements from the gap onwards move up by one
        System.arraycopy(itemsKeys, index, tempKeys, index + 1, oldsz - index);
        System.arraycopy(itemsObjs, index, tempObjs, index + 1, oldsz - index);

        itemsKeys = tempKeys;
        itemsObjs = tempObjs;
    } // resize

    /**
     * Gets the object stored for the given char key.
     *
     * @return the stored object, or <code>null</code> if the key is absent
     */
    Object get(char key)
    {
        if (itemsKeys == null) return null;
        int index = Arrays.binarySearch(itemsKeys, key);
        if (index < 0)
            return null;
        return itemsObjs[index];
    }

    /**
     * Puts the object into the map using the char as the key. If the key is
     * already present the existing value is kept, not replaced.
     *
     * @return the value held for the key after the call
     */
    Object put(char key, Object value)
    {
        if (itemsKeys == null)
        {
            itemsKeys = new char[1];
            itemsKeys[0] = key;
            itemsObjs = new Object[1];
            itemsObjs[0] = value;
            return value;
        } // if first time
        int index = Arrays.binarySearch(itemsKeys, key);
        if (index < 0)
        {
            // binarySearch returns (-(insertion point) - 1) for a missing key
            index = ~index;
            resize(index);
            itemsKeys[index] = key;
            itemsObjs[index] = value;
        }
        return itemsObjs[index];
    } // put

} // class charMap
|
DefaultGazetteer |
|