|
HtmlDocumentHandler |
|
1 /* 2 * HtmlDocumentHandler.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Cristian URSU, 12/June/2000 12 * 13 * $Id: HtmlDocumentHandler.java,v 1.32 2002/05/17 08:42:28 nasso Exp $ 14 */ 15 16 package gate.html; 17 18 import javax.swing.text.html.*; 19 import javax.swing.text.html.parser.*; 20 import javax.swing.text.html.HTMLEditorKit.*; 21 import javax.swing.text.BadLocationException; 22 import javax.swing.text.MutableAttributeSet; 23 24 import java.util.*; 25 26 import gate.corpora.*; 27 import gate.util.*; 28 import gate.*; 29 import gate.event.*; 30 31 32 /** Implements the behaviour of the HTML reader. 33 * Methods of an object of this class are called by the HTML parser when 34 * events will appear. 35 * The idea is to parse the HTML document and construct Gate annotations 36 * objects. 37 * This class also will replace the content of the Gate document with a 38 * new one containing anly text from the HTML document. 39 */ 40 public class HtmlDocumentHandler extends ParserCallback { 41 42 /** Debug flag */ 43 private static final boolean DEBUG = false; 44 45 /** Constructor initialises all the private memeber data. 46 * This will use the default annotation set taken from the gate document. 47 * @param aDocument The gate document that will be processed 48 * @param aMarkupElementsMap The map containing the elements that will 49 * transform into annotations 50 */ 51 public HtmlDocumentHandler(gate.Document aDocument, Map aMarkupElementsMap) { 52 this(aDocument,aMarkupElementsMap,null); 53 } 54 55 /** Constructor initialises all the private memeber data 56 * @param aDocument The gate document that will be processed 57 * @param aMarkupElementsMap The map containing the elements that will 58 * transform into annotations 59 * @param anAnnoatationSet The annotation set that will contain annotations 60 * resulted from the processing of the gate document 61 */ 62 public HtmlDocumentHandler(gate.Document aDocument, 63 Map aMarkupElementsMap, 64 gate.AnnotationSet anAnnotationSet) { 65 // init stack 66 stack = new java.util.Stack(); 67 68 // this string contains the plain text (the text without markup) 69 tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue()); 70 71 // colector is used later to transform all custom objects into 72 // annotation objects 73 colector = new LinkedList(); 74 75 // the Gate document 76 doc = aDocument; 77 78 // this map contains the elements name that we want to create 79 // if it's null all the elements from the XML documents will be transformed 80 // into Gate annotation objects 81 markupElementsMap = aMarkupElementsMap; 82 83 // init an annotation set for this gate document 84 basicAS = anAnnotationSet; 85 86 customObjectsId = 0; 87 }//HtmlDocumentHandler 88 89 /** Keep the refference to this structure */ 90 private RepositioningInfo reposInfo = null; 91 92 /** Keep the refference to this structure */ 93 private RepositioningInfo ampCodingInfo = null; 94 95 /** Set repositioning information structure refference. If you set this 96 * refference to <B>null</B> information wouldn't be collected. 97 */ 98 public void setRepositioningInfo(RepositioningInfo info) { 99 reposInfo = info; 100 } // setRepositioningInfo 101 102 /** Return current RepositioningInfo object */ 103 public RepositioningInfo getRepositioningInfo() { 104 return reposInfo; 105 } // getRepositioningInfo 106 107 /** Set repositioning information structure refference for ampersand coding. 108 * If you set this refference to <B>null</B> information wouldn't be used. 109 */ 110 public void setAmpCodingInfo(RepositioningInfo info) { 111 ampCodingInfo = info; 112 } // setRepositioningInfo 113 114 /** Return current RepositioningInfo object for ampersand coding. */ 115 public RepositioningInfo getAmpCodingInfo() { 116 return ampCodingInfo; 117 } // getRepositioningInfo 118 119 /** The text inside the STYLE tag is processed with <code>handleText()</code>. 120 * We should skip inserting of this text in the document. */ 121 private boolean isInsideStyleTag = false; 122 123 /** This method is called when the HTML parser encounts the beginning 124 * of a tag that means that the tag is paired by an end tag and it's 125 * not an empty one. 126 */ 127 public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) { 128 // Fire the status listener if the elements processed exceded the rate 129 if (0 == (++elements % ELEMENTS_RATE)) 130 fireStatusChangedEvent("Processed elements : " + elements); 131 132 // Start of STYLE tag 133 if(HTML.Tag.STYLE.equals(t)) { 134 isInsideStyleTag = true; 135 } // if 136 137 // Construct a feature map from the attributes list 138 FeatureMap fm = Factory.newFeatureMap(); 139 140 // Take all the attributes an put them into the feature map 141 if (0 != a.getAttributeCount()){ 142 Enumeration enum = a.getAttributeNames(); 143 while (enum.hasMoreElements()){ 144 Object attribute = enum.nextElement(); 145 fm.put(attribute.toString(),(a.getAttribute(attribute)).toString()); 146 }// while 147 }// if 148 149 // Just analize the tag t and add some\n chars and spaces to the 150 // tmpDocContent.The reason behind is that we need to have a readable form 151 // for the final document. 152 customizeAppearanceOfDocumentWithStartTag(t); 153 154 // If until here the "tmpDocContent" ends with a NON whitespace char, 155 // then we add a space char before calculating the START index of this 156 // tag. 157 // This is done in order not to concatenate the content of two separate tags 158 // and obtain a different NEW word. 159 int tmpDocContentSize = tmpDocContent.length(); 160 if ( tmpDocContentSize != 0 && 161 !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1)) 162 ) tmpDocContent.append(" "); 163 164 // create the start index of the annotation 165 Long startIndex = new Long(tmpDocContent.length()); 166 167 // initialy the start index is equal with the End index 168 CustomObject obj = new CustomObject(t.toString(),fm,startIndex,startIndex); 169 170 // put it into the stack 171 stack.push (obj); 172 173 }//handleStartTag 174 175 /** This method is called when the HTML parser encounts the end of a tag 176 * that means that the tag is paired by a beginning tag 177 */ 178 public void handleEndTag(HTML.Tag t, int pos){ 179 // obj is for internal use 180 CustomObject obj = null; 181 182 // end of STYLE tag 183 if(HTML.Tag.STYLE.equals(t)) { 184 isInsideStyleTag = false; 185 } // if 186 187 // If the stack is not empty then we get the object from the stack 188 if (!stack.isEmpty()){ 189 obj = (CustomObject) stack.pop(); 190 // Before adding it to the colector, we need to check if is an 191 // emptyAndSpan one. See CustomObject's isEmptyAndSpan field. 192 if (obj.getStart().equals(obj.getEnd())){ 193 // The element had an end tag and its start was equal to its end. Hence 194 // it is anEmptyAndSpan one. 195 obj.getFM().put("isEmptyAndSpan","true"); 196 }// End iff 197 // we add it to the colector 198 colector.add(obj); 199 }// End if 200 201 // If element has text between, then customize its apearance 202 if ( obj != null && 203 obj.getStart().longValue() != obj.getEnd().longValue() 204 ) 205 // Customize the appearance of the document 206 customizeAppearanceOfDocumentWithEndTag(t); 207 208 // if t is the </HTML> tag then we reached the end of theHTMLdocument 209 if (t == HTML.Tag.HTML){ 210 // replace the old content with the new one 211 doc.setContent (new DocumentContentImpl(tmpDocContent.toString())); 212 213 // If basicAs is null then get the default annotation 214 // set from this gate document 215 if (basicAS == null) 216 basicAS = doc.getAnnotations( 217 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 218 219 // sort colector ascending on its id 220 Collections.sort(colector); 221 // iterate through colector and construct annotations 222 while (!colector.isEmpty()){ 223 obj = (CustomObject) colector.getFirst(); 224 colector.remove(obj); 225 // Construct an annotation from this obj 226 try{ 227 if (markupElementsMap == null){ 228 basicAS.add( obj.getStart(), 229 obj.getEnd(), 230 obj.getElemName(), 231 obj.getFM() 232 ); 233 }else{ 234 String annotationType = 235 (String) markupElementsMap.get(obj.getElemName()); 236 if (annotationType != null) 237 basicAS.add( obj.getStart(), 238 obj.getEnd(), 239 annotationType, 240 obj.getFM() 241 ); 242 } 243 }catch (InvalidOffsetException e){ 244 Err.prln("Error creating an annot :" + obj + " Discarded..."); 245 }// end try 246 // }// end if 247 }//while 248 249 // notify the listener about the total amount of elements that 250 // has been processed 251 fireStatusChangedEvent("Total elements : " + elements); 252 253 }//else 254 255 }//handleEndTag 256 257 /** This method is called when the HTML parser encounts an empty tag 258 */ 259 public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos){ 260 // fire the status listener if the elements processed exceded the rate 261 if ((++elements % ELEMENTS_RATE) == 0) 262 fireStatusChangedEvent("Processed elements : " + elements); 263 264 // construct a feature map from the attributes list 265 // these are empty elements 266 FeatureMap fm = Factory.newFeatureMap(); 267 268 // take all the attributes an put them into the feature map 269 if (0 != a.getAttributeCount ()){ 270 271 // Out.println("HAS attributes = " + a.getAttributeCount ()); 272 Enumeration enum = a.getAttributeNames (); 273 while (enum.hasMoreElements ()){ 274 Object attribute = enum.nextElement (); 275 fm.put ( attribute.toString(),(a.getAttribute(attribute)).toString()); 276 277 }//while 278 279 }//if 280 281 // create the start index of the annotation 282 Long startIndex = new Long(tmpDocContent.length()); 283 284 // initialy the start index is equal with the End index 285 CustomObject obj = new CustomObject(t.toString(),fm,startIndex,startIndex); 286 287 // we add the object directly into the colector 288 // we don't add it to the stack because this is an empty tag 289 colector.add(obj); 290 291 // Just analize the tag t and add some\n chars and spaces to the 292 // tmpDocContent.The reason behind is that we need to have a readable form 293 // for the final document. 294 customizeAppearanceOfDocumentWithSimpleTag(t); 295 296 } // handleSimpleTag 297 298 /** This method is called when the HTML parser encounts text (PCDATA) 299 */ 300 public void handleText(char[] text, int pos){ 301 302 // Skip the STYLE tag content 303 if(isInsideStyleTag) return; 304 305 // create a string object based on the reported text 306 String content = new String(text); 307 308 // remove the difference between JDK 1.3 and JDK 1.4 309 String trimContent = content.trim(); 310 if(trimContent.length() == 0) { 311 return; 312 } // if 313 314 int trimCorrection = content.indexOf(trimContent.charAt(0)); 315 content = trimContent; 316 317 StringBuffer contentBuffer = new StringBuffer(""); 318 int tmpDocContentSize = tmpDocContent.length(); 319 boolean incrementStartIndex = false; 320 // If the first char of the text just read "text[0]" is NOT whitespace AND 321 // the last char of the tmpDocContent[SIZE-1] is NOT whitespace then 322 // concatenation "tmpDocContent + content" will result into a new different 323 // word... and we want to avoid that... 324 if ( tmpDocContentSize != 0 && 325 content.length() != 0 && 326 !Character.isWhitespace(content.charAt(0)) && 327 !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))){ 328 329 contentBuffer.append(" "); 330 incrementStartIndex = true; 331 }// End if 332 // update the document content 333 334 // put the repositioning information 335 if(reposInfo != null) { 336 int extractedPos = tmpDocContent.length() + contentBuffer.length(); 337 addRepositioningInfo(content, pos + trimCorrection, extractedPos); 338 } // if 339 340 contentBuffer.append(content); 341 // calculate the End index for all the elements of the stack 342 // the expression is : End index = Current doc length + text length 343 Long end = new Long(tmpDocContent.length() + contentBuffer.length()); 344 345 CustomObject obj = null; 346 // Iterate through stack to modify the End index of the existing elements 347 348 java.util.Iterator anIterator = stack.iterator(); 349 while (anIterator.hasNext ()){ 350 // get the object and move to the next one 351 obj = (CustomObject) anIterator.next (); 352 if (incrementStartIndex && obj.getStart().equals(obj.getEnd())){ 353 obj.setStart(new Long(obj.getStart().longValue() + 1)); 354 }// End if 355 // sets its End index 356 obj.setEnd(end); 357 }// End while 358 359 tmpDocContent.append(contentBuffer.toString()); 360 }// end handleText(); 361 362 /** For given content the list with shrink position information is searched 363 * and on the corresponding positions the correct repositioning information 364 * is calculated and generated. 365 */ 366 public void addRepositioningInfo(String content, int pos, int extractedPos) { 367 int contentLength = content.length(); 368 369 // wrong way (without correction and analysing) 370 //reposInfo.addPositionInfo(pos, contentLength, extractedPos, contentLength); 371 372 RepositioningInfo.PositionInfo pi = null; 373 long startPos = pos; 374 long correction = 0; 375 long substituteStart; 376 long remainingLen; 377 long offsetInExtracted; 378 379 for(int i = 0; i < ampCodingInfo.size(); ++i) { 380 pi = (RepositioningInfo.PositionInfo) ampCodingInfo.get(i); 381 substituteStart = pi.getOriginalPosition(); 382 383 if(substituteStart >= startPos) { 384 if(substituteStart > pos + contentLength + correction) { 385 break; // outside the current text 386 } // if 387 388 // should create two repositioning information records 389 remainingLen = substituteStart - (startPos + correction); 390 offsetInExtracted = startPos - pos; 391 if(remainingLen > 0) { 392 reposInfo.addPositionInfo(startPos + correction, remainingLen, 393 extractedPos + offsetInExtracted, remainingLen); 394 } // if 395 // record for shrank text 396 reposInfo.addPositionInfo(substituteStart, pi.getOriginalLength(), 397 extractedPos + offsetInExtracted + remainingLen, 398 pi.getCurrentLength()); 399 startPos = startPos + remainingLen + pi.getCurrentLength(); 400 correction += pi.getOriginalLength() - pi.getCurrentLength(); 401 } // if 402 } // for 403 404 // there is some text remaining for repositioning 405 offsetInExtracted = startPos - pos; 406 remainingLen = contentLength - offsetInExtracted; 407 if(remainingLen > 0) { 408 reposInfo.addPositionInfo(startPos + correction, remainingLen, 409 extractedPos + offsetInExtracted, remainingLen); 410 } // if 411 } // addRepositioningInfo 412 413 /** This method analizes the tag t and adds some \n chars and spaces to the 414 * tmpDocContent.The reason behind is that we need to have a readable form 415 * for the final document. This method modifies the content of tmpDocContent. 416 * @param t the Html tag encounted by the HTML parser 417 */ 418 protected void customizeAppearanceOfDocumentWithSimpleTag(HTML.Tag t){ 419 boolean modification = false; 420 // if the HTML tag is BR then we add a new line character to the document 421 if (HTML.Tag.BR == t){ 422 tmpDocContent.append("\n"); 423 modification = true; 424 }// End if 425 if (modification == true){ 426 Long end = new Long (tmpDocContent.length()); 427 java.util.Iterator anIterator = stack.iterator(); 428 while (anIterator.hasNext ()){ 429 // get the object and move to the next one 430 CustomObject obj = (CustomObject) anIterator.next(); 431 // sets its End index 432 obj.setEnd(end); 433 }// End while 434 }//End if 435 }// customizeAppearanceOfDocumentWithSimpleTag 436 437 /** This method analizes the tag t and adds some \n chars and spaces to the 438 * tmpDocContent.The reason behind is that we need to have a readable form 439 * for the final document. This method modifies the content of tmpDocContent. 440 * @param t the Html tag encounted by the HTML parser 441 */ 442 protected void customizeAppearanceOfDocumentWithStartTag(HTML.Tag t){ 443 boolean modification = false; 444 if (HTML.Tag.P == t){ 445 int tmpDocContentSize = tmpDocContent.length(); 446 if ( tmpDocContentSize >= 2 && 447 '\n' != tmpDocContent.charAt(tmpDocContentSize - 2) 448 ) { tmpDocContent.append("\n"); modification = true;} 449 }// End if 450 if (modification == true){ 451 Long end = new Long (tmpDocContent.length()); 452 java.util.Iterator anIterator = stack.iterator(); 453 while (anIterator.hasNext ()){ 454 // get the object and move to the next one 455 CustomObject obj = (CustomObject) anIterator.next(); 456 // sets its End index 457 obj.setEnd(end); 458 }// End while 459 }//End if 460 }// customizeAppearanceOfDocumentWithStartTag 461 462 /** This method analizes the tag t and adds some \n chars and spaces to the 463 * tmpDocContent.The reason behind is that we need to have a readable form 464 * for the final document. This method modifies the content of tmpDocContent. 465 * @param t the Html tag encounted by the HTML parser 466 */ 467 protected void customizeAppearanceOfDocumentWithEndTag(HTML.Tag t){ 468 boolean modification = false; 469 // if the HTML tag is BR then we add a new line character to the document 470 if ( (HTML.Tag.P == t) || 471 472 (HTML.Tag.H1 == t) || 473 (HTML.Tag.H2 == t) || 474 (HTML.Tag.H3 == t) || 475 (HTML.Tag.H4 == t) || 476 (HTML.Tag.H5 == t) || 477 (HTML.Tag.H6 == t) || 478 (HTML.Tag.TR == t) || 479 (HTML.Tag.CENTER == t) || 480 (HTML.Tag.LI == t) 481 ){ tmpDocContent.append("\n"); modification = true;} 482 483 if (HTML.Tag.TITLE == t){ 484 tmpDocContent.append("\n\n"); 485 modification = true; 486 }// End if 487 488 if (modification == true){ 489 Long end = new Long (tmpDocContent.length()); 490 java.util.Iterator anIterator = stack.iterator(); 491 while (anIterator.hasNext ()){ 492 // get the object and move to the next one 493 CustomObject obj = (CustomObject) anIterator.next(); 494 // sets its End index 495 obj.setEnd(end); 496 }// End while 497 }//End if 498 }// customizeAppearanceOfDocumentWithEndTag 499 500 /** 501 * This method is called when the HTML parser encounts an error 502 * it depends on the programmer if he wants to deal with that error 503 */ 504 public void handleError(String errorMsg, int pos) { 505 //Out.println ("ERROR CALLED : " + errorMsg); 506 } 507 508 /** This method is called once, when the HTML parser reaches the end 509 * of its input streamin order to notify the parserCallback that there 510 * is nothing more to parse. 511 */ 512 public void flush() throws BadLocationException{ 513 }// flush 514 515 /** This method is called when the HTML parser encounts a comment 516 */ 517 public void handleComment(char[] text, int pos) { 518 } 519 520 //StatusReporter Implementation 521 522 public void addStatusListener(StatusListener listener) { 523 myStatusListeners.add(listener); 524 } 525 526 public void removeStatusListener(StatusListener listener) { 527 myStatusListeners.remove(listener); 528 } 529 530 protected void fireStatusChangedEvent(String text) { 531 Iterator listenersIter = myStatusListeners.iterator(); 532 while(listenersIter.hasNext()) 533 ((StatusListener)listenersIter.next()).statusChanged(text); 534 } 535 536 /** 537 * This method verifies if data contained by the CustomObject can be used 538 * to create a GATE annotation. 539 */ 540 /* private boolean canCreateAnnotation(CustomObject aCustomObject){ 541 long start = aCustomObject.getStart().longValue(); 542 long end = aCustomObject.getEnd().longValue(); 543 long gateDocumentSize = doc.getContent().size().longValue(); 544 545 if (start < 0 || end < 0 ) return false; 546 if (start > end ) return false; 547 if ((start > gateDocumentSize) || (end > gateDocumentSize)) return false; 548 return true; 549 }// canCreateAnnotation 550 */ 551 552 // HtmlDocumentHandler member data 553 554 // this constant indicates when to fire the status listener 555 // this listener will add an overhead and we don't want a big overhead 556 // this listener will be callled from ELEMENTS_RATE to ELEMENTS_RATE 557 final static int ELEMENTS_RATE = 128; 558 559 // this map contains the elements name that we want to create 560 // if it's null all the elements from the HTML documents will be transformed 561 // into Gate annotation objects otherwise only the elements it contains will 562 // be transformed 563 private Map markupElementsMap = null; 564 565 // the content of the HTML document, without any tag 566 // for internal use 567 private StringBuffer tmpDocContent = null; 568 569 // a stack used to remember elements and to keep the order 570 private java.util.Stack stack = null; 571 572 // a gate document 573 private gate.Document doc = null; 574 575 // an annotation set used for creating annotation reffering the doc 576 private gate.AnnotationSet basicAS; 577 578 // listeners for status report 579 protected List myStatusListeners = new LinkedList(); 580 581 // this reports the the number of elements that have beed processed so far 582 private int elements = 0; 583 584 protected long customObjectsId = 0; 585 // we need a colection to retain all the CustomObjects that will be 586 // transformed into annotation over the gate document... 587 // the transformation will take place inside onDocumentEnd() method 588 private LinkedList colector = null; 589 590 // Inner class 591 /** 592 * The objects belonging to this class are used inside the stack. 593 * This class is for internal needs 594 */ 595 class CustomObject implements Comparable { 596 597 // constructor 598 public CustomObject(String anElemName, FeatureMap aFm, 599 Long aStart, Long anEnd) { 600 elemName = anElemName; 601 fm = aFm; 602 start = aStart; 603 end = anEnd; 604 id = new Long(customObjectsId ++); 605 }// End CustomObject() 606 607 // Methos implemented as required by Comparable interface 608 public int compareTo(Object o){ 609 CustomObject obj = (CustomObject) o; 610 return this.id.compareTo(obj.getId()); 611 }// compareTo(); 612 613 // accesor 614 public String getElemName() { 615 return elemName; 616 }// getElemName() 617 618 public FeatureMap getFM() { 619 return fm; 620 }// getFM() 621 622 public Long getStart() { 623 return start; 624 }// getStart() 625 626 public Long getEnd() { 627 return end; 628 }// getEnd() 629 630 public Long getId(){ return id;} 631 632 // mutator 633 public void setElemName(String anElemName) { 634 elemName = anElemName; 635 }// getElemName() 636 637 public void setFM(FeatureMap aFm) { 638 fm = aFm; 639 }// setFM(); 640 641 public void setStart(Long aStart) { 642 start = aStart; 643 }// setStart(); 644 645 public void setEnd(Long anEnd) { 646 end = anEnd; 647 }// setEnd(); 648 649 // data fields 650 private String elemName = null; 651 private FeatureMap fm = null; 652 private Long start = null; 653 private Long end = null; 654 private Long id = null; 655 656 } // End inner class CustomObject 657 658 }//End class HtmlDocumentHandler 659 660 661 662
|
HtmlDocumentHandler |
|