1   /*
2    *  HtmlDocumentHandler.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU,  12/June/2000
12   *
13   *  $Id: HtmlDocumentHandler.java,v 1.32 2002/05/17 08:42:28 nasso Exp $
14   */
15  
16  package gate.html;
17  
18  import javax.swing.text.html.*;
19  import javax.swing.text.html.parser.*;
20  import javax.swing.text.html.HTMLEditorKit.*;
21  import javax.swing.text.BadLocationException;
22  import javax.swing.text.MutableAttributeSet;
23  
24  import java.util.*;
25  
26  import gate.corpora.*;
27  import gate.util.*;
28  import gate.*;
29  import gate.event.*;
30  
31  
32  /** Implements the behaviour of the HTML reader.
33    * Methods of an object of this class are called by the HTML parser when
34    * parsing events occur.
35    * The idea is to parse the HTML document and construct Gate annotation
36    * objects.
37    * This class will also replace the content of the Gate document with a
38    * new one containing only the text from the HTML document.
39    */
40  public class HtmlDocumentHandler extends ParserCallback {
41  
42    /** Debug flag */
43    private static final boolean DEBUG = false;
44  
45    /** Constructor initialises all the private memeber data.
46      * This will use the default annotation set taken from the gate document.
47      * @param aDocument The gate document that will be processed
48      * @param aMarkupElementsMap The map containing the elements that will
49      * transform into annotations
50      */
51    public HtmlDocumentHandler(gate.Document aDocument, Map aMarkupElementsMap) {
52      this(aDocument,aMarkupElementsMap,null);
53    }
54  
55    /** Constructor initialises all the private memeber data
56      * @param aDocument The gate document that will be processed
57      * @param aMarkupElementsMap The map containing the elements that will
58      * transform into annotations
59      * @param anAnnoatationSet The annotation set that will contain annotations
60      * resulted from the processing of the gate document
61      */
62    public HtmlDocumentHandler(gate.Document       aDocument,
63                               Map                 aMarkupElementsMap,
64                               gate.AnnotationSet  anAnnotationSet) {
65      // init stack
66      stack = new java.util.Stack();
67  
68      // this string contains the plain text (the text without markup)
69      tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue());
70  
71      // colector is used later to transform all custom objects into
72      // annotation objects
73      colector = new LinkedList();
74  
75      // the Gate document
76      doc = aDocument;
77  
78      // this map contains the elements name that we want to create
79      // if it's null all the elements from the XML documents will be transformed
80      // into Gate annotation objects
81      markupElementsMap = aMarkupElementsMap;
82  
83      // init an annotation set for this gate document
84      basicAS = anAnnotationSet;
85  
86      customObjectsId = 0;
87    }//HtmlDocumentHandler
88  
89    /** Keep the refference to this structure */
90    private RepositioningInfo reposInfo = null;
91  
92    /** Keep the refference to this structure */
93    private RepositioningInfo ampCodingInfo = null;
94  
95    /** Set repositioning information structure refference. If you set this
96     *  refference to <B>null</B> information wouldn't be collected.
97     */
98    public void setRepositioningInfo(RepositioningInfo info) {
99      reposInfo = info;
100   } // setRepositioningInfo
101 
102   /** Return current RepositioningInfo object */
103   public RepositioningInfo getRepositioningInfo() {
104     return reposInfo;
105   } // getRepositioningInfo
106 
107   /** Set repositioning information structure refference for ampersand coding.
108    *  If you set this refference to <B>null</B> information wouldn't be used.
109    */
110   public void setAmpCodingInfo(RepositioningInfo info) {
111     ampCodingInfo = info;
112   } // setRepositioningInfo
113 
114   /** Return current RepositioningInfo object for ampersand coding. */
115   public RepositioningInfo getAmpCodingInfo() {
116     return ampCodingInfo;
117   } // getRepositioningInfo
118 
119   /** The text inside the STYLE tag is processed with <code>handleText()</code>.
120    *  We should skip inserting of this text in the document. */
121   private boolean isInsideStyleTag = false;
122 
123   /** This method is called when the HTML parser encounts the beginning
124     * of a tag that means that the tag is paired by an end tag and it's
125     * not an empty one.
126     */
127   public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
128     // Fire the status listener if the elements processed exceded the rate
129     if (0 == (++elements % ELEMENTS_RATE))
130       fireStatusChangedEvent("Processed elements : " + elements);
131 
132     // Start of STYLE tag
133     if(HTML.Tag.STYLE.equals(t)) {
134       isInsideStyleTag = true;
135     } // if
136 
137     // Construct a feature map from the attributes list
138     FeatureMap fm = Factory.newFeatureMap();
139 
140     // Take all the attributes an put them into the feature map
141     if (0 != a.getAttributeCount()){
142       Enumeration enum = a.getAttributeNames();
143       while (enum.hasMoreElements()){
144         Object attribute = enum.nextElement();
145         fm.put(attribute.toString(),(a.getAttribute(attribute)).toString());
146       }// while
147     }// if
148 
149     // Just analize the tag t and add some\n chars and spaces to the
150     // tmpDocContent.The reason behind is that we need to have a readable form
151     // for the final document.
152     customizeAppearanceOfDocumentWithStartTag(t);
153 
154     // If until here the "tmpDocContent" ends with a NON whitespace char,
155     // then we add a space char before calculating the START index of this
156     // tag.
157     // This is done in order not to concatenate the content of two separate tags
158     // and obtain a different NEW word.
159     int tmpDocContentSize = tmpDocContent.length();
160     if ( tmpDocContentSize != 0 &&
161          !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))
162        ) tmpDocContent.append(" ");
163 
164     // create the start index of the annotation
165     Long startIndex = new Long(tmpDocContent.length());
166 
167     // initialy the start index is equal with the End index
168     CustomObject obj = new CustomObject(t.toString(),fm,startIndex,startIndex);
169 
170     // put it into the stack
171     stack.push (obj);
172 
173   }//handleStartTag
174 
175    /** This method is called when the HTML parser encounts the end of a tag
176      * that means that the tag is paired by a beginning tag
177      */
178   public void handleEndTag(HTML.Tag t, int pos){
179     // obj is for internal use
180     CustomObject obj = null;
181 
182     // end of STYLE tag
183     if(HTML.Tag.STYLE.equals(t)) {
184       isInsideStyleTag = false;
185     } // if
186 
187     // If the stack is not empty then we get the object from the stack
188     if (!stack.isEmpty()){
189       obj = (CustomObject) stack.pop();
190       // Before adding it to the colector, we need to check if is an
191       // emptyAndSpan one. See CustomObject's isEmptyAndSpan field.
192       if (obj.getStart().equals(obj.getEnd())){
193         // The element had an end tag and its start was equal to its end. Hence
194         // it is anEmptyAndSpan one.
195         obj.getFM().put("isEmptyAndSpan","true");
196       }// End iff
197       // we add it to the colector
198       colector.add(obj);
199     }// End if
200 
201     // If element has text between, then customize its apearance
202     if ( obj != null &&
203          obj.getStart().longValue() != obj.getEnd().longValue()
204        )
205       // Customize the appearance of the document
206       customizeAppearanceOfDocumentWithEndTag(t);
207 
208     // if t is the </HTML> tag then we reached the end of theHTMLdocument
209     if (t == HTML.Tag.HTML){
210       // replace the old content with the new one
211       doc.setContent (new DocumentContentImpl(tmpDocContent.toString()));
212 
213       // If basicAs is null then get the default annotation
214       // set from this gate document
215       if (basicAS == null)
216         basicAS = doc.getAnnotations(
217                                 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
218 
219       // sort colector ascending on its id
220       Collections.sort(colector);
221       // iterate through colector and construct annotations
222       while (!colector.isEmpty()){
223         obj = (CustomObject) colector.getFirst();
224         colector.remove(obj);
225           // Construct an annotation from this obj
226           try{
227             if (markupElementsMap == null){
228                basicAS.add( obj.getStart(),
229                             obj.getEnd(),
230                             obj.getElemName(),
231                             obj.getFM()
232                            );
233             }else{
234               String annotationType =
235                      (String) markupElementsMap.get(obj.getElemName());
236               if (annotationType != null)
237                  basicAS.add( obj.getStart(),
238                               obj.getEnd(),
239                               annotationType,
240                               obj.getFM()
241                              );
242             }
243           }catch (InvalidOffsetException e){
244               Err.prln("Error creating an annot :" + obj + " Discarded...");
245           }// end try
246 //        }// end if
247       }//while
248 
249       // notify the listener about the total amount of elements that
250       // has been processed
251       fireStatusChangedEvent("Total elements : " + elements);
252 
253     }//else
254 
255   }//handleEndTag
256 
257   /** This method is called when the HTML parser encounts an empty tag
258     */
259   public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos){
260     // fire the status listener if the elements processed exceded the rate
261     if ((++elements % ELEMENTS_RATE) == 0)
262       fireStatusChangedEvent("Processed elements : " + elements);
263 
264     // construct a feature map from the attributes list
265     // these are empty elements
266     FeatureMap fm = Factory.newFeatureMap();
267 
268     // take all the attributes an put them into the feature map
269     if (0 != a.getAttributeCount ()){
270 
271        // Out.println("HAS  attributes = " + a.getAttributeCount ());
272         Enumeration enum = a.getAttributeNames ();
273         while (enum.hasMoreElements ()){
274           Object attribute = enum.nextElement ();
275           fm.put ( attribute.toString(),(a.getAttribute(attribute)).toString());
276 
277         }//while
278 
279     }//if
280 
281     // create the start index of the annotation
282     Long startIndex = new Long(tmpDocContent.length());
283 
284     // initialy the start index is equal with the End index
285     CustomObject obj = new CustomObject(t.toString(),fm,startIndex,startIndex);
286 
287     // we add the object directly into the colector
288     // we don't add it to the stack because this is an empty tag
289     colector.add(obj);
290 
291     // Just analize the tag t and add some\n chars and spaces to the
292     // tmpDocContent.The reason behind is that we need to have a readable form
293     // for the final document.
294     customizeAppearanceOfDocumentWithSimpleTag(t);
295 
296   } // handleSimpleTag
297 
298   /** This method is called when the HTML parser encounts text (PCDATA)
299     */
300   public void handleText(char[] text, int pos){
301 
302     // Skip the STYLE tag content
303     if(isInsideStyleTag) return;
304 
305     // create a string object based on the reported text
306     String content = new String(text);
307 
308     // remove the difference between JDK 1.3 and JDK 1.4
309     String trimContent = content.trim();
310     if(trimContent.length() == 0) {
311       return;
312     } // if
313 
314     int trimCorrection = content.indexOf(trimContent.charAt(0));
315     content = trimContent;
316 
317     StringBuffer contentBuffer = new StringBuffer("");
318     int tmpDocContentSize = tmpDocContent.length();
319     boolean incrementStartIndex = false;
320     // If the first char of the text just read "text[0]" is NOT whitespace AND
321     // the last char of the tmpDocContent[SIZE-1] is NOT whitespace then
322     // concatenation "tmpDocContent + content" will result into a new different
323     // word... and we want to avoid that...
324     if ( tmpDocContentSize != 0 &&
325          content.length() != 0 &&
326          !Character.isWhitespace(content.charAt(0)) &&
327          !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))){
328 
329             contentBuffer.append(" ");
330             incrementStartIndex = true;
331     }// End if
332     // update the document content
333 
334     // put the repositioning information
335     if(reposInfo != null) {
336       int extractedPos = tmpDocContent.length() + contentBuffer.length();
337       addRepositioningInfo(content, pos + trimCorrection, extractedPos);
338     } // if
339 
340     contentBuffer.append(content);
341     // calculate the End index for all the elements of the stack
342     // the expression is : End index = Current doc length + text length
343     Long end = new Long(tmpDocContent.length() + contentBuffer.length());
344 
345     CustomObject obj = null;
346     // Iterate through stack to modify the End index of the existing elements
347 
348     java.util.Iterator anIterator = stack.iterator();
349     while (anIterator.hasNext ()){
350       // get the object and move to the next one
351       obj = (CustomObject) anIterator.next ();
352       if (incrementStartIndex && obj.getStart().equals(obj.getEnd())){
353         obj.setStart(new Long(obj.getStart().longValue() + 1));
354       }// End if
355       // sets its End index
356       obj.setEnd(end);
357     }// End while
358 
359     tmpDocContent.append(contentBuffer.toString());
360   }// end handleText();
361 
362   /** For given content the list with shrink position information is searched
363    *  and on the corresponding positions the correct repositioning information
364    *  is calculated and generated.
365    */
366   public void addRepositioningInfo(String content, int pos, int extractedPos) {
367     int contentLength = content.length();
368 
369     // wrong way (without correction and analysing)
370    //reposInfo.addPositionInfo(pos, contentLength, extractedPos, contentLength);
371 
372     RepositioningInfo.PositionInfo pi = null;
373     long startPos = pos;
374     long correction = 0;
375     long substituteStart;
376     long remainingLen;
377     long offsetInExtracted;
378 
379     for(int i = 0; i < ampCodingInfo.size(); ++i) {
380       pi = (RepositioningInfo.PositionInfo) ampCodingInfo.get(i);
381       substituteStart = pi.getOriginalPosition();
382 
383       if(substituteStart >= startPos) {
384         if(substituteStart > pos + contentLength + correction) {
385           break; // outside the current text
386         } // if
387 
388         // should create two repositioning information records
389         remainingLen = substituteStart - (startPos + correction);
390         offsetInExtracted = startPos - pos;
391         if(remainingLen > 0) {
392           reposInfo.addPositionInfo(startPos + correction, remainingLen,
393                             extractedPos + offsetInExtracted, remainingLen);
394         } // if
395         // record for shrank text
396         reposInfo.addPositionInfo(substituteStart, pi.getOriginalLength(),
397                           extractedPos + offsetInExtracted + remainingLen,
398                           pi.getCurrentLength());
399         startPos = startPos + remainingLen + pi.getCurrentLength();
400         correction += pi.getOriginalLength() - pi.getCurrentLength();
401       } // if
402     } // for
403 
404     // there is some text remaining for repositioning
405     offsetInExtracted = startPos - pos;
406     remainingLen = contentLength - offsetInExtracted;
407     if(remainingLen > 0) {
408       reposInfo.addPositionInfo(startPos + correction, remainingLen,
409                         extractedPos + offsetInExtracted, remainingLen);
410     } // if
411   } // addRepositioningInfo
412 
413   /** This method analizes the tag t and adds some \n chars and spaces to the
414     * tmpDocContent.The reason behind is that we need to have a readable form
415     * for the final document. This method modifies the content of tmpDocContent.
416     * @param t the Html tag encounted by the HTML parser
417     */
418   protected void customizeAppearanceOfDocumentWithSimpleTag(HTML.Tag t){
419     boolean modification = false;
420     // if the HTML tag is BR then we add a new line character to the document
421     if (HTML.Tag.BR == t){
422       tmpDocContent.append("\n");
423       modification = true;
424     }// End if
425     if (modification == true){
426       Long end = new Long (tmpDocContent.length());
427       java.util.Iterator anIterator = stack.iterator();
428       while (anIterator.hasNext ()){
429         // get the object and move to the next one
430         CustomObject obj = (CustomObject) anIterator.next();
431         // sets its End index
432         obj.setEnd(end);
433       }// End while
434     }//End if
435   }// customizeAppearanceOfDocumentWithSimpleTag
436 
437   /** This method analizes the tag t and adds some \n chars and spaces to the
438     * tmpDocContent.The reason behind is that we need to have a readable form
439     * for the final document. This method modifies the content of tmpDocContent.
440     * @param t the Html tag encounted by the HTML parser
441     */
442   protected void customizeAppearanceOfDocumentWithStartTag(HTML.Tag t){
443     boolean modification = false;
444     if (HTML.Tag.P == t){
445       int tmpDocContentSize = tmpDocContent.length();
446       if ( tmpDocContentSize >= 2 &&
447            '\n' != tmpDocContent.charAt(tmpDocContentSize - 2)
448          ) { tmpDocContent.append("\n"); modification = true;}
449     }// End if
450     if (modification == true){
451       Long end = new Long (tmpDocContent.length());
452       java.util.Iterator anIterator = stack.iterator();
453       while (anIterator.hasNext ()){
454         // get the object and move to the next one
455         CustomObject obj = (CustomObject) anIterator.next();
456         // sets its End index
457         obj.setEnd(end);
458       }// End while
459     }//End if
460   }// customizeAppearanceOfDocumentWithStartTag
461 
462   /** This method analizes the tag t and adds some \n chars and spaces to the
463     * tmpDocContent.The reason behind is that we need to have a readable form
464     * for the final document. This method modifies the content of tmpDocContent.
465     * @param t the Html tag encounted by the HTML parser
466     */
467   protected void customizeAppearanceOfDocumentWithEndTag(HTML.Tag t){
468     boolean modification = false;
469     // if the HTML tag is BR then we add a new line character to the document
470     if ( (HTML.Tag.P == t) ||
471 
472          (HTML.Tag.H1 == t) ||
473          (HTML.Tag.H2 == t) ||
474          (HTML.Tag.H3 == t) ||
475          (HTML.Tag.H4 == t) ||
476          (HTML.Tag.H5 == t) ||
477          (HTML.Tag.H6 == t) ||
478          (HTML.Tag.TR == t) ||
479          (HTML.Tag.CENTER == t) ||
480          (HTML.Tag.LI == t)
481        ){ tmpDocContent.append("\n"); modification = true;}
482 
483     if (HTML.Tag.TITLE == t){
484       tmpDocContent.append("\n\n");
485       modification = true;
486     }// End if
487 
488     if (modification == true){
489       Long end = new Long (tmpDocContent.length());
490       java.util.Iterator anIterator = stack.iterator();
491       while (anIterator.hasNext ()){
492         // get the object and move to the next one
493         CustomObject obj = (CustomObject) anIterator.next();
494         // sets its End index
495         obj.setEnd(end);
496       }// End while
497     }//End if
498   }// customizeAppearanceOfDocumentWithEndTag
499 
500   /**
501     * This method is called when the HTML parser encounts an error
502     * it depends on the programmer if he wants to deal with that error
503     */
504   public void handleError(String errorMsg, int pos) {
505     //Out.println ("ERROR CALLED : " + errorMsg);
506   }
507 
508   /** This method is called once, when the HTML parser reaches the end
509     * of its input streamin order to notify the parserCallback that there
510     * is nothing more to parse.
511     */
512   public void flush() throws BadLocationException{
513   }// flush
514 
515   /** This method is called when the HTML parser encounts a comment
516     */
517   public void handleComment(char[] text, int pos) {
518   }
519 
520   //StatusReporter Implementation
521 
522   public void addStatusListener(StatusListener listener) {
523     myStatusListeners.add(listener);
524   }
525 
526   public void removeStatusListener(StatusListener listener) {
527     myStatusListeners.remove(listener);
528   }
529 
530   protected void fireStatusChangedEvent(String text) {
531     Iterator listenersIter = myStatusListeners.iterator();
532     while(listenersIter.hasNext())
533       ((StatusListener)listenersIter.next()).statusChanged(text);
534   }
535 
536   /**
537     * This method verifies if data contained by the CustomObject can be used
538     * to create a GATE annotation.
539     */
540 /*  private boolean canCreateAnnotation(CustomObject aCustomObject){
541     long start            = aCustomObject.getStart().longValue();
542     long end              = aCustomObject.getEnd().longValue();
543     long gateDocumentSize = doc.getContent().size().longValue();
544 
545     if (start < 0 || end < 0 ) return false;
546     if (start > end ) return false;
547     if ((start > gateDocumentSize) || (end > gateDocumentSize)) return false;
548     return true;
549   }// canCreateAnnotation
550 */
551 
552   // HtmlDocumentHandler member data
553 
554   // this constant indicates when to fire the status listener
555   // this listener will add an overhead and we don't want a big overhead
556   // this listener will be callled from ELEMENTS_RATE to ELEMENTS_RATE
557   final static  int ELEMENTS_RATE = 128;
558 
559   // this map contains the elements name that we want to create
560   // if it's null all the elements from the HTML documents will be transformed
561   // into Gate annotation objects otherwise only the elements it contains will
562   // be transformed
563   private Map markupElementsMap = null;
564 
565   // the content of the HTML document, without any tag
566   // for internal use
567   private StringBuffer tmpDocContent = null;
568 
569   // a stack used to remember elements and to keep the order
570   private java.util.Stack stack = null;
571 
572   // a gate document
573   private gate.Document doc = null;
574 
575   // an annotation set used for creating annotation reffering the doc
576   private gate.AnnotationSet basicAS;
577 
578   // listeners for status report
579   protected List myStatusListeners = new LinkedList();
580 
581   // this reports the the number of elements that have beed processed so far
582   private int elements = 0;
583 
584   protected  long customObjectsId = 0;
585   // we need a colection to retain all the CustomObjects that will be
586   // transformed into annotation over the gate document...
587   // the transformation will take place inside onDocumentEnd() method
588   private LinkedList colector = null;
589 
590   // Inner class
591   /**
592     * The objects belonging to this class are used inside the stack.
593     * This class is for internal needs
594     */
595   class  CustomObject implements Comparable {
596 
597     // constructor
598     public CustomObject(String anElemName, FeatureMap aFm,
599                            Long aStart, Long anEnd) {
600       elemName = anElemName;
601       fm = aFm;
602       start = aStart;
603       end = anEnd;
604       id = new Long(customObjectsId ++);
605     }// End CustomObject()
606 
607     // Methos implemented as required by Comparable interface
608     public int compareTo(Object o){
609       CustomObject obj = (CustomObject) o;
610       return this.id.compareTo(obj.getId());
611     }// compareTo();
612 
613     // accesor
614     public String getElemName() {
615       return elemName;
616     }// getElemName()
617 
618     public FeatureMap getFM() {
619       return fm;
620     }// getFM()
621 
622     public Long getStart() {
623       return start;
624     }// getStart()
625 
626     public Long getEnd() {
627       return end;
628     }// getEnd()
629 
630     public Long getId(){ return id;}
631 
632     // mutator
633     public void setElemName(String anElemName) {
634       elemName = anElemName;
635     }// getElemName()
636 
637     public void setFM(FeatureMap aFm) {
638       fm = aFm;
639     }// setFM();
640 
641     public void setStart(Long aStart) {
642       start = aStart;
643     }// setStart();
644 
645     public void setEnd(Long anEnd) {
646       end = anEnd;
647     }// setEnd();
648 
649     // data fields
650     private String elemName = null;
651     private FeatureMap fm = null;
652     private Long start = null;
653     private Long end  = null;
654     private Long id = null;
655 
656   } // End inner class CustomObject
657 
658 }//End class HtmlDocumentHandler
659 
660 
661 
662