1   /*
2    *  XmlDocumentHandler.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU,  9 May 2000
12   *
13   *  $Id: XmlDocumentHandler.java,v 1.40 2002/03/19 14:47:33 nasso Exp $
14   */
15  
16  package gate.xml;
17  
18  import java.util.*;
19  
20  import gate.corpora.*;
21  import gate.util.*;
22  import gate.*;
23  import gate.event.*;
24  
25  
26  import org.xml.sax.*;
27  import org.xml.sax.helpers.*;
28  
29  
/**
  * Implements the behaviour of the XML reader.
  * The methods of an object of this class are called by the SAX parser as
  * parsing events occur.
  * The idea is to parse the XML document and construct Gate annotation
  * objects.
  * This class also replaces the content of the Gate document with a
  * new one containing only the text from the XML document.
  */
39  public class XmlDocumentHandler extends XmlPositionCorrectionHandler {
40    /** Debug flag */
41    private static final boolean DEBUG = false;
42  
43    /** Keep the refference to this structure */
44    private RepositioningInfo reposInfo = null;
45  
46    /** Keep the refference to this structure */
47    private RepositioningInfo ampCodingInfo = null;
48  
49    /** Set repositioning information structure refference. If you set this
50     *  refference to <B>null</B> information wouldn't be collected.
51     */
52    public void setRepositioningInfo(RepositioningInfo info) {
53      reposInfo = info;
54    } // setRepositioningInfo
55  
56    /** Return current RepositioningInfo object */
57    public RepositioningInfo getRepositioningInfo() {
58      return reposInfo;
59    } // getRepositioningInfo
60  
61    /** Set repositioning information structure refference for ampersand coding.
62     *  If you set this refference to <B>null</B> information wouldn't be used.
63     */
64    public void setAmpCodingInfo(RepositioningInfo info) {
65      ampCodingInfo = info;
66    } // setRepositioningInfo
67  
68    /** Return current RepositioningInfo object for ampersand coding. */
69    public RepositioningInfo getAmpCodingInfo() {
70      return ampCodingInfo;
71    } // getRepositioningInfo
72  
73    /**
74      * Constructs a XmlDocumentHandler object. The annotationSet set will be the
75      * default one taken from the gate document.
76      * @param aDocument the Gate document that will be processed.
77      * @param aMarkupElementsMap this map contains the elements name that we
78      * want to create.
79      * @param anElement2StringMap this map contains the strings that will be
80      * added to the text contained by the key element.
81      */
82    public XmlDocumentHandler(gate.Document aDocument, Map  aMarkupElementsMap,
83                              Map anElement2StringMap){
84      this(aDocument,aMarkupElementsMap,anElement2StringMap,null);
85    } // XmlDocumentHandler
86  
87    /**
88      * Constructs a XmlDocumentHandler object.
89      * @param aDocument the Gate document that will be processed.
90      * @param aMarkupElementsMap this map contains the elements name that we
91      * want to create.
92      * @param anElement2StringMap this map contains the strings that will be
93      * added to the text contained by the key element.
94      * @param anAnnotationSet is the annotation set that will be filled when the
95      * document was processed
96      */
97    public XmlDocumentHandler(gate.Document       aDocument,
98                              Map                 aMarkupElementsMap,
99                              Map                 anElement2StringMap,
100                             gate.AnnotationSet  anAnnotationSet){
101     // init parent
102     super();
103     // init stack
104     stack = new java.util.Stack();
105 
106     // this string contains the plain text (the text without markup)
107     tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue());
108 
109     // colector is used later to transform all custom objects into annotation
110     // objects
111     colector = new LinkedList();
112 
113     // the Gate document
114     doc = aDocument;
115 
116     // this map contains the elements name that we want to create
117     // if it's null all the elements from the XML documents will be transformed
118     // into Gate annotation objects
119     markupElementsMap = aMarkupElementsMap;
120 
121     // this map contains the string that we want to insert iside the document
122     // content, when a certain element is found
123     // if the map is null then no string is added
124     element2StringMap = anElement2StringMap;
125 
126     basicAS = anAnnotationSet;
127     customObjectsId = 0;
128   }// XmlDocumentHandler()/
129 
130   /**
131     * This method is called when the SAX parser encounts the beginning of the
132     * XML document.
133     */
134   public void startDocument() throws org.xml.sax.SAXException {
135     // init of variables in the parent
136     super.startDocument();
137   }
138 
139   /**
140     * This method is called when the SAX parser encounts the end of the
141     * XML document.
142     * Here we set the content of the gate Document to be the one generated
143     * inside this class (tmpDocContent).
144     * After that we use the colector to generate all the annotation reffering
145     * this new gate document.
146     */
147   public void endDocument() throws org.xml.sax.SAXException {
148 
149     // replace the document content with the one without markups
150     doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));
151 
152     // fire the status listener
153     fireStatusChangedEvent("Total elements: " + elements);
154 
155     // If basicAs is null then get the default AnnotationSet,
156     // based on the gate document.
157     if (basicAS == null)
158       basicAS=doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
159 
160     // sort colector ascending on its id
161     Collections.sort(colector);
162     Set testIdsSet = new HashSet();
163     // create all the annotations (on this new document) from the collector
164     while (!colector.isEmpty()){
165       CustomObject obj = (CustomObject) colector.getFirst();
166       // Test to see if there are two annotation objects with the same id.
167       if (testIdsSet.contains(obj.getId())){
168         throw new GateSaxException("Found two annotations with the same Id("+
169         obj.getId()+
170         ").The document is inconsistent.");
171       }else{
172         testIdsSet.add(obj.getId());
173       }// End iff
174       // create a new annotation and add it to the annotation set
175       try{
176         // the annotation type will be conforming with markupElementsMap
177         //add the annotation to the Annotation Set
178         if (markupElementsMap == null)
179           basicAS.add(  obj.getId(),
180                         obj.getStart(),
181                         obj.getEnd(),
182                         obj.getElemName(),
183                         obj.getFM ());
184         else {
185           // get the type of the annotation from Map
186           String annotationType = (String)
187                                 markupElementsMap.get(obj.getElemName());
188           if (annotationType != null)
189             basicAS.add( obj.getId(),
190                          obj.getStart(),
191                          obj.getEnd(),
192                          annotationType,
193                          obj.getFM());
194         }// End if
195       }catch (gate.util.InvalidOffsetException e){
196         Err.prln("InvalidOffsetException for annot :" + obj.getElemName() +
197          " with Id =" + obj.getId() + ". Discarded...");
198       }// End try
199       colector.remove(obj);
200     }// End while
201   }// endDocument();
202 
203   /**
204     * This method is called when the SAX parser encounts the beginning of an
205     * XML element.
206     */
207   public void startElement (String uri, String qName, String elemName,
208                                                              Attributes atts){
209     // Inform the progress listener to fire only if no of elements processed
210     // so far is a multiple of ELEMENTS_RATE
211     if ((++elements % ELEMENTS_RATE) == 0)
212         fireStatusChangedEvent("Processed elements : " + elements);
213 
214     Integer customObjectId = null;
215     // Construct a SimpleFeatureMapImpl from the list of attributes
216     FeatureMap fm = Factory.newFeatureMap();
217     //Get the name and the value of the attributes and add them to a FeaturesMAP
218     for (int i = 0; i < atts.getLength(); i++) {
219       String attName  = atts.getLocalName(i);
220       String attValue = atts.getValue(i);
221       String attUri =   atts.getURI(i);
222       if (attUri != null && Gate.URI.equals(attUri)){
223         if ("gateId".equals(attName)){
224           customObjectId = new Integer(attValue);
225         }// End if
226         if ("annotMaxId".equals(attName)){
227           customObjectsId = new Integer(attValue).intValue();
228         }// End if
229         if ("matches".equals(attName)){
230           StringTokenizer strTokenizer = new StringTokenizer(attValue,";");
231           List list = new ArrayList();
232           // Take all tokens,create Integers and add them to the list
233           while (strTokenizer.hasMoreTokens()){
234             String token = strTokenizer.nextToken();
235             list.add(new Integer(token));
236           }// End while
237           fm.put(attName,list);
238         }// End if
239       }else{
240         fm.put(attName,attValue);
241       }// End if
242     }// End for
243 
244     // create the START index of the annotation
245     Long startIndex = new Long(tmpDocContent.length());
246 
247     // initialy the Start index is equal with End index
248     CustomObject obj = new CustomObject(customObjectId,elemName,fm,
249                                                  startIndex, startIndex);
250 
251     // put this object into the stack
252     stack.push(obj);
253   }// startElement();
254 
255   /**
256     * This method is called when the SAX parser encounts the end of an
257     * XML element.
258     * Here we extract
259     */
260   public void endElement (String uri, String qName, String elemName )
261                                                          throws SAXException{
262     // obj is for internal use
263     CustomObject obj = null;
264 
265     // if the stack is not empty, we extract the custom object and delete it
266     if (!stack.isEmpty ()){
267       obj = (CustomObject) stack.pop();
268     }// End if
269 
270     // Before adding it to the colector, we need to check if is an
271     // emptyAndSpan one. See CustomObject's isEmptyAndSpan field.
272     if (obj.getStart().equals(obj.getEnd())){
273       // The element had an end tag and its start was equal to its end. Hence
274       // it is anEmptyAndSpan one.
275       obj.getFM().put("isEmptyAndSpan","true");
276     }// End iff
277 
278     // Put the object into colector
279     // Later, when the document ends we will use colector to create all the
280     // annotations
281     colector.add(obj);
282 
283     // if element is found on Element2String map, then add the string to the
284     // end of the document content
285     if (element2StringMap != null){
286       String stringFromMap = null;
287 
288       // test to see if element is inside the map
289       // if it is then get the string value and add it to the document content
290       stringFromMap = (String) element2StringMap.get(elemName);
291       if (stringFromMap != null)
292           tmpDocContent.append(stringFromMap);
293     }// End if
294   }// endElement();
295 
296   /**
297     * This method is called when the SAX parser encounts text in the XML doc.
298     * Here we calculate the end indices for all the elements present inside the
299     * stack and update with the new values. For entities, this method is called
300     * separatley regardless of the text sourinding the entity.
301     */
302   public void characters( char[] text,int start,int length) throws SAXException{
303     // correction of real offset. Didn't affect on other data.
304     super.characters(text, start, length);
305     // create a string object based on the reported text
306     String content = new String(text, start, length);
307     StringBuffer contentBuffer = new StringBuffer("");
308     int tmpDocContentSize = tmpDocContent.length();
309     boolean incrementStartIndex = false;
310     // If the first char of the text just read "text[0]" is NOT whitespace AND
311     // the last char of the tmpDocContent[SIZE-1] is NOT whitespace then
312     // concatenation "tmpDocContent + content" will result into a new different
313     // word... and we want to avoid that, because the tokenizer, gazetter and
314     // Jape work on the raw text and concatenating tokens might be not good.
315     if ( tmpDocContentSize != 0 &&
316          content.length() != 0 &&
317          !Character.isWhitespace(content.charAt(0)) &&
318          !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))){
319 
320          // If we are here it means that a concatenation between the last
321          // token in the tmpDocContent and the content(which doesn't start
322          // with a white space) will be performed. In order to prevent this,
323          // we will add a " " space char in order to assure that the 2 tokens
324          // stay apart. Howerver we will except from this rule the most known
325          // internal entities like &, <, >, etc
326          if (
327               (
328                  // Testing the length against 1 makes it more likely that
329                  // an internal entity was called. characters() gets called for
330                  // each entity separately.
331                  (content.length() == 1)
332                   &&
333                  (content.charAt(0) == '&' ||
334                   content.charAt(0) == '<' ||
335                   content.charAt(0) == '>' ||
336                   content.charAt(0) == '"' ||
337                   content.charAt(0) == '\''
338                   )
339                ) ||
340                (tmpDocContent.charAt(tmpDocContentSize - 1) == '&' ||
341                 tmpDocContent.charAt(tmpDocContentSize - 1) == '<' ||
342                 tmpDocContent.charAt(tmpDocContentSize - 1) == '>' ||
343                 tmpDocContent.charAt(tmpDocContentSize - 1) == '"' ||
344                 tmpDocContent.charAt(tmpDocContentSize - 1) == '\''
345                )){// do nothing. The content will be appended
346          }else{
347             // In all other cases append " "
348             contentBuffer.append(" ");
349             incrementStartIndex = true;
350         }// End if
351     }// End if
352 
353     // put the repositioning information
354     if(reposInfo != null) {
355       if(! (start == 0 && length == 1 && text.length <= 2)) {
356         // normal piece of text
357         reposInfo.addPositionInfo(getRealOffset(), content.length(),
358                       tmpDocContent.length()+contentBuffer.length(),
359                       content.length());
360         if(DEBUG) {
361           Out.println("Info: "+getRealOffset()+", "+content.length());
362           Out.println("Start: "+start+" len"+length);
363         } // DEBUG
364       }
365       else {
366         // unicode char or &xxx; coding
367         // Reported from the parser offset is 0
368         // The real offset should be found in the ampCodingInfo structure.
369 
370         long lastPosition = 0;
371         RepositioningInfo.PositionInfo pi;
372 
373         if(reposInfo.size() > 0) {
374           pi =
375             (RepositioningInfo.PositionInfo) reposInfo.get(reposInfo.size()-1);
376           lastPosition = pi.getOriginalPosition();
377         } // if
378 
379         for(int i = 0; i < ampCodingInfo.size(); ++i) {
380           pi = (RepositioningInfo.PositionInfo) ampCodingInfo.get(i);
381           if(pi.getOriginalPosition() > lastPosition) {
382             // found
383             reposInfo.addPositionInfo(pi.getOriginalPosition(),
384                           pi.getOriginalLength(),
385                           tmpDocContent.length()+contentBuffer.length(),
386                           content.length());
387             break;
388           } // if
389         } // for
390       } // if
391     } // if
392 
393     // update the document content
394     contentBuffer.append(content);
395     // calculate the End index for all the elements of the stack
396     // the expression is : End index = Current doc length + text length
397     Long end = new Long(tmpDocContent.length() + contentBuffer.length());
398 
399     CustomObject obj = null;
400     // Iterate through stack to modify the End index of the existing elements
401 
402     java.util.Iterator anIterator = stack.iterator();
403     while (anIterator.hasNext ()){
404       // get the object and move to the next one
405       obj = (CustomObject) anIterator.next ();
406       if (incrementStartIndex && obj.getStart().equals(obj.getEnd())){
407         obj.setStart(new Long(obj.getStart().longValue() + 1));
408       }// End if
409       // sets its End index
410       obj.setEnd(end);
411     }// End while
412 
413     tmpDocContent.append(contentBuffer.toString());
414   }// characters();
415 
416   /**
417     * This method is called when the SAX parser encounts white spaces
418     */
419   public void ignorableWhitespace(char ch[],int start,int length) throws
420                                                                    SAXException{
421 
422     // internal String object
423     String  text = new String(ch, start, length);
424     // if the last character in tmpDocContent is \n and the read whitespace is
425     // \n then don't add it to tmpDocContent...
426 
427     if (tmpDocContent.length () != 0)
428       if (tmpDocContent.charAt (tmpDocContent.length () - 1) != '\n' ||
429         !text.equalsIgnoreCase("\n")
430       )
431          tmpDocContent.append(text);
432   }
433 
434   /**
435     * Error method.We deal with this exception inside SimpleErrorHandler class
436     */
437   public void error(SAXParseException ex) throws SAXException {
438     // deal with a SAXParseException
439     // see SimpleErrorhandler class
440     _seh.error(ex);
441   }
442 
443   /**
444     * FatalError method.
445     */
446   public void fatalError(SAXParseException ex) throws SAXException {
447     // deal with a SAXParseException
448     // see SimpleErrorhandler class
449     _seh.fatalError(ex);
450   }
451 
452   /**
453     * Warning method comment.
454     */
455   public void warning(SAXParseException ex) throws SAXException {
456     // deal with a SAXParseException
457     // see SimpleErrorhandler class
458     _seh.warning(ex);
459   }
460 
461   /**
462     * This method is called when the SAX parser encounts a comment
463     * It works only if the XmlDocumentHandler implements a
464     * com.sun.parser.LexicalEventListener
465     */
466   public void comment(String text) throws SAXException {
467     // create a FeatureMap and then add the comment to the annotation set.
468     /*
469     gate.util.SimpleFeatureMapImpl fm = new gate.util.SimpleFeatureMapImpl();
470     fm.put ("text_comment",text);
471     Long node = new Long (tmpDocContent.length());
472     CustomObject anObject = new CustomObject("Comment",fm,node,node);
473     colector.add(anObject);
474     */
475   }
476 
477   /**
478     * This method is called when the SAX parser encounts a start of a CDATA
479     * section
480     * It works only if the XmlDocumentHandler implements a
481     * com.sun.parser.LexicalEventListener
482     */
483   public void startCDATA()throws SAXException {
484   }
485 
486   /**
487     * This method is called when the SAX parser encounts the end of a CDATA
488     * section.
489     * It works only if the XmlDocumentHandler implements a
490     * com.sun.parser.LexicalEventListener
491     */
492   public void endCDATA() throws SAXException {
493   }
494 
495   /**
496     * This method is called when the SAX parser encounts a parsed Entity
497     * It works only if the XmlDocumentHandler implements a
498     * com.sun.parser.LexicalEventListener
499     */
500   public void startParsedEntity(String name) throws SAXException {
501   }
502 
503   /**
504     * This method is called when the SAX parser encounts a parsed entity and
505     * informs the application if that entity was parsed or not
506     * It's working only if the CustomDocumentHandler implements a
507     *  com.sun.parser.LexicalEventListener
508     */
509   public void endParsedEntity(String name, boolean included)throws SAXException{
510   }
511 
512   //StatusReporter Implementation
513 
514   /**
515     * This methos is called when a listener is registered with this class
516     */
517   public void addStatusListener(StatusListener listener){
518     myStatusListeners.add(listener);
519   }
520   /**
521     * This methos is called when a listener is removed
522     */
523   public void removeStatusListener(StatusListener listener){
524     myStatusListeners.remove(listener);
525   }
526   /**
527     * This methos is called whenever we need to inform the listener about an
528     * event.
529   */
530   protected void fireStatusChangedEvent(String text){
531     Iterator listenersIter = myStatusListeners.iterator();
532     while(listenersIter.hasNext())
533       ((StatusListener)listenersIter.next()).statusChanged(text);
534   }
535 
536   /** This method is a workaround of the java 4 non namespace supporting parser
537     * It receives a qualified name and returns its local name.
538     * For eg. if it receives gate:gateId it will return gateId
539     */
540   private String getMyLocalName(String aQName){
541     if (aQName == null) return "";
542     StringTokenizer strToken = new StringTokenizer(aQName,":");
543     if (strToken.countTokens()<= 1) return aQName;
544     // The nr of tokens is >= than 2
545     // Skip the first token which is the QName
546     strToken.nextToken();
547     return strToken.nextToken();
548   }//getMyLocalName()
549 
550   /** Also a workaround for URI identifier. If the QName is gate it will return
551     *  GATE's. Otherwhise it will return the empty string
552     */
553   private String getMyURI(String aQName){
554     if (aQName == null) return "";
555     StringTokenizer strToken = new StringTokenizer(aQName,":");
556     if (strToken.countTokens()<= 1) return "";
557     // If first token is "gate" then return GATE's URI
558     if ("gate".equalsIgnoreCase(strToken.nextToken()))
559       return Gate.URI;
560     return "";
561   }// getMyURI()
562 
563   // XmlDocumentHandler member data
564 
565   // this constant indicates when to fire the status listener
566   // this listener will add an overhead and we don't want a big overhead
567   // this listener will be callled from ELEMENTS_RATE to ELEMENTS_RATE
568   final static  int ELEMENTS_RATE = 128;
569 
570   // this map contains the elements name that we want to create
571   // if it's null all the elements from the XML documents will be transformed
572   // into Gate annotation objects otherwise only the elements it contains will
573   // be transformed
574   private Map markupElementsMap = null;
575 
576   // this map contains the string that we want to insert iside the document
577   // content, when a certain element is found
578   // if the map is null then no string is added
579   private Map element2StringMap = null;
580 
581   /**This object inducates what to do when the parser encounts an error*/
582   private SimpleErrorHandler _seh = new SimpleErrorHandler();
583 
584   /**The content of the XML document, without any tag for internal use*/
585   private StringBuffer tmpDocContent = null;
586 
587   /**A stack used to remember elements and to keep the order */
588   private java.util.Stack stack = null;
589 
590   /**A gate document */
591   private gate.Document doc = null;
592 
593   /**An annotation set used for creating annotation reffering the doc */
594   private gate.AnnotationSet basicAS = null;
595 
596   /**Listeners for status report */
597   protected List myStatusListeners = new LinkedList();
598 
599   /**This reports the the number of elements that have beed processed so far*/
600   private int elements = 0;
601 
602   /** We need a colection to retain all the CustomObjects that will be
603     * transformed into annotation over the gate document...
604     * the transformation will take place inside onDocumentEnd() method
605     */
606   private LinkedList colector = null;
607 
608   /** This is used to generate unique Ids for the CustomObjects read*/
609   protected  int customObjectsId = 0;
610 
611   /** Accesor method for the customObjectsId field*/
612   public int getCustomObjectsId(){ return customObjectsId;}
613 
614   //////// INNER CLASS
615   /**
616     * The objects belonging to this class are used inside the stack.
617     * This class is for internal needs
618     */
619   class  CustomObject implements Comparable {
620 
621     // constructor
622     public CustomObject(Integer anId,String anElemName, FeatureMap aFm,
623                            Long aStart, Long anEnd) {
624       elemName = anElemName;
625       fm = aFm;
626       start = aStart;
627       end = anEnd;
628       if (anId == null){
629         id = new Integer(customObjectsId ++);
630       }else{
631         id = anId;
632         if (customObjectsId <= anId.intValue())
633           customObjectsId = anId.intValue() + 1 ;
634       }// End if
635     }// End CustomObject()
636 
637     // Methos implemented as required by Comparable interface
638     public int compareTo(Object o){
639       CustomObject obj = (CustomObject) o;
640       return this.id.compareTo(obj.getId());
641     }// compareTo();
642 
643     // accesor
644     public String getElemName() {
645       return elemName;
646     }// getElemName()
647 
648     public FeatureMap getFM() {
649       return fm;
650     }// getFM()
651 
652     public Long getStart() {
653       return start;
654     }// getStart()
655 
656     public Long getEnd() {
657       return end;
658     }// getEnd()
659 
660     public Integer getId(){ return id;}
661 
662     // mutator
663     public void setElemName(String anElemName) {
664       elemName = anElemName;
665     }// getElemName()
666 
667     public void setFM(FeatureMap aFm) {
668       fm = aFm;
669     }// setFM();
670 
671     public void setStart(Long aStart) {
672       start = aStart;
673     }// setStart();
674 
675     public void setEnd(Long anEnd) {
676       end = anEnd;
677     }// setEnd();
678 
679     // data fields
680     private String elemName = null;
681     private FeatureMap fm = null;
682     private Long start = null;
683     private Long end  = null;
684     private Integer id = null;
685 
686   } // End inner class CustomObject
687 
688 } //XmlDocumentHandler
689 
690 
691 
692