1   /*
2    *  XmlDocumentHandler.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU,  9 May 2000
12   *
13   *  $Id: XmlDocumentHandler.java,v 1.43 2002/10/29 10:21:51 valyt Exp $
14   */
15  
16  package gate.xml;
17  
18  import java.util.*;
19  
20  import gate.corpora.*;
21  import gate.util.*;
22  import gate.*;
23  import gate.event.*;
24  
25  
26  import org.xml.sax.*;
27  import org.xml.sax.helpers.*;
28  
29  
30  /**
31    * Implements the behaviour of the XML reader
32    * Methods of an object of this class are called by the SAX parser when
33    * events will appear.
34    * The idea is to parse the XML document and construct Gate annotations
35    * objects.
36    * This class also will replace the content of the Gate document with a
37    * new one containing only text from the XML document.
38    */
39  public class XmlDocumentHandler extends XmlPositionCorrectionHandler {
40    /** Debug flag */
41    private static final boolean DEBUG = false;
42  
43    /** Keep the refference to this structure */
44    private RepositioningInfo reposInfo = null;
45  
46    /** Keep the refference to this structure */
47    private RepositioningInfo ampCodingInfo = null;
48  
49    /** Set repositioning information structure refference. If you set this
50     *  refference to <B>null</B> information wouldn't be collected.
51     */
52    public void setRepositioningInfo(RepositioningInfo info) {
53      reposInfo = info;
54    } // setRepositioningInfo
55  
56    /** Return current RepositioningInfo object */
57    public RepositioningInfo getRepositioningInfo() {
58      return reposInfo;
59    } // getRepositioningInfo
60  
61    /** Set repositioning information structure refference for ampersand coding.
62     *  If you set this refference to <B>null</B> information wouldn't be used.
63     */
64    public void setAmpCodingInfo(RepositioningInfo info) {
65      ampCodingInfo = info;
66    } // setRepositioningInfo
67  
68    /** Return current RepositioningInfo object for ampersand coding. */
69    public RepositioningInfo getAmpCodingInfo() {
70      return ampCodingInfo;
71    } // getRepositioningInfo
72  
73    /**
74      * Constructs a XmlDocumentHandler object. The annotationSet set will be the
75      * default one taken from the gate document.
76      * @param aDocument the Gate document that will be processed.
77      * @param aMarkupElementsMap this map contains the elements name that we
78      * want to create.
79      * @param anElement2StringMap this map contains the strings that will be
80      * added to the text contained by the key element.
81      */
82    public XmlDocumentHandler(gate.Document aDocument, Map  aMarkupElementsMap,
83                              Map anElement2StringMap){
84      this(aDocument,aMarkupElementsMap,anElement2StringMap,null);
85    } // XmlDocumentHandler
86  
87    /**
88      * Constructs a XmlDocumentHandler object.
89      * @param aDocument the Gate document that will be processed.
90      * @param aMarkupElementsMap this map contains the elements name that we
91      * want to create.
92      * @param anElement2StringMap this map contains the strings that will be
93      * added to the text contained by the key element.
94      * @param anAnnotationSet is the annotation set that will be filled when the
95      * document was processed
96      */
97    public XmlDocumentHandler(gate.Document       aDocument,
98                              Map                 aMarkupElementsMap,
99                              Map                 anElement2StringMap,
100                             gate.AnnotationSet  anAnnotationSet){
101     // init parent
102     super();
103     // init stack
104     stack = new java.util.Stack();
105 
106     // this string contains the plain text (the text without markup)
107     tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue());
108 
109     // colector is used later to transform all custom objects into annotation
110     // objects
111     colector = new LinkedList();
112 
113     // the Gate document
114     doc = aDocument;
115 
116     // this map contains the elements name that we want to create
117     // if it's null all the elements from the XML documents will be transformed
118     // into Gate annotation objects
119     markupElementsMap = aMarkupElementsMap;
120 
121     // this map contains the string that we want to insert iside the document
122     // content, when a certain element is found
123     // if the map is null then no string is added
124     element2StringMap = anElement2StringMap;
125 
126     basicAS = anAnnotationSet;
127     customObjectsId = 0;
128   }// XmlDocumentHandler()/
129 
130   /**
131     * This method is called when the SAX parser encounts the beginning of the
132     * XML document.
133     */
134   public void startDocument() throws org.xml.sax.SAXException {
135     // init of variables in the parent
136     super.startDocument();
137   }
138 
139   /**
140     * This method is called when the SAX parser encounts the end of the
141     * XML document.
142     * Here we set the content of the gate Document to be the one generated
143     * inside this class (tmpDocContent).
144     * After that we use the colector to generate all the annotation reffering
145     * this new gate document.
146     */
147   public void endDocument() throws org.xml.sax.SAXException {
148 
149     // replace the document content with the one without markups
150     doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));
151 
152     // fire the status listener
153     fireStatusChangedEvent("Total elements: " + elements);
154 
155     // If basicAs is null then get the default AnnotationSet,
156     // based on the gate document.
157     if (basicAS == null)
158       basicAS=doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
159 
160     // sort colector ascending on its id
161     Collections.sort(colector);
162     Set testIdsSet = new HashSet();
163     // create all the annotations (on this new document) from the collector
164     while (!colector.isEmpty()){
165       CustomObject obj = (CustomObject) colector.getFirst();
166       // Test to see if there are two annotation objects with the same id.
167       if (testIdsSet.contains(obj.getId())){
168         throw new GateSaxException("Found two annotations with the same Id("+
169         obj.getId()+
170         ").The document is inconsistent.");
171       }else{
172         testIdsSet.add(obj.getId());
173       }// End iff
174       // create a new annotation and add it to the annotation set
175       try{
176         // the annotation type will be conforming with markupElementsMap
177         //add the annotation to the Annotation Set
178         if (markupElementsMap == null)
179           basicAS.add(  obj.getId(),
180                         obj.getStart(),
181                         obj.getEnd(),
182                         obj.getElemName(),
183                         obj.getFM ());
184         else {
185           // get the type of the annotation from Map
186           String annotationType = (String)
187                                 markupElementsMap.get(obj.getElemName());
188           if (annotationType != null)
189             basicAS.add( obj.getId(),
190                          obj.getStart(),
191                          obj.getEnd(),
192                          annotationType,
193                          obj.getFM());
194         }// End if
195       }catch (gate.util.InvalidOffsetException e){
196         Err.prln("InvalidOffsetException for annot :" + obj.getElemName() +
197          " with Id =" + obj.getId() + ". Discarded...");
198       }// End try
199       colector.remove(obj);
200     }// End while
201   }// endDocument();
202 
203   /**
204     * This method is called when the SAX parser encounts the beginning of an
205     * XML element.
206     */
207   public void startElement (String uri, String qName, String elemName,
208                                                              Attributes atts){
209     // Inform the progress listener to fire only if no of elements processed
210     // so far is a multiple of ELEMENTS_RATE
211     if ((++elements % ELEMENTS_RATE) == 0)
212         fireStatusChangedEvent("Processed elements : " + elements);
213 
214     Integer customObjectId = null;
215     // Construct a SimpleFeatureMapImpl from the list of attributes
216     FeatureMap fm = Factory.newFeatureMap();
217     //Get the name and the value of the attributes and add them to a FeaturesMAP
218     for (int i = 0; i < atts.getLength(); i++) {
219       String attName  = atts.getLocalName(i);
220       String attValue = atts.getValue(i);
221       String attUri =   atts.getURI(i);
222       if (attUri != null && Gate.URI.equals(attUri)){
223         if ("gateId".equals(attName)){
224           customObjectId = new Integer(attValue);
225         }// End if
226         if ("annotMaxId".equals(attName)){
227           customObjectsId = new Integer(attValue).intValue();
228         }// End if
229         if ("matches".equals(attName)){
230           StringTokenizer strTokenizer = new StringTokenizer(attValue,";");
231           List list = new ArrayList();
232           // Take all tokens,create Integers and add them to the list
233           while (strTokenizer.hasMoreTokens()){
234             String token = strTokenizer.nextToken();
235             list.add(new Integer(token));
236           }// End while
237           fm.put(attName,list);
238         }// End if
239       }else{
240         fm.put(atts.getQName(i), attValue);
241       }// End if
242     }// End for
243 
244     // create the START index of the annotation
245     Long startIndex = new Long(tmpDocContent.length());
246 
247     // initialy the Start index is equal with End index
248     CustomObject obj = new CustomObject(customObjectId,elemName,fm,
249                                                  startIndex, startIndex);
250 
251     // put this object into the stack
252     stack.push(obj);
253   }// startElement();
254 
255   /**
256     * This method is called when the SAX parser encounts the end of an
257     * XML element.
258     * Here we extract
259     */
260   public void endElement (String uri, String qName, String elemName )
261                                                          throws SAXException{
262     // obj is for internal use
263     CustomObject obj = null;
264 
265     // if the stack is not empty, we extract the custom object and delete it
266     if (!stack.isEmpty ()){
267       obj = (CustomObject) stack.pop();
268     }// End if
269 
270     // Before adding it to the colector, we need to check if is an
271     // emptyAndSpan one. See CustomObject's isEmptyAndSpan field.
272     if (obj.getStart().equals(obj.getEnd())){
273       // The element had an end tag and its start was equal to its end. Hence
274       // it is anEmptyAndSpan one.
275       obj.getFM().put("isEmptyAndSpan","true");
276     }// End iff
277 
278     // Put the object into colector
279     // Later, when the document ends we will use colector to create all the
280     // annotations
281     colector.add(obj);
282 
283     // if element is found on Element2String map, then add the string to the
284     // end of the document content
285     if (element2StringMap != null){
286       String stringFromMap = null;
287 
288       // test to see if element is inside the map
289       // if it is then get the string value and add it to the document content
290       stringFromMap = (String) element2StringMap.get(elemName);
291       if (stringFromMap != null)
292           tmpDocContent.append(stringFromMap);
293     }// End if
294   }// endElement();
295 
296   /**
297     * This method is called when the SAX parser encounts text in the XML doc.
298     * Here we calculate the end indices for all the elements present inside the
299     * stack and update with the new values. For entities, this method is called
300     * separatley regardless of the text sourinding the entity.
301     */
302   public void characters( char[] text,int start,int length) throws SAXException{
303     // correction of real offset. Didn't affect on other data.
304     super.characters(text, start, length);
305     // create a string object based on the reported text
306     String content = new String(text, start, length);
307     StringBuffer contentBuffer = new StringBuffer("");
308     int tmpDocContentSize = tmpDocContent.length();
309     boolean incrementStartIndex = false;
310     boolean addExtraSpace = true;
311     if ( Gate.getUserConfig().get(
312           GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME)!= null)
313       addExtraSpace =
314         Gate.getUserConfig().getBoolean(
315           GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME
316         ).booleanValue();
317     // If the first char of the text just read "text[0]" is NOT whitespace AND
318     // the last char of the tmpDocContent[SIZE-1] is NOT whitespace then
319     // concatenation "tmpDocContent + content" will result into a new different
320     // word... and we want to avoid that, because the tokenizer, gazetter and
321     // Jape work on the raw text and concatenating tokens might be not good.
322     if ( tmpDocContentSize != 0 &&
323          content.length() != 0 &&
324          !Character.isWhitespace(content.charAt(0)) &&
325          !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))){
326 
327          // If we are here it means that a concatenation between the last
328          // token in the tmpDocContent and the content(which doesn't start
329          // with a white space) will be performed. In order to prevent this,
330          // we will add a " " space char in order to assure that the 2 tokens
331          // stay apart. Howerver we will except from this rule the most known
332          // internal entities like &, <, >, etc
333          if (
334               (
335                  // Testing the length against 1 makes it more likely that
336                  // an internal entity was called. characters() gets called for
337                  // each entity separately.
338                  (content.length() == 1)
339                   &&
340                  (content.charAt(0) == '&' ||
341                   content.charAt(0) == '<' ||
342                   content.charAt(0) == '>' ||
343                   content.charAt(0) == '"' ||
344                   content.charAt(0) == '\''
345                   )
346                ) ||
347                (tmpDocContent.charAt(tmpDocContentSize - 1) == '&' ||
348                 tmpDocContent.charAt(tmpDocContentSize - 1) == '<' ||
349                 tmpDocContent.charAt(tmpDocContentSize - 1) == '>' ||
350                 tmpDocContent.charAt(tmpDocContentSize - 1) == '"' ||
351                 tmpDocContent.charAt(tmpDocContentSize - 1) == '\''
352                )){// do nothing. The content will be appended
353          }else if (!addExtraSpace) {
354          }else
355            {
356             // In all other cases append " "
357             contentBuffer.append(" ");
358             incrementStartIndex = true;
359         }// End if
360     }// End if
361 
362     // put the repositioning information
363     if(reposInfo != null) {
364       if(! (start == 0 && length == 1 && text.length <= 2)) {
365         // normal piece of text
366         reposInfo.addPositionInfo(getRealOffset(), content.length(),
367                       tmpDocContent.length()+contentBuffer.length(),
368                       content.length());
369         if(DEBUG) {
370           Out.println("Info: "+getRealOffset()+", "+content.length());
371           Out.println("Start: "+start+" len"+length);
372         } // DEBUG
373       }
374       else {
375         // unicode char or &xxx; coding
376         // Reported from the parser offset is 0
377         // The real offset should be found in the ampCodingInfo structure.
378 
379         long lastPosition = 0;
380         RepositioningInfo.PositionInfo pi;
381 
382         if(reposInfo.size() > 0) {
383           pi =
384             (RepositioningInfo.PositionInfo) reposInfo.get(reposInfo.size()-1);
385           lastPosition = pi.getOriginalPosition();
386         } // if
387 
388         for(int i = 0; i < ampCodingInfo.size(); ++i) {
389           pi = (RepositioningInfo.PositionInfo) ampCodingInfo.get(i);
390           if(pi.getOriginalPosition() > lastPosition) {
391             // found
392             reposInfo.addPositionInfo(pi.getOriginalPosition(),
393                           pi.getOriginalLength(),
394                           tmpDocContent.length()+contentBuffer.length(),
395                           content.length());
396             break;
397           } // if
398         } // for
399       } // if
400     } // if
401 
402     // update the document content
403     contentBuffer.append(content);
404     // calculate the End index for all the elements of the stack
405     // the expression is : End index = Current doc length + text length
406     Long end = new Long(tmpDocContent.length() + contentBuffer.length());
407 
408     CustomObject obj = null;
409     // Iterate through stack to modify the End index of the existing elements
410 
411     java.util.Iterator anIterator = stack.iterator();
412     while (anIterator.hasNext ()){
413       // get the object and move to the next one
414       obj = (CustomObject) anIterator.next ();
415       if (incrementStartIndex && obj.getStart().equals(obj.getEnd())){
416         obj.setStart(new Long(obj.getStart().longValue() + 1));
417       }// End if
418       // sets its End index
419       obj.setEnd(end);
420     }// End while
421 
422     tmpDocContent.append(contentBuffer.toString());
423   }// characters();
424 
425   /**
426     * This method is called when the SAX parser encounts white spaces
427     */
428   public void ignorableWhitespace(char ch[],int start,int length) throws
429                                                                    SAXException{
430 
431     // internal String object
432     String  text = new String(ch, start, length);
433     // if the last character in tmpDocContent is \n and the read whitespace is
434     // \n then don't add it to tmpDocContent...
435 
436     if (tmpDocContent.length () != 0)
437       if (tmpDocContent.charAt (tmpDocContent.length () - 1) != '\n' ||
438         !text.equalsIgnoreCase("\n")
439       )
440          tmpDocContent.append(text);
441   }
442 
443   /**
444     * Error method.We deal with this exception inside SimpleErrorHandler class
445     */
446   public void error(SAXParseException ex) throws SAXException {
447     // deal with a SAXParseException
448     // see SimpleErrorhandler class
449     _seh.error(ex);
450   }
451 
452   /**
453     * FatalError method.
454     */
455   public void fatalError(SAXParseException ex) throws SAXException {
456     // deal with a SAXParseException
457     // see SimpleErrorhandler class
458     _seh.fatalError(ex);
459   }
460 
461   /**
462     * Warning method comment.
463     */
464   public void warning(SAXParseException ex) throws SAXException {
465     // deal with a SAXParseException
466     // see SimpleErrorhandler class
467     _seh.warning(ex);
468   }
469 
470   /**
471     * This method is called when the SAX parser encounts a comment
472     * It works only if the XmlDocumentHandler implements a
473     * com.sun.parser.LexicalEventListener
474     */
475   public void comment(String text) throws SAXException {
476     // create a FeatureMap and then add the comment to the annotation set.
477     /*
478     gate.util.SimpleFeatureMapImpl fm = new gate.util.SimpleFeatureMapImpl();
479     fm.put ("text_comment",text);
480     Long node = new Long (tmpDocContent.length());
481     CustomObject anObject = new CustomObject("Comment",fm,node,node);
482     colector.add(anObject);
483     */
484   }
485 
486   /**
487     * This method is called when the SAX parser encounts a start of a CDATA
488     * section
489     * It works only if the XmlDocumentHandler implements a
490     * com.sun.parser.LexicalEventListener
491     */
492   public void startCDATA()throws SAXException {
493   }
494 
495   /**
496     * This method is called when the SAX parser encounts the end of a CDATA
497     * section.
498     * It works only if the XmlDocumentHandler implements a
499     * com.sun.parser.LexicalEventListener
500     */
501   public void endCDATA() throws SAXException {
502   }
503 
504   /**
505     * This method is called when the SAX parser encounts a parsed Entity
506     * It works only if the XmlDocumentHandler implements a
507     * com.sun.parser.LexicalEventListener
508     */
509   public void startParsedEntity(String name) throws SAXException {
510   }
511 
512   /**
513     * This method is called when the SAX parser encounts a parsed entity and
514     * informs the application if that entity was parsed or not
515     * It's working only if the CustomDocumentHandler implements a
516     *  com.sun.parser.LexicalEventListener
517     */
518   public void endParsedEntity(String name, boolean included)throws SAXException{
519   }
520 
521   //StatusReporter Implementation
522 
523   /**
524     * This methos is called when a listener is registered with this class
525     */
526   public void addStatusListener(StatusListener listener){
527     myStatusListeners.add(listener);
528   }
529   /**
530     * This methos is called when a listener is removed
531     */
532   public void removeStatusListener(StatusListener listener){
533     myStatusListeners.remove(listener);
534   }
535   /**
536     * This methos is called whenever we need to inform the listener about an
537     * event.
538   */
539   protected void fireStatusChangedEvent(String text){
540     Iterator listenersIter = myStatusListeners.iterator();
541     while(listenersIter.hasNext())
542       ((StatusListener)listenersIter.next()).statusChanged(text);
543   }
544 
545   /** This method is a workaround of the java 4 non namespace supporting parser
546     * It receives a qualified name and returns its local name.
547     * For eg. if it receives gate:gateId it will return gateId
548     */
549   private String getMyLocalName(String aQName){
550     if (aQName == null) return "";
551     StringTokenizer strToken = new StringTokenizer(aQName,":");
552     if (strToken.countTokens()<= 1) return aQName;
553     // The nr of tokens is >= than 2
554     // Skip the first token which is the QName
555     strToken.nextToken();
556     return strToken.nextToken();
557   }//getMyLocalName()
558 
559   /** Also a workaround for URI identifier. If the QName is gate it will return
560     *  GATE's. Otherwhise it will return the empty string
561     */
562   private String getMyURI(String aQName){
563     if (aQName == null) return "";
564     StringTokenizer strToken = new StringTokenizer(aQName,":");
565     if (strToken.countTokens()<= 1) return "";
566     // If first token is "gate" then return GATE's URI
567     if ("gate".equalsIgnoreCase(strToken.nextToken()))
568       return Gate.URI;
569     return "";
570   }// getMyURI()
571 
572   // XmlDocumentHandler member data
573 
574   // this constant indicates when to fire the status listener
575   // this listener will add an overhead and we don't want a big overhead
576   // this listener will be callled from ELEMENTS_RATE to ELEMENTS_RATE
577   final static  int ELEMENTS_RATE = 128;
578 
579   // this map contains the elements name that we want to create
580   // if it's null all the elements from the XML documents will be transformed
581   // into Gate annotation objects otherwise only the elements it contains will
582   // be transformed
583   private Map markupElementsMap = null;
584 
585   // this map contains the string that we want to insert iside the document
586   // content, when a certain element is found
587   // if the map is null then no string is added
588   private Map element2StringMap = null;
589 
590   /**This object inducates what to do when the parser encounts an error*/
591   private SimpleErrorHandler _seh = new SimpleErrorHandler();
592 
593   /**The content of the XML document, without any tag for internal use*/
594   private StringBuffer tmpDocContent = null;
595 
596   /**A stack used to remember elements and to keep the order */
597   private java.util.Stack stack = null;
598 
599   /**A gate document */
600   private gate.Document doc = null;
601 
602   /**An annotation set used for creating annotation reffering the doc */
603   private gate.AnnotationSet basicAS = null;
604 
605   /**Listeners for status report */
606   protected List myStatusListeners = new LinkedList();
607 
608   /**This reports the the number of elements that have beed processed so far*/
609   private int elements = 0;
610 
611   /** We need a colection to retain all the CustomObjects that will be
612     * transformed into annotation over the gate document...
613     * the transformation will take place inside onDocumentEnd() method
614     */
615   private LinkedList colector = null;
616 
617   /** This is used to generate unique Ids for the CustomObjects read*/
618   protected  int customObjectsId = 0;
619 
620   /** Accesor method for the customObjectsId field*/
621   public int getCustomObjectsId(){ return customObjectsId;}
622 
623   //////// INNER CLASS
624   /**
625     * The objects belonging to this class are used inside the stack.
626     * This class is for internal needs
627     */
628   class  CustomObject implements Comparable {
629 
630     // constructor
631     public CustomObject(Integer anId,String anElemName, FeatureMap aFm,
632                            Long aStart, Long anEnd) {
633       elemName = anElemName;
634       fm = aFm;
635       start = aStart;
636       end = anEnd;
637       if (anId == null){
638         id = new Integer(customObjectsId ++);
639       }else{
640         id = anId;
641         if (customObjectsId <= anId.intValue())
642           customObjectsId = anId.intValue() + 1 ;
643       }// End if
644     }// End CustomObject()
645 
646     // Methos implemented as required by Comparable interface
647     public int compareTo(Object o){
648       CustomObject obj = (CustomObject) o;
649       return this.id.compareTo(obj.getId());
650     }// compareTo();
651 
652     // accesor
653     public String getElemName() {
654       return elemName;
655     }// getElemName()
656 
657     public FeatureMap getFM() {
658       return fm;
659     }// getFM()
660 
661     public Long getStart() {
662       return start;
663     }// getStart()
664 
665     public Long getEnd() {
666       return end;
667     }// getEnd()
668 
669     public Integer getId(){ return id;}
670 
671     // mutator
672     public void setElemName(String anElemName) {
673       elemName = anElemName;
674     }// getElemName()
675 
676     public void setFM(FeatureMap aFm) {
677       fm = aFm;
678     }// setFM();
679 
680     public void setStart(Long aStart) {
681       start = aStart;
682     }// setStart();
683 
684     public void setEnd(Long anEnd) {
685       end = anEnd;
686     }// setEnd();
687 
688     // data fields
689     private String elemName = null;
690     private FeatureMap fm = null;
691     private Long start = null;
692     private Long end  = null;
693     private Integer id = null;
694 
695   } // End inner class CustomObject
696 
697 } //XmlDocumentHandler
698 
699 
700 
701