1   /*
2    *  DocumentImpl.java
3    *
4    *  Copyright (c) 1998-2004, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Hamish Cunningham, 11/Feb/2000
12   *
13   *  $Id: DocumentImpl.java,v 1.131 2004/07/23 11:33:20 kalina Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.io.IOException;
19  import java.net.URL;
20  import java.util.*;
21  
22  import gate.*;
23  import gate.annotation.AnnotationSetImpl;
24  import gate.creole.AbstractLanguageResource;
25  import gate.creole.ResourceInstantiationException;
26  import gate.event.*;
27  import gate.util.*;
28  
29  /** Represents the commonalities between all sorts of documents.
30    *
31    * <H2>Editing</H2>
32    *
33    * <P>
34    * The DocumentImpl class implements the Document interface.
35    * The DocumentContentImpl class models the textual or audio-visual
36    * materials which are the source and content of Documents.
37    * The AnnotationSetImpl class supplies annotations on Documents.
38    *
39    * <P>
40    * Abbreviations:
41    *
42    * <UL>
43    * <LI>
44    * DC = DocumentContent
45    * <LI>
46    * D = Document
47    * <LI>
48    * AS = AnnotationSet
49    * </UL>
50    *
51    * <P>
52    * We add an edit method to each of these classes; for DC and AS
53    * the methods are package private; D has the public method.
54    *
55    * <PRE>
56    *   void edit(Long start, Long end, DocumentContent replacement)
57    *   throws InvalidOffsetException;
58    * </PRE>
59    *
60    * <P>
61    * D receives edit requests and forwards them to DC and AS.
62    * On DC, this method makes a change to the content - e.g. replacing
63    * a String range from start to end with replacement. (Deletions
64    * are catered for by having replacement = null.) D then calls
65    * AS.edit on each of its annotation sets.
66    *
67    * <P>
68    * On AS, edit calls replacement.size() (i.e. DC.size()) to
69    * figure out how long the replacement is (0 for null). It then
70    * considers annotations that terminate (start or end) in
71    * the altered or deleted range as invalid; annotations that
72    * terminate after the range have their offsets adjusted.
73    * I.e.:
74    * <UL>
75    * <LI>
76    * the nodes that pointed inside the old modified area are invalid now and
77    * will be deleted along with the connected annotations;
78    * <LI>
79    * the nodes that are before the start of the modified area remain
80    * untouched;
81    * <LI>
82    * the nodes that are after the end of the affected area will have the
83    * offset changed according to the formula below.
84    * </UL>
85    *
86    * <P>
87    * A note re. AS and annotations: annotations no longer have
88    * offsets as in the old model, they now have nodes, and nodes
89    * have offsets.
90    *
91    * <P>
92    * To implement AS.edit, we have several indices:
93    * <PRE>
94    *   HashMap annotsByStartNode, annotsByEndNode;
95    * </PRE>
96    * which map node ids to annotations;
97    * <PRE>
98    *   RBTreeMap nodesByOffset;
99    * </PRE>
100   * which maps offset to Nodes.
101   *
102   * <P>
103   * When we get an edit request, we traverse that part of the
104   * nodesByOffset tree representing the altered or deleted
105   * range of the DC. For each node found, we delete any annotations
106   * that terminate on the node, and then delete the node itself.
107   * We then traverse the rest of the tree, changing the offset
108   * on all remaining nodes by:
109   * <PRE>
110   *   newOffset =
111   *     oldOffset -
112   *     (
113   *       (end - start) -                                     // size of mod
114   *       ( (replacement == null) ? 0 : replacement.size() )  // size of repl
115   *     );
116   * </PRE>
117   * Note that we use the same convention as e.g. java.lang.String: start
118   * offsets are inclusive; end offsets are exclusive. I.e. for string "abcd"
119   * range 1-3 = "bc". Examples, for a node with offset 4:
120   * <PRE>
121   * edit(1, 3, "BC");
122   * newOffset = 4 - ( (3 - 1) - 2 ) = 4
123   *
124   * edit(1, 3, null);
125   * newOffset = 4 - ( (3 - 1) - 0 ) = 2
126   *
127   * edit(1, 3, "BBCC");
128   * newOffset = 4 - ( (3 - 1) - 4 ) = 6
129   * </PRE>
130   */
131 public class DocumentImpl
132 extends AbstractLanguageResource implements TextualDocument, CreoleListener,
133                                             DatastoreListener {
134   /** Debug flag */
135   private static final boolean DEBUG = false;
136 
137   /** If you set this flag to true the original content of the document will
138    *  be kept in the document feature. <br>
139    *  Default value is false to avoid the unnecessary waste of memory */
140   private Boolean preserveOriginalContent = new Boolean(false);
141 
142   /** If you set this flag to true the repositioning information for
143    *  the document will be kept in the document feature. <br>
144    *  Default value is false to avoid the unnecessary waste of time and memory
145    */
146   private Boolean collectRepositioningInfo = new Boolean(false);
147 
148   /**
149    * This is a variable which contains the latest crossed over annotation
150    * found during export with preserving format, i.e., toXml(annotations)
151    * method.
152    */
153   private Annotation crossedOverAnnotation = null;
154 
155   /** Default construction. Content left empty. */
156   public DocumentImpl() {
157     content = new DocumentContentImpl();
158     stringContent = "";
159   } // default construction
160 
161   /** Cover unpredictable Features creation */
162   public FeatureMap getFeatures() {
163     if (features == null) {
164       features = new SimpleFeatureMapImpl();
165     }
166     return features;
167   }
168 
169   /** Initialise this resource, and return it. */
170   public Resource init() throws ResourceInstantiationException {
171     // set up the source URL and create the content
172     if(sourceUrl == null) {
173       if(stringContent == null) {
174         throw new ResourceInstantiationException(
175           "The sourceURL and document's content were null."
176         );
177       }
178 
179       content = new DocumentContentImpl(stringContent);
180       getFeatures().put("gate.SourceURL", "created from String");
181     } else {
182       try {
183         content = new DocumentContentImpl(
184           sourceUrl, getEncoding(), sourceUrlStartOffset, sourceUrlEndOffset);
185         getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm());
186       } catch(IOException e) {
187         e.printStackTrace();
188         throw new ResourceInstantiationException("DocumentImpl.init: " + e);
189       }
190 
191       if(preserveOriginalContent.booleanValue() && content != null) {
192         String originalContent = new String(
193           ((DocumentContentImpl) content).getOriginalContent());
194         getFeatures().put(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME,
195                       originalContent);
196       } // if
197     }
198 
199     // set up a DocumentFormat if markup unpacking required
200     if(getMarkupAware().booleanValue()) {
201       DocumentFormat docFormat =
202         DocumentFormat.getDocumentFormat(this, sourceUrl);
203       try {
204         if(docFormat != null){
205           StatusListener sListener = (StatusListener)
206                                       gate.gui.MainFrame.getListeners().
207                                       get("gate.event.StatusListener");
208           if(sListener != null) docFormat.addStatusListener(sListener);
209 
210           // set the flag if true and if the document format support collecting
211           docFormat.setShouldCollectRepositioning(collectRepositioningInfo);
212 
213           if(docFormat.getShouldCollectRepositioning().booleanValue()) {
214             // unpack with collectiong of repositioning information
215             RepositioningInfo info = new RepositioningInfo();
216 
217             String origContent = (String) getFeatures().get(
218                 GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
219 
220             RepositioningInfo ampCodingInfo = new RepositioningInfo();
221             if(origContent != null) {
222               boolean shouldCorrectCR = docFormat instanceof XmlDocumentFormat;
223               collectInformationForAmpCodding(origContent, ampCodingInfo,
224                                               shouldCorrectCR);
225               if(docFormat instanceof HtmlDocumentFormat) {
226                 collectInformationForWS(origContent, ampCodingInfo);
227               } // if
228             } // if
229 
230             docFormat.unpackMarkup(this, info, ampCodingInfo);
231 
232             if(origContent != null
233                 && docFormat instanceof XmlDocumentFormat) {
234               // CRLF correction of RepositioningInfo
235               correctRepositioningForCRLFInXML(origContent, info);
236             } // if
237 
238             getFeatures().put(
239                 GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME, info);
240           }
241           else {
242             // normal old fashioned unpack
243             docFormat.unpackMarkup(this);
244           }
245           docFormat.removeStatusListener(sListener);
246        } //if format != null
247       } catch(DocumentFormatException e) {
248         throw new ResourceInstantiationException(
249           "Couldn't unpack markup in document " + sourceUrl.toExternalForm() +
250           " " + e
251         );
252       }
253     } // if markup aware
254 
255 //try{
256 //  FileWriter fw = new FileWriter("d:/temp/doccontent.txt");
257 //  fw.write(getContent().toString());
258 //  fw.flush();
259 //  fw.close();
260 //}catch(IOException ioe){
261 //  ioe.printStackTrace();
262 //}
263 
264     return this;
265   } // init()
266 
267   /**
268    * Correct repositioning information for substitution of "\r\n" with "\n"
269    */
270   private void correctRepositioningForCRLFInXML(String content,
271                                             RepositioningInfo info) {
272     int index = -1;
273 
274     do {
275       index = content.indexOf("\r\n", index+1);
276       if(index != -1) {
277         info.correctInformationOriginalMove(index, 1);
278       } // if
279     } while(index != -1);
280   } // correctRepositioningForCRLF
281 
282   /**
283    * Collect information for substitution of "&xxx;" with "y"
284    *
285    * It couldn't be collected a position information about
286    * some unicode and &-coded symbols during parsing. The parser "hide" the
287    * information about the position of such kind of parsed text.
288    * So, there is minimal chance to have &-coded symbol inside the covered by
289    * repositioning records area. The new record should be created for every
290    * coded symbol outside the existing records.
291    * <BR>
292    * If <code>shouldCorrectCR</code> flag is <code>true</code> the correction
293    * for CRLF substitution is performed.
294    */
295   private void collectInformationForAmpCodding(String content,
296                                             RepositioningInfo info,
297                                             boolean shouldCorrectCR) {
298 
299     if(content == null || info == null) return;
300 
301     int ampIndex = -1;
302     int semiIndex;
303 
304     do {
305       ampIndex = content.indexOf('&', ampIndex+1);
306       if(ampIndex != -1) {
307         semiIndex = content.indexOf(';', ampIndex+1);
308         // have semicolon and it is near enough for amp codding
309         if(semiIndex != -1 && (semiIndex-ampIndex) < 8) {
310           info.addPositionInfo(ampIndex, semiIndex-ampIndex+1, 0, 1);
311         }
312         else {
313           // no semicolon or it is too far
314           // analyse for amp codding without semicolon
315           int maxEnd = Math.min(ampIndex+8, content.length());
316           String ampCandidate = content.substring(ampIndex, maxEnd);
317           int ampCodingSize = analyseAmpCodding(ampCandidate);
318 
319           if(ampCodingSize != -1) {
320             info.addPositionInfo(ampIndex, ampCodingSize, 0, 1);
321           } // if
322 
323         } // if - semicolon found
324       } // if - ampersand found
325     } while (ampIndex != -1);
326 
327     // correct the collected information to adjust it's positions
328     // with reported by the parser
329     int index = -1;
330 
331     if(shouldCorrectCR) {
332       do {
333         index = content.indexOf("\r\n", index+1);
334         if(index != -1) {
335           info.correctInformationOriginalMove(index, -1);
336         } // if
337       } while(index != -1);
338     } // if
339   } // collectInformationForAmpCodding
340 
341   /**
342    * This function compute size of the ampersand codded sequence when
343    * semicolin is not present.
344    */
345   private int analyseAmpCodding(String content) {
346     int result = -1;
347 
348     try {
349       char ch = content.charAt(1);
350 
351       switch(ch) {
352         case 'l' : // &lt
353         case 'L' : // &lt
354           if(content.charAt(2) == 't' || content.charAt(2) == 'T') {
355             result = 3;
356           } // if
357           break;
358         case 'g' : // &gt
359         case 'G' : // &gt
360           if(content.charAt(2) == 't' || content.charAt(2) == 'T') {
361             result = 3;
362           } // if
363           break;
364         case 'a' : // &amp
365         case 'A' : // &amp
366           if(content.substring(2, 4).equalsIgnoreCase("mp")) {
367             result = 4;
368           } // if
369           break;
370         case 'q' : // &quot
371         case 'Q' : // &quot
372           if(content.substring(2, 5).equalsIgnoreCase("uot")) {
373             result = 5;
374           } // if
375           break;
376         case '#' : // #number (example &#145, &#x4C38)
377           int endIndex = 2;
378           boolean hexCoded = false;
379           if(content.charAt(2) == 'x' || content.charAt(2) == 'X') {
380             // Hex codding
381             ++endIndex;
382             hexCoded = true;
383           } // if
384 
385           while (endIndex < 8
386                   && isNumber(content.charAt(endIndex), hexCoded) ) {
387             ++endIndex;
388           } // while
389           result = endIndex;
390           break;
391       } // switch
392     } catch (StringIndexOutOfBoundsException ex) {
393       // do nothing
394     } // catch
395 
396     return result;
397   } // analyseAmpCodding
398 
399   /** Check for numeric range. If hex is true the A..F range is included */
400   private boolean isNumber(char ch, boolean hex) {
401     if(ch >= '0' && ch <= '9') return true;
402 
403     if(hex) {
404       if(ch >= 'A' && ch <= 'F') return true;
405       if(ch >= 'a' && ch <= 'f') return true;
406     } // if
407 
408     return false;
409   } // isNumber
410 
411   /** HTML parser perform substitution of multiple whitespaces (WS) with
412    *  a single WS. To create correct repositioning information structure we
413    *  should keep the information for such multiple WS.
414    *  <BR>
415    *  The criteria for WS is <code>(ch <= ' ')</code>.
416    */
417   private void collectInformationForWS(String content, RepositioningInfo info) {
418 
419     if(content == null || info == null) return;
420 
421     // analyse the content and correct the repositioning information
422     char ch;
423     int startWS, endWS;
424 
425     startWS = endWS = -1;
426     int contentLength = content.length();
427 
428     for(int i=0; i<contentLength; ++i) {
429       ch = content.charAt(i);
430 
431       // is whitespace
432       if(ch <= ' ') {
433         if(startWS == -1) {
434           startWS = i;
435         } // if
436         endWS = i;
437       }
438       else {
439         if(endWS - startWS > 0) {
440           // put the repositioning information about the WS substitution
441           info.addPositionInfo(
442             (long)startWS, (long)(endWS - startWS + 1), 0, 1);
443         } // if
444         // clear positions
445         startWS = endWS = -1;
446       }// if
447     } // for
448   } // collectInformationForWS
449 
450   /** Clear all the data members of the object. */
451   public void cleanup() {
452 
453     defaultAnnots = null;
454     if ( (namedAnnotSets != null) && (!namedAnnotSets.isEmpty()))
455         namedAnnotSets.clear();
456     if (DEBUG) Out.prln("Document cleanup called");
457     if (this.lrPersistentId != null)
458       Gate.getCreoleRegister().removeCreoleListener(this);
459     if(this.getDataStore() != null)
460       this.getDataStore().removeDatastoreListener(this);
461   } // cleanup()
462 
463 
464   /** Documents are identified by URLs */
465   public URL getSourceUrl() { return sourceUrl; }
466 
467   /** Set method for the document's URL */
468   public void setSourceUrl(URL sourceUrl) {
469     this.sourceUrl = sourceUrl;
470   } // setSourceUrl
471 
472   /** Documents may be packed within files; in this case an optional pair of
473     * offsets refer to the location of the document.
474     */
475   public Long[] getSourceUrlOffsets() {
476     Long[] sourceUrlOffsets = new Long[2];
477     sourceUrlOffsets[0] = sourceUrlStartOffset;
478     sourceUrlOffsets[1] = sourceUrlEndOffset;
479     return sourceUrlOffsets;
480   } // getSourceUrlOffsets
481 
482   /**
483    * Allow/disallow preserving of the original document content.
484    * If is <B>true</B> the original content will be retrieved from
485    * the DocumentContent object and preserved as document feature.
486    */
487   public void setPreserveOriginalContent(Boolean b) {
488     preserveOriginalContent = b;
489   } // setPreserveOriginalContent
490 
491   /** Get the preserving of content status of the Document.
492    *
493    *  @return whether the Document should preserve it's original content.
494    */
495   public Boolean getPreserveOriginalContent() {
496     return preserveOriginalContent;
497   } // getPreserveOriginalContent
498 
499   /**
500    *  Allow/disallow collecting of repositioning information.
501    *  If is <B>true</B> information will be retrieved and preserved
502    *  as document feature.<BR>
503    *  Preserving of repositioning information give the possibilities
504    *  for converting of coordinates between the original document content and
505    *  extracted from the document text.
506    */
507   public void setCollectRepositioningInfo(Boolean b) {
508     collectRepositioningInfo = b;
509   } // setCollectRepositioningInfo
510 
511   /** Get the collectiong and preserving of repositioning information
512    *  for the Document. <BR>
513    *  Preserving of repositioning information give the possibilities
514    *  for converting of coordinates between the original document content and
515    *  extracted from the document text.
516    *
517    *  @return whether the Document should collect and preserve information.
518    */
519   public Boolean getCollectRepositioningInfo() {
520     return collectRepositioningInfo;
521   } // getCollectRepositioningInfo
522 
523   /** Documents may be packed within files; in this case an optional pair of
524     * offsets refer to the location of the document. This method gets the
525     * start offset.
526     */
527   public Long getSourceUrlStartOffset() { return sourceUrlStartOffset; }
528 
529   /** Documents may be packed within files; in this case an optional pair of
530     * offsets refer to the location of the document. This method sets the
531     * start offset.
532     */
533   public void setSourceUrlStartOffset(Long sourceUrlStartOffset) {
534     this.sourceUrlStartOffset = sourceUrlStartOffset;
535   } // setSourceUrlStartOffset
536 
537   /** Documents may be packed within files; in this case an optional pair of
538     * offsets refer to the location of the document. This method gets the
539     * end offset.
540     */
541   public Long getSourceUrlEndOffset() { return sourceUrlEndOffset; }
542 
543   /** Documents may be packed within files; in this case an optional pair of
544     * offsets refer to the location of the document. This method sets the
545     * end offset.
546     */
547   public void setSourceUrlEndOffset(Long sourceUrlEndOffset) {
548     this.sourceUrlEndOffset = sourceUrlEndOffset;
549   } // setSourceUrlStartOffset
550 
551   /** The content of the document: a String for text; MPEG for video; etc. */
552   public DocumentContent getContent() { return content; }
553 
554   /** Set method for the document content */
555   public void setContent(DocumentContent content) {
556     this.content = content;
557     this.stringContent = content.toString();
558   }
559 
560   /** Get the encoding of the document content source */
561   public String getEncoding() {
562     //we need to make sure we ALWAYS have an encoding
563     if(encoding == null || encoding.trim().length() == 0){
564       //no encoding definded: use the platform default
565       encoding = java.nio.charset.Charset.forName(
566           System.getProperty("file.encoding")).name();
567     }
568     return encoding;
569   }
570 
571   /** Set the encoding of the document content source */
572   public void setEncoding(String encoding) { this.encoding = encoding; }
573 
574   /** Get the default set of annotations. The set is created if it
575     * doesn't exist yet.
576     */
577   public AnnotationSet getAnnotations() {
578     if(defaultAnnots == null){
579       defaultAnnots = new AnnotationSetImpl(this);
580       fireAnnotationSetAdded(new DocumentEvent(
581            this, DocumentEvent.ANNOTATION_SET_ADDED, null));
582     }//if
583     return defaultAnnots;
584   } // getAnnotations()
585 
586   /** Get a named set of annotations. Creates a new set if one with this
587     * name doesn't exist yet.
588     * If the provided name is null then it returns the default annotation set.
589     */
590   public AnnotationSet getAnnotations(String name) {
591     if(name == null) return getAnnotations();
592     if(namedAnnotSets == null)
593       namedAnnotSets = new HashMap();
594     AnnotationSet namedSet = (AnnotationSet) namedAnnotSets.get(name);
595 
596     if(namedSet == null) {
597       namedSet = new AnnotationSetImpl(this, name);
598       namedAnnotSets.put(name, namedSet);
599 
600       DocumentEvent evt = new DocumentEvent(
601         this, DocumentEvent.ANNOTATION_SET_ADDED, name
602       );
603       fireAnnotationSetAdded(evt);
604     }
605     return namedSet;
606   } // getAnnotations(name)
607 
608   /** Make the document markup-aware. This will trigger the creation
609    *  of a DocumentFormat object at Document initialisation time; the
610    *  DocumentFormat object will unpack the markup in the Document and
611    *  add it as annotations. Documents are <B>not</B> markup-aware by default.
612    *
613    *  @param newMarkupAware markup awareness status.
614    */
615   public void setMarkupAware(Boolean newMarkupAware) {
616       this.markupAware = newMarkupAware;
617   }
618 
619   /** Get the markup awareness status of the Document.
620    *  <B>Documents are markup-aware by default.</B>
621    *  @return whether the Document is markup aware.
622    */
623   public Boolean getMarkupAware() { return markupAware; }
624 
625   /** Returns an XML document aming to preserve the original markups(
626     * the original markup will be in the same place and format as it was
627     * before processing the document) and include (if possible)
628     * the annotations specified in the aSourceAnnotationSet.
629     * It is equivalent to toXml(aSourceAnnotationSet, true).
630     */
631   public String toXml(Set aSourceAnnotationSet){
632     return toXml(aSourceAnnotationSet, true);
633   }
634 
635   /** Returns an XML document aming to preserve the original markups(
636     * the original markup will be in the same place and format as it was
637     * before processing the document) and include (if possible)
638     * the annotations specified in the aSourceAnnotationSet.
639     * <b>Warning:</b> Annotations from the aSourceAnnotationSet will be lost
640     * if they will cause a crosed over situation.
641     * @param aSourceAnnotationSet is an annotation set containing all the
642     * annotations that will be combined with the original marup set. If the
643     * param is <code>null</code> it will only dump the original markups.
644     * @param includeFeatures is a boolean that controls whether the annotation
645     * features should be included or not. If false, only the annotation type
646     * is included in the tag.
647     * @return a string representing an XML document containing the original
648     * markup + dumped annotations form the aSourceAnnotationSet
649     */
650   public String toXml(Set aSourceAnnotationSet, boolean includeFeatures){
651 
652     if(hasOriginalContentFeatures()) {
653       return saveAnnotationSetAsXmlInOrig(aSourceAnnotationSet,includeFeatures);
654     } // if
655 
656     AnnotationSet originalMarkupsAnnotSet =
657             this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
658 
659     // Create a dumping annotation set on the document. It will be used for
660     // dumping annotations...
661 //    AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this);
662     List dumpingList = new ArrayList(originalMarkupsAnnotSet.size());
663 
664     // This set will be constructed inside this method. If is not empty, the
665     // annotation contained will be lost.
666 /*    if (!dumpingSet.isEmpty()){
667       Out.prln("WARNING: The dumping annotation set was not empty."+
668       "All annotation it contained were lost.");
669       dumpingSet.clear();
670     }// End if
671 */
672     StatusListener sListener = (StatusListener)
673                                gate.gui.MainFrame.getListeners().
674                                get("gate.event.StatusListener");
675     // Construct the dumping set in that way that all annotations will verify
676     // the condition that there are not annotations which are crossed.
677     // First add all annotation from the original markups
678     if(sListener != null)
679       sListener.statusChanged("Constructing the dumping annotation set.");
680 //    dumpingSet.addAll(originalMarkupsAnnotSet);
681     dumpingList.addAll(originalMarkupsAnnotSet);
682     // Then take all the annotations from aSourceAnnotationSet and verify if
683     // they can be inserted safely into the dumpingSet. Where not possible,
684     // report.
685     if (aSourceAnnotationSet != null){
686       Iterator iter = aSourceAnnotationSet.iterator();
687       while (iter.hasNext()){
688         Annotation currentAnnot = (Annotation) iter.next();
689         if(insertsSafety(dumpingList,currentAnnot)){
690 //          dumpingSet.add(currentAnnot);
691           dumpingList.add(currentAnnot);
692         }else if (crossedOverAnnotation != null && DEBUG){
693           try {
694             Out.prln("Warning: Annotations were found to violate the " +
695             "crossed over condition: \n" +
696             "1. [" +
697             getContent().getContent(
698                            crossedOverAnnotation.getStartNode().getOffset(),
699                            crossedOverAnnotation.getEndNode().getOffset()) +
700             " (" + crossedOverAnnotation.getType() + ": " +
701             crossedOverAnnotation.getStartNode().getOffset() +
702             ";" + crossedOverAnnotation.getEndNode().getOffset() +
703             ")]\n" +
704             "2. [" +
705             getContent().getContent(
706                            currentAnnot.getStartNode().getOffset(),
707                            currentAnnot.getEndNode().getOffset()) +
708             " (" + currentAnnot.getType() + ": " +
709             currentAnnot.getStartNode().getOffset() +
710             ";" + currentAnnot.getEndNode().getOffset() +
711             ")]\nThe second one will be discarded.\n"  );
712           } catch (gate.util.InvalidOffsetException ex) {
713             throw new GateRuntimeException(ex.getMessage());
714           }
715         }// End if
716       }// End while
717     }// End if
718 
719     //kalina: order the dumping list by start offset
720     Collections.sort(dumpingList, new gate.util.OffsetComparator());
721 
722     // The dumpingSet is ready to be exported as XML
723     // Here we go.
724     if(sListener != null) sListener.statusChanged("Dumping annotations as XML");
725     StringBuffer xmlDoc = new StringBuffer(
726           DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
727 
728     // Add xml header if original format was xml
729     String mimeType = getFeatures() == null ?
730                       null :
731                       (String)getFeatures().get("MimeType");
732     boolean wasXML = mimeType != null && mimeType.equalsIgnoreCase("text/xml");
733 
734     if(wasXML){
735       xmlDoc.append("<?xml version=\"1.0\" encoding=\"");
736       xmlDoc.append(getEncoding());
737       xmlDoc.append("\" ?>");
738       xmlDoc.append(Strings.getNl());
739     }// ENd if
740     // Identify and extract the root annotation from the dumpingSet.
741     theRootAnnotation = identifyTheRootAnnotation(dumpingList);
742     // If a root annotation has been identified then add it eplicitley at the
743     // beginning of the document
744     if (theRootAnnotation != null){
745       dumpingList.remove(theRootAnnotation);
746       xmlDoc.append(writeStartTag(theRootAnnotation,includeFeatures));
747     }// End if
748     // Construct and append the rest of the document
749     xmlDoc.append(saveAnnotationSetAsXml(dumpingList, includeFeatures));
750     // If a root annotation has been identified then add it eplicitley at the
751     // end of the document
752     if (theRootAnnotation != null){
753       xmlDoc.append(writeEndTag(theRootAnnotation));
754     }// End if
755 
756     if(sListener != null) sListener.statusChanged("Done.");
757     return xmlDoc.toString();
758   }//End toXml()
759 
760   /** This method verifies if aSourceAnnotation can ve inserted safety into the
761     * aTargetAnnotSet. Safety means that it doesn't violate the crossed over
762     * contition with any annotation from the aTargetAnnotSet.
763     * @param aTargetAnnotSet the annotation set to include the aSourceAnnotation
764     * @param aSourceAnnotation the annotation to be inserted into the
765     * aTargetAnnotSet
766     * @return true if the annotation inserts safety, or false otherwise.
767     */
768   private boolean insertsSafety(AnnotationSet aTargetAnnotSet,
769                                                 Annotation aSourceAnnotation){
770 
771     if (aTargetAnnotSet == null || aSourceAnnotation == null) {
772       this.crossedOverAnnotation = null;
773       return false;
774     }
775     if (aSourceAnnotation.getStartNode() == null ||
776         aSourceAnnotation.getStartNode().getOffset()== null) {
777       this.crossedOverAnnotation = null;
778       return false;
779     }
780     if (aSourceAnnotation.getEndNode() == null ||
781         aSourceAnnotation.getEndNode().getOffset()== null) {
782       this.crossedOverAnnotation = null;
783       return false;
784     }
785 
786     // Get the start and end offsets
787     Long start = aSourceAnnotation.getStartNode().getOffset();
788     Long end =   aSourceAnnotation.getEndNode().getOffset();
789     // Read aSourceAnnotation offsets long
790     long s2 = start.longValue();
791     long e2 = end.longValue();
792 
793     // Obtain a set with all annotations annotations that overlap
794     // totaly or partially with the interval defined by the two provided offsets
795     AnnotationSet as = aTargetAnnotSet.get(start,end);
796 
797     // Investigate all the annotations from as to see if there is one that
798     // comes in conflict with aSourceAnnotation
799     Iterator it = as.iterator();
800     while(it.hasNext()){
801       Annotation ann = (Annotation) it.next();
802       // Read ann offsets
803       long s1 = ann.getStartNode().getOffset().longValue();
804       long e1 = ann.getEndNode().getOffset().longValue();
805 
806       if (s1<s2 && s2<e1 && e1<e2) {
807         this.crossedOverAnnotation = ann;
808         return false;
809       }
810       if (s2<s1 && s1<e2 && e2<e1) {
811         this.crossedOverAnnotation = ann;
812         return false;
813       }
814     }// End while
815     return true;
816   }// insertsSafety()
817 
818   private boolean insertsSafety(List aTargetAnnotList,
819                                                 Annotation aSourceAnnotation){
820 
821     if (aTargetAnnotList == null || aSourceAnnotation == null) {
822       this.crossedOverAnnotation = null;
823       return false;
824     }
825     if (aSourceAnnotation.getStartNode() == null ||
826         aSourceAnnotation.getStartNode().getOffset()== null) {
827       this.crossedOverAnnotation = null;
828       return false;
829     }
830     if (aSourceAnnotation.getEndNode() == null ||
831         aSourceAnnotation.getEndNode().getOffset()== null) {
832       this.crossedOverAnnotation = null;
833       return false;
834     }
835 
836     // Get the start and end offsets
837     Long start = aSourceAnnotation.getStartNode().getOffset();
838     Long end =   aSourceAnnotation.getEndNode().getOffset();
839     // Read aSourceAnnotation offsets long
840     long s2 = start.longValue();
841     long e2 = end.longValue();
842 
843     // Obtain a set with all annotations annotations that overlap
844     // totaly or partially with the interval defined by the two provided offsets
845     List as = new ArrayList();
846     for (int i=0; i < aTargetAnnotList.size(); i++) {
847       Annotation annot = (Annotation) aTargetAnnotList.get(i);
848       if (annot.getStartNode().getOffset().longValue() >= s2
849           &&
850           annot.getStartNode().getOffset().longValue() <= e2)
851         as.add(annot);
852       else if (annot.getEndNode().getOffset().longValue() >= s2
853           &&
854           annot.getEndNode().getOffset().longValue() <= e2)
855         as.add(annot);
856     }
857 
858     // Investigate all the annotations from as to see if there is one that
859     // comes in conflict with aSourceAnnotation
860     Iterator it = as.iterator();
861     while(it.hasNext()){
862       Annotation ann = (Annotation) it.next();
863       // Read ann offsets
864       long s1 = ann.getStartNode().getOffset().longValue();
865       long e1 = ann.getEndNode().getOffset().longValue();
866 
867       if (s1<s2 && s2<e1 && e1<e2) {
868         this.crossedOverAnnotation = ann;
869         return false;
870       }
871       if (s2<s1 && s1<e2 && e2<e1) {
872         this.crossedOverAnnotation = ann;
873         return false;
874       }
875     }// End while
876     return true;
877   }// insertsSafety()
878 
879   /** This method saves all the annotations from aDumpAnnotSet and combines
880     * them with the document content.
881     * @param aDumpAnnotSet is a GATE annotation set prepared to be used
882     * on the raw text from document content. If aDumpAnnotSet is <b>null<b>
883     * then an empty string will be returned.
884     * @param includeFeatures is a boolean, which controls whether the annotation
885     * features and gate ID are included or not.
886     * @return The XML document obtained from raw text + the information from
887     * the dump annotation set.
888     */
889   private String saveAnnotationSetAsXml(AnnotationSet aDumpAnnotSet,
890                                         boolean includeFeatures){
891     String content = null;
892     if (this.getContent()== null)
893       content = new String("");
894     else
895       content = this.getContent().toString();
896     StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
897     if (aDumpAnnotSet == null)   return docContStrBuff.toString();
898 
899     TreeMap offsets2CharsMap = new TreeMap();
900     if (this.getContent().size().longValue() != 0){
901       // Fill the offsets2CharsMap with all the indices where
902       // special chars appear
903       buildEntityMapFromString(content,offsets2CharsMap);
904     }//End if
905     // The saving alghorithm is as follows:
906     ///////////////////////////////////////////
907     // Construct a set of annot with all IDs in asc order.
908     // All annotations that end at that offset swap their place in descending
909     // order. For each node write all the tags from left to right.
910 
911     // Construct the node set
912     TreeSet offsets = new TreeSet();
913     Iterator iter = aDumpAnnotSet.iterator();
914     while (iter.hasNext()){
915       Annotation annot = (Annotation) iter.next();
916       offsets.add(annot.getStartNode().getOffset());
917       offsets.add(annot.getEndNode().getOffset());
918     }// End while
919 
920     // ofsets is sorted in ascending order.
921     // Iterate this set in descending order and remove an offset at each
922     // iteration
923     while (!offsets.isEmpty()){
924       Long offset = (Long)offsets.last();
925       // Remove the offset from the set
926       offsets.remove(offset);
927       // Now, use it.
928       // Returns a list with annotations that needs to be serialized in that
929       // offset.
930       List annotations = getAnnotationsForOffset(aDumpAnnotSet,offset);
931       // Attention: the annotation are serialized from left to right
932 //      StringBuffer tmpBuff = new StringBuffer("");
933       StringBuffer tmpBuff = new StringBuffer(
934           DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
935       Stack stack = new Stack();
936       // Iterate through all these annotations and serialize them
937       Iterator it = annotations.iterator();
938       while(it.hasNext()){
939         Annotation a = (Annotation) it.next();
940         it.remove();
941         // Test if a Ends at offset
942         if ( offset.equals(a.getEndNode().getOffset()) ){
943           // Test if a Starts at offset
944           if ( offset.equals(a.getStartNode().getOffset()) ){
945             // Here, the annotation a Starts and Ends at the offset
946             if ( null != a.getFeatures().get("isEmptyAndSpan") &&
947                  "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
948 
949               // Assert: annotation a with start == end and isEmptyAndSpan
950               tmpBuff.append(writeStartTag(a, includeFeatures));
951               stack.push(a);
952             }else{
953               // Assert annotation a with start == end and an empty tag
954               tmpBuff.append(writeEmptyTag(a));
955               // The annotation is removed from dumped set
956               aDumpAnnotSet.remove(a);
957             }// End if
958           }else{
959             // Here the annotation a Ends at the offset.
960             // In this case empty the stack and write the end tag
961             if (!stack.isEmpty()){
962               while(!stack.isEmpty()){
963                 Annotation a1 = (Annotation)stack.pop();
964                 tmpBuff.append(writeEndTag(a1));
965               }// End while
966             }// End if
967             tmpBuff.append(writeEndTag(a));
968           }// End if
969         }else{
970           // The annotation a does NOT end at the offset. Let's see if it starts
971           // at the offset
972           if ( offset.equals(a.getStartNode().getOffset()) ){
973             // The annotation a starts at the offset.
974             // In this case empty the stack and write the end tag
975             if (!stack.isEmpty()){
976               while(!stack.isEmpty()){
977                 Annotation a1 = (Annotation)stack.pop();
978                 tmpBuff.append(writeEndTag(a1));
979               }// End while
980             }// End if
981             tmpBuff.append(writeStartTag(a, includeFeatures));
982             // The annotation is removed from dumped set
983             aDumpAnnotSet.remove(a);
984           }// End if ( offset.equals(a.getStartNode().getOffset()) )
985         }// End if ( offset.equals(a.getEndNode().getOffset()) )
986       }// End while(it.hasNext()){
987 
988       // In this case empty the stack and write the end tag
989       if (!stack.isEmpty()){
990         while(!stack.isEmpty()){
991           Annotation a1 = (Annotation)stack.pop();
992           tmpBuff.append(writeEndTag(a1));
993         }// End while
994       }// End if
995 
996       // Before inserting tmpBuff into docContStrBuff we need to check
997       // if there are chars to be replaced and if there are, they would be
998       // replaced.
999       if (!offsets2CharsMap.isEmpty()){
1000        Long offsChar = (Long) offsets2CharsMap.lastKey();
1001        while( !offsets2CharsMap.isEmpty() &&
1002                       offsChar.intValue() >= offset.intValue()){
1003          // Replace the char at offsChar with its corresponding entity form
1004          // the entitiesMap.
1005          docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
1006          (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1007          // Discard the offsChar after it was used.
1008          offsets2CharsMap.remove(offsChar);
1009          // Investigate next offsChar
1010          if (!offsets2CharsMap.isEmpty())
1011            offsChar = (Long) offsets2CharsMap.lastKey();
1012        }// End while
1013      }// End if
1014      // Insert tmpBuff to the location where it belongs in docContStrBuff
1015      docContStrBuff.insert(offset.intValue(),tmpBuff.toString());
1016    }// End while(!offsets.isEmpty())
1017    // Need to replace the entities in the remaining text, if there is any text
1018    // So, if there are any more items in offsets2CharsMap they need to be
1019    // replaced
1020    while (!offsets2CharsMap.isEmpty()){
1021      Long offsChar = (Long) offsets2CharsMap.lastKey();
1022      // Replace the char with its entity
1023      docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
1024      (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1025      // remove the offset from the map
1026      offsets2CharsMap.remove(offsChar);
1027    }// End while
1028    return docContStrBuff.toString();
1029  }// saveAnnotationSetAsXml()
1030
1031  private String saveAnnotationSetAsXml(List aDumpAnnotList,
1032                                        boolean includeFeatures){
1033    String content = null;
1034    if (this.getContent()== null)
1035      content = new String("");
1036    else
1037      content = this.getContent().toString();
1038    StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
1039    if (aDumpAnnotList == null)   return docContStrBuff.toString();
1040
1041    StringBuffer resultStrBuff = new StringBuffer(
1042        DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
1043    // last offset position used to extract portions of text
1044    Long lastOffset = new Long(0);
1045
1046    TreeMap offsets2CharsMap = new TreeMap();
1047    HashMap annotsForOffset = new HashMap(100);
1048    if (this.getContent().size().longValue() != 0){
1049      // Fill the offsets2CharsMap with all the indices where
1050      // special chars appear
1051      buildEntityMapFromString(content,offsets2CharsMap);
1052    }//End if
1053    // The saving alghorithm is as follows:
1054    ///////////////////////////////////////////
1055    // Construct a set of annot with all IDs in asc order.
1056    // All annotations that end at that offset swap their place in descending
1057    // order. For each node write all the tags from left to right.
1058
1059    // Construct the node set
1060    TreeSet offsets = new TreeSet();
1061    Iterator iter = aDumpAnnotList.iterator();
1062    Annotation annot;
1063    Long start;
1064    Long end;
1065    while (iter.hasNext()){
1066      annot = (Annotation) iter.next();
1067      start = annot.getStartNode().getOffset();
1068      end = annot.getEndNode().getOffset();
1069      offsets.add(start);
1070      offsets.add(end);
1071      if (annotsForOffset.containsKey(start)) {
1072        ((List) annotsForOffset.get(start)).add(annot);
1073      } else {
1074        List newList = new ArrayList(10);
1075        newList.add(annot);
1076        annotsForOffset.put(start, newList);
1077      }
1078      if (annotsForOffset.containsKey(end)) {
1079        ((List) annotsForOffset.get(end)).add(annot);
1080      } else {
1081        List newList = new ArrayList(10);
1082        newList.add(annot);
1083        annotsForOffset.put(end, newList);
1084      }
1085    }// End while
1086
1087    // ofsets is sorted in ascending order.
1088    // Iterate this set in descending order and remove an offset at each
1089    // iteration
1090    Iterator offsetIt = offsets.iterator();
1091    Long offset;
1092    List annotations;
1093    // This don't have to be a large buffer - just for tags
1094    StringBuffer tmpBuff = new StringBuffer(255);
1095    Stack stack = new Stack();
1096    while (offsetIt.hasNext()){
1097      offset = (Long)offsetIt.next();
1098      // Now, use it.
1099      // Returns a list with annotations that needs to be serialized in that
1100      // offset.
1101      annotations = (List) annotsForOffset.get(offset);
1102      // order annotations in list for offset to print tags in correct order
1103      annotations = getAnnotationsForOffset(annotations, offset);
1104      // clear structures
1105      tmpBuff.setLength(0);
1106      stack.clear();
1107
1108      // Iterate through all these annotations and serialize them
1109      Iterator it = annotations.iterator();
1110      Annotation a;
1111      Annotation annStack;
1112      while(it.hasNext()){
1113        a = (Annotation) it.next();
1114        // Test if a Ends at offset
1115        if ( offset.equals(a.getEndNode().getOffset()) ){
1116          // Test if a Starts at offset
1117          if ( offset.equals(a.getStartNode().getOffset()) ){
1118            // Here, the annotation a Starts and Ends at the offset
1119            if ( null != a.getFeatures().get("isEmptyAndSpan") &&
1120                 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
1121
1122              // Assert: annotation a with start == end and isEmptyAndSpan
1123              tmpBuff.append(writeStartTag(a, includeFeatures));
1124              stack.push(a);
1125            }else{
1126              // Assert annotation a with start == end and an empty tag
1127              tmpBuff.append(writeEmptyTag(a));
1128              // The annotation is removed from dumped set
1129              aDumpAnnotList.remove(a);
1130            }// End if
1131          }else{
1132            // Here the annotation a Ends at the offset.
1133            // In this case empty the stack and write the end tag
1134            if (!stack.isEmpty()){
1135              while(!stack.isEmpty()){
1136                annStack = (Annotation)stack.pop();
1137                tmpBuff.append(writeEndTag(annStack));
1138              }// End while
1139            }// End if
1140            tmpBuff.append(writeEndTag(a));
1141          }// End if
1142        }else{
1143          // The annotation a does NOT end at the offset. Let's see if it starts
1144          // at the offset
1145          if ( offset.equals(a.getStartNode().getOffset()) ){
1146            // The annotation a starts at the offset.
1147            // In this case empty the stack and write the end tag
1148            if (!stack.isEmpty()){
1149              while(!stack.isEmpty()){
1150                annStack = (Annotation)stack.pop();
1151                tmpBuff.append(writeEndTag(annStack));
1152              }// End while
1153            }// End if
1154            tmpBuff.append(writeStartTag(a, includeFeatures));
1155            // The annotation is removed from dumped set
1156          }// End if ( offset.equals(a.getStartNode().getOffset()) )
1157        }// End if ( offset.equals(a.getEndNode().getOffset()) )
1158      }// End while(it.hasNext()){
1159
1160      // In this case empty the stack and write the end tag
1161      if (!stack.isEmpty()){
1162        while(!stack.isEmpty()){
1163          annStack = (Annotation)stack.pop();
1164          tmpBuff.append(writeEndTag(annStack));
1165        }// End while
1166      }// End if
1167
1168      // extract text from content and replace spec chars
1169      StringBuffer partText = new StringBuffer();
1170      SortedMap offsetsInRange =
1171          offsets2CharsMap.subMap(lastOffset, offset);
1172      Long tmpOffset;
1173      Long tmpLastOffset = lastOffset;
1174      String replacement;
1175
1176      // Before inserting tmpBuff into the buffer we need to check
1177      // if there are chars to be replaced in range
1178      if(!offsetsInRange.isEmpty()) {
1179        tmpOffset = (Long) offsetsInRange.firstKey();
1180        replacement =
1181            (String)entitiesMap.get((Character)offsets2CharsMap.get(tmpOffset));
1182        partText.append(docContStrBuff.substring(tmpLastOffset.intValue(),
1183                                               tmpOffset.intValue()));
1184        partText.append(replacement);
1185        tmpLastOffset = new Long(tmpOffset.longValue()+1);
1186      }
1187      partText.append(docContStrBuff.substring(tmpLastOffset.intValue(),
1188                                               offset.intValue()));
1189      resultStrBuff.append(partText);
1190      // Insert tmpBuff to the result string
1191      resultStrBuff.append(tmpBuff.toString());
1192      lastOffset = offset;
1193    }// End while(!offsets.isEmpty())
1194
1195    // get text to the end of content
1196    // extract text from content and replace spec chars
1197    StringBuffer partText = new StringBuffer();
1198    SortedMap offsetsInRange =
1199        offsets2CharsMap.subMap(lastOffset, new Long(docContStrBuff.length()));
1200    Long tmpOffset;
1201    Long tmpLastOffset = lastOffset;
1202    String replacement;
1203
1204    // Need to replace the entities in the remaining text, if there is any text
1205    // So, if there are any more items in offsets2CharsMap for remaining text
1206    // they need to be replaced
1207    if(!offsetsInRange.isEmpty()) {
1208      tmpOffset = (Long) offsetsInRange.firstKey();
1209      replacement =
1210          (String)entitiesMap.get((Character)offsets2CharsMap.get(tmpOffset));
1211      partText.append(docContStrBuff.substring(tmpLastOffset.intValue(),
1212                                             tmpOffset.intValue()));
1213      partText.append(replacement);
1214      tmpLastOffset = new Long(tmpOffset.longValue()+1);
1215    }
1216    partText.append(docContStrBuff.substring(tmpLastOffset.intValue(),
1217                                             docContStrBuff.length()));
1218    resultStrBuff.append(partText);
1219
1220    return resultStrBuff.toString();
1221  }// saveAnnotationSetAsXml()
1222
1223/* Old method created by Cristian. Create content backward.
1224
1225    private String saveAnnotationSetAsXml(List aDumpAnnotList,
1226                                          boolean includeFeatures){
1227      String content = null;
1228      if (this.getContent()== null)
1229        content = new String("");
1230      else
1231        content = this.getContent().toString();
1232      StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
1233      if (aDumpAnnotList == null)   return docContStrBuff.toString();
1234
1235      TreeMap offsets2CharsMap = new TreeMap();
1236      HashMap annotsForOffset = new HashMap(100);
1237      if (this.getContent().size().longValue() != 0){
1238        // Fill the offsets2CharsMap with all the indices where
1239        // special chars appear
1240        buildEntityMapFromString(content,offsets2CharsMap);
1241      }//End if
1242      // The saving alghorithm is as follows:
1243      ///////////////////////////////////////////
1244      // Construct a set of annot with all IDs in asc order.
1245      // All annotations that end at that offset swap their place in descending
1246      // order. For each node write all the tags from left to right.
1247
1248      // Construct the node set
1249      TreeSet offsets = new TreeSet();
1250      Iterator iter = aDumpAnnotList.iterator();
1251      while (iter.hasNext()){
1252        Annotation annot = (Annotation) iter.next();
1253        offsets.add(annot.getStartNode().getOffset());
1254        offsets.add(annot.getEndNode().getOffset());
1255        if (annotsForOffset.containsKey(annot.getStartNode().getOffset())) {
1256          ((List) annotsForOffset.get(annot.getStartNode().getOffset())).add(annot);
1257        } else {
1258          List newList = new ArrayList(10);
1259          newList.add(annot);
1260          annotsForOffset.put(annot.getStartNode().getOffset(), newList);
1261        }
1262        if (annotsForOffset.containsKey(annot.getEndNode().getOffset())) {
1263          ((List) annotsForOffset.get(annot.getEndNode().getOffset())).add(annot);
1264        } else {
1265          List newList = new ArrayList(10);
1266          newList.add(annot);
1267          annotsForOffset.put(annot.getEndNode().getOffset(), newList);
1268        }
1269      }// End while
1270
1271      // ofsets is sorted in ascending order.
1272      // Iterate this set in descending order and remove an offset at each
1273      // iteration
1274      while (!offsets.isEmpty()){
1275        Long offset = (Long)offsets.last();
1276        // Remove the offset from the set
1277        offsets.remove(offset);
1278        // Now, use it.
1279        // Returns a list with annotations that needs to be serialized in that
1280        // offset.
1281//      List annotations = getAnnotationsForOffset(aDumpAnnotList,offset);
1282        List annotations = (List) annotsForOffset.get(offset);
1283        annotations = getAnnotationsForOffset(annotations,offset);
1284        // Attention: the annotation are serialized from left to right
1285//      StringBuffer tmpBuff = new StringBuffer("");
1286        StringBuffer tmpBuff = new StringBuffer(
1287            DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
1288        Stack stack = new Stack();
1289        // Iterate through all these annotations and serialize them
1290        Iterator it = annotations.iterator();
1291        while(it.hasNext()){
1292          Annotation a = (Annotation) it.next();
1293          it.remove();
1294          // Test if a Ends at offset
1295          if ( offset.equals(a.getEndNode().getOffset()) ){
1296            // Test if a Starts at offset
1297            if ( offset.equals(a.getStartNode().getOffset()) ){
1298              // Here, the annotation a Starts and Ends at the offset
1299              if ( null != a.getFeatures().get("isEmptyAndSpan") &&
1300                   "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
1301
1302                // Assert: annotation a with start == end and isEmptyAndSpan
1303                tmpBuff.append(writeStartTag(a, includeFeatures));
1304                stack.push(a);
1305              }else{
1306                // Assert annotation a with start == end and an empty tag
1307                tmpBuff.append(writeEmptyTag(a));
1308                // The annotation is removed from dumped set
1309                aDumpAnnotList.remove(a);
1310              }// End if
1311            }else{
1312              // Here the annotation a Ends at the offset.
1313              // In this case empty the stack and write the end tag
1314              if (!stack.isEmpty()){
1315                while(!stack.isEmpty()){
1316                  Annotation a1 = (Annotation)stack.pop();
1317                  tmpBuff.append(writeEndTag(a1));
1318                }// End while
1319              }// End if
1320              tmpBuff.append(writeEndTag(a));
1321            }// End if
1322          }else{
1323            // The annotation a does NOT end at the offset. Let's see if it starts
1324            // at the offset
1325            if ( offset.equals(a.getStartNode().getOffset()) ){
1326              // The annotation a starts at the offset.
1327              // In this case empty the stack and write the end tag
1328              if (!stack.isEmpty()){
1329                while(!stack.isEmpty()){
1330                  Annotation a1 = (Annotation)stack.pop();
1331                  tmpBuff.append(writeEndTag(a1));
1332                }// End while
1333              }// End if
1334              tmpBuff.append(writeStartTag(a, includeFeatures));
1335              // The annotation is removed from dumped set
1336              aDumpAnnotList.remove(a);
1337            }// End if ( offset.equals(a.getStartNode().getOffset()) )
1338          }// End if ( offset.equals(a.getEndNode().getOffset()) )
1339        }// End while(it.hasNext()){
1340
1341        // In this case empty the stack and write the end tag
1342        if (!stack.isEmpty()){
1343          while(!stack.isEmpty()){
1344            Annotation a1 = (Annotation)stack.pop();
1345            tmpBuff.append(writeEndTag(a1));
1346          }// End while
1347        }// End if
1348
1349        // Before inserting tmpBuff into docContStrBuff we need to check
1350        // if there are chars to be replaced and if there are, they would be
1351        // replaced.
1352        if (!offsets2CharsMap.isEmpty()){
1353          Long offsChar = (Long) offsets2CharsMap.lastKey();
1354          while( !offsets2CharsMap.isEmpty() &&
1355                         offsChar.intValue() >= offset.intValue()){
1356            // Replace the char at offsChar with its corresponding entity form
1357            // the entitiesMap.
1358            docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
1359            (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1360            // Discard the offsChar after it was used.
1361            offsets2CharsMap.remove(offsChar);
1362            // Investigate next offsChar
1363            if (!offsets2CharsMap.isEmpty())
1364              offsChar = (Long) offsets2CharsMap.lastKey();
1365          }// End while
1366        }// End if
1367        // Insert tmpBuff to the location where it belongs in docContStrBuff
1368        docContStrBuff.insert(offset.intValue(),tmpBuff.toString());
1369      }// End while(!offsets.isEmpty())
1370      // Need to replace the entities in the remaining text, if there is any text
1371      // So, if there are any more items in offsets2CharsMap they need to be
1372      // replaced
1373      while (!offsets2CharsMap.isEmpty()){
1374        Long offsChar = (Long) offsets2CharsMap.lastKey();
1375        // Replace the char with its entity
1376        docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
1377        (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1378        // remove the offset from the map
1379        offsets2CharsMap.remove(offsChar);
1380      }// End while
1381      return docContStrBuff.toString();
1382    }// saveAnnotationSetAsXml()
1383*/
1384
1385  /**
1386   *  Return true only if the document has features for original content and
1387   *  repositioning information.
1388   */
1389  private boolean hasOriginalContentFeatures() {
1390    FeatureMap features = getFeatures();
1391    boolean result = false;
1392
1393    result =
1394    (features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME) != null)
1395      &&
1396    (features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME)
1397      != null);
1398
1399    return result;
1400  } // hasOriginalContentFeatures
1401
  /** This method saves all the annotations from aSourceAnnotationSet and
    * combines them with the original document content, if preserved as feature.
    * @param aSourceAnnotationSet is a GATE annotation set prepared to be used
    * on the raw text from document content. If aSourceAnnotationSet is
    * <b>null</b> then the preserved original content (or an empty string when
    * none was preserved) will be returned without any annotation markup.
    * @param includeFeatures is a boolean, which controls whether the annotation
    * features and gate ID are included or not.
    * @return The XML document obtained from the original content + the
    * information from the source annotation set.
    */
1412  private String saveAnnotationSetAsXmlInOrig(Set aSourceAnnotationSet,
1413                                        boolean includeFeatures){
1414    StringBuffer docContStrBuff;
1415
1416    String origContent;
1417
1418    origContent =
1419     (String)features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
1420    if(origContent == null) {
1421      origContent = "";
1422    } // if
1423
1424    long originalContentSize = origContent.length();
1425
1426    RepositioningInfo repositioning = (RepositioningInfo)
1427      getFeatures().get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);
1428
1429    docContStrBuff = new StringBuffer(origContent);
1430    if (aSourceAnnotationSet == null) return docContStrBuff.toString();
1431
1432    StatusListener sListener = (StatusListener)
1433                               gate.gui.MainFrame.getListeners().
1434                               get("gate.event.StatusListener");
1435
1436    AnnotationSet originalMarkupsAnnotSet =
1437            this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1438    // Create a dumping annotation set on the document. It will be used for
1439    // dumping annotations...
1440    AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this);
1441    if(sListener != null)
1442      sListener.statusChanged("Constructing the dumping annotation set.");
1443    // Then take all the annotations from aSourceAnnotationSet and verify if
1444    // they can be inserted safely into the dumpingSet. Where not possible,
1445    // report.
1446    if (aSourceAnnotationSet != null){
1447      Iterator iter = aSourceAnnotationSet.iterator();
1448      Annotation currentAnnot;
1449      while (iter.hasNext()){
1450        currentAnnot = (Annotation) iter.next();
1451        if(insertsSafety(originalMarkupsAnnotSet, currentAnnot)
1452            && insertsSafety(dumpingSet, currentAnnot)){
1453          dumpingSet.add(currentAnnot);
1454        }else{
1455          Out.prln("Warning: Annotation with ID=" + currentAnnot.getId() +
1456          ", startOffset=" + currentAnnot.getStartNode().getOffset() +
1457          ", endOffset=" + currentAnnot.getEndNode().getOffset() +
1458          ", type=" + currentAnnot.getType()+ " was found to violate the" +
1459          " crossed over condition. It will be discarded");
1460        }// End if
1461      }// End while
1462    }// End if
1463
1464    // The dumpingSet is ready to be exported as XML
1465    // Here we go.
1466    if(sListener != null) sListener.statusChanged("Dumping annotations as XML");
1467
1468    ///////////////////////////////////////////
1469    // Construct a set of annot with all IDs in asc order.
1470    // All annotations that end at that offset swap their place in descending
1471    // order. For each node write all the tags from left to right.
1472
1473    // Construct the node set
1474    TreeSet offsets = new TreeSet();
1475    Iterator iter = aSourceAnnotationSet.iterator();
1476    while (iter.hasNext()){
1477      Annotation annot = (Annotation) iter.next();
1478      offsets.add(annot.getStartNode().getOffset());
1479      offsets.add(annot.getEndNode().getOffset());
1480    }// End while
1481
1482    // ofsets is sorted in ascending order.
1483    // Iterate this set in descending order and remove an offset at each
1484    // iteration
1485    while (!offsets.isEmpty()){
1486      Long offset = (Long)offsets.last();
1487      // Remove the offset from the set
1488      offsets.remove(offset);
1489      // Now, use it.
1490      // Returns a list with annotations that needs to be serialized in that
1491      // offset.
1492      List annotations = getAnnotationsForOffset(aSourceAnnotationSet,offset);
1493      // Attention: the annotation are serialized from left to right
1494      StringBuffer tmpBuff = new StringBuffer("");
1495      Stack stack = new Stack();
1496      // Iterate through all these annotations and serialize them
1497      Iterator it = annotations.iterator();
1498      Annotation a = null;
1499      while(it.hasNext()) {
1500        a = (Annotation) it.next();
1501        it.remove();
1502        // Test if a Ends at offset
1503        if ( offset.equals(a.getEndNode().getOffset()) ){
1504          // Test if a Starts at offset
1505          if ( offset.equals(a.getStartNode().getOffset()) ){
1506            // Here, the annotation a Starts and Ends at the offset
1507            if ( null != a.getFeatures().get("isEmptyAndSpan") &&
1508                 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
1509
1510              // Assert: annotation a with start == end and isEmptyAndSpan
1511              tmpBuff.append(writeStartTag(a, includeFeatures, false));
1512              stack.push(a);
1513            }else{
1514              // Assert annotation a with start == end and an empty tag
1515              tmpBuff.append(writeEmptyTag(a, false));
1516              // The annotation is removed from dumped set
1517              aSourceAnnotationSet.remove(a);
1518            }// End if
1519          }else{
1520            // Here the annotation a Ends at the offset.
1521            // In this case empty the stack and write the end tag
1522            while(!stack.isEmpty()){
1523              Annotation a1 = (Annotation)stack.pop();
1524              tmpBuff.append(writeEndTag(a1));
1525            }// End while
1526            tmpBuff.append(writeEndTag(a));
1527          }// End if
1528        }else{
1529          // The annotation a does NOT end at the offset. Let's see if it starts
1530          // at the offset
1531          if ( offset.equals(a.getStartNode().getOffset()) ){
1532            // The annotation a starts at the offset.
1533            // In this case empty the stack and write the end tag
1534            while(!stack.isEmpty()){
1535              Annotation a1 = (Annotation)stack.pop();
1536              tmpBuff.append(writeEndTag(a1));
1537            }// End while
1538
1539            tmpBuff.append(writeStartTag(a, includeFeatures, false));
1540            // The annotation is removed from dumped set
1541            aSourceAnnotationSet.remove(a);
1542          }// End if ( offset.equals(a.getStartNode().getOffset()) )
1543        }// End if ( offset.equals(a.getEndNode().getOffset()) )
1544      }// End while(it.hasNext()){
1545
1546      // In this case empty the stack and write the end tag
1547      while(!stack.isEmpty()){
1548        Annotation a1 = (Annotation)stack.pop();
1549        tmpBuff.append(writeEndTag(a1));
1550      }// End while
1551
1552      long originalPosition = -1;
1553      boolean backPositioning =
1554        a != null && offset.equals(a.getEndNode().getOffset());
1555      if ( backPositioning ) {
1556        // end of the annotation correction
1557        originalPosition =
1558          repositioning.getOriginalPos(offset.intValue(), true);
1559      } // if
1560
1561      if(originalPosition == -1) {
1562        originalPosition = repositioning.getOriginalPos(offset.intValue());
1563      } // if
1564
1565      // Insert tmpBuff to the location where it belongs in docContStrBuff
1566      if(originalPosition != -1 && originalPosition <= originalContentSize ) {
1567        docContStrBuff.insert((int) originalPosition, tmpBuff.toString());
1568      }
1569      else {
1570        Out.prln("Error in the repositioning. The offset ("+offset.intValue()
1571        +") could not be positioned in the original document. \n"
1572        +"Calculated position is: "+originalPosition
1573        +" placed back: "+backPositioning);
1574      } // if
1575
1576    }// End while(!offsets.isEmpty())
1577    if (theRootAnnotation != null)
1578      docContStrBuff.append(writeEndTag(theRootAnnotation));
1579    return docContStrBuff.toString();
1580  } // saveAnnotationSetAsXmlInOrig()
1581
1582  /** This method returns a list with annotations ordered that way that
1583    * they can be serialized from left to right, at the offset. If one of the
1584    * params is null then an empty list will be returned.
1585    * @param aDumpAnnotSet is a set containing all annotations that will be
1586    * dumped.
1587    * @param offset represent the offset at witch the annotation must start
1588    * AND/OR end.
1589    * @return a list with those annotations that need to be serialized.
1590    */
1591  private List getAnnotationsForOffset(Set aDumpAnnotSet, Long offset){
1592    List annotationList = new LinkedList();
1593    if (aDumpAnnotSet == null || offset == null) return annotationList;
1594    Set annotThatStartAtOffset = new TreeSet(
1595                          new AnnotationComparator(ORDER_ON_END_OFFSET,DESC));
1596    Set annotThatEndAtOffset = new TreeSet(
1597                          new AnnotationComparator(ORDER_ON_START_OFFSET,DESC));
1598    Set annotThatStartAndEndAtOffset = new TreeSet(
1599                          new AnnotationComparator(ORDER_ON_ANNOT_ID,ASC));
1600
1601    // Fill these tree lists with annotation tat start, end or start and
1602    // end at the offset.
1603    Iterator iter = aDumpAnnotSet.iterator();
1604    while(iter.hasNext()){
1605      Annotation ann = (Annotation) iter.next();
1606      if (offset.equals(ann.getStartNode().getOffset())){
1607        if (offset.equals(ann.getEndNode().getOffset()))
1608          annotThatStartAndEndAtOffset.add(ann);
1609        else
1610          annotThatStartAtOffset.add(ann);
1611      }else{
1612        if (offset.equals(ann.getEndNode().getOffset()))
1613          annotThatEndAtOffset.add(ann);
1614      }// End if
1615    }// End while
1616    annotationList.addAll(annotThatEndAtOffset);
1617    annotThatEndAtOffset = null;
1618    annotationList.addAll(annotThatStartAtOffset);
1619    annotThatStartAtOffset = null;
1620    iter = annotThatStartAndEndAtOffset.iterator();
1621    while(iter.hasNext()){
1622      Annotation ann = (Annotation) iter.next();
1623      Iterator it = annotationList.iterator();
1624      boolean breaked = false;
1625      while (it.hasNext()){
1626        Annotation annFromList = (Annotation) it.next();
1627        if (annFromList.getId().intValue() > ann.getId().intValue()){
1628          annotationList.add(annotationList.indexOf(annFromList),ann);
1629          breaked = true;
1630          break;
1631        }// End if
1632      }// End while
1633      if (!breaked)
1634        annotationList.add(ann);
1635      iter.remove();
1636    }// End while
1637    return annotationList;
1638  }// getAnnotationsForOffset()
1639
1640  private List getAnnotationsForOffset(List aDumpAnnotList, Long offset){
1641    List annotationList = new ArrayList();
1642    if (aDumpAnnotList == null || offset == null) return annotationList;
1643    Set annotThatStartAtOffset;
1644    Set annotThatEndAtOffset;
1645    Set annotThatStartAndEndAtOffset;
1646    annotThatStartAtOffset = new TreeSet(
1647        new AnnotationComparator(ORDER_ON_END_OFFSET, DESC));
1648    annotThatEndAtOffset = new TreeSet(
1649        new AnnotationComparator(ORDER_ON_START_OFFSET, DESC));
1650    annotThatStartAndEndAtOffset = new TreeSet(
1651        new AnnotationComparator(ORDER_ON_ANNOT_ID, ASC));
1652
1653    // Fill these tree lists with annotation tat start, end or start and
1654    // end at the offset.
1655    Iterator iter = aDumpAnnotList.iterator();
1656    while(iter.hasNext()){
1657      Annotation ann = (Annotation) iter.next();
1658      if (offset.equals(ann.getStartNode().getOffset())){
1659        if (offset.equals(ann.getEndNode().getOffset()))
1660          annotThatStartAndEndAtOffset.add(ann);
1661        else
1662          annotThatStartAtOffset.add(ann);
1663      }else{
1664        if (offset.equals(ann.getEndNode().getOffset()))
1665          annotThatEndAtOffset.add(ann);
1666      }// End if
1667    }// End while
1668
1669    annotationList.addAll(annotThatEndAtOffset);
1670    annotationList.addAll(annotThatStartAtOffset);
1671    annotThatEndAtOffset = null;
1672    annotThatStartAtOffset = null;
1673
1674    iter = annotThatStartAndEndAtOffset.iterator();
1675    while(iter.hasNext()){
1676      Annotation ann = (Annotation) iter.next();
1677      Iterator it = annotationList.iterator();
1678      boolean breaked = false;
1679      while (it.hasNext()){
1680        Annotation annFromList = (Annotation) it.next();
1681        if (annFromList.getId().intValue() > ann.getId().intValue()){
1682          annotationList.add(annotationList.indexOf(annFromList),ann);
1683          breaked = true;
1684          break;
1685        }// End if
1686      }// End while
1687      if (!breaked)
1688        annotationList.add(ann);
1689      iter.remove();
1690    }// End while
1691    return annotationList;
1692  }// getAnnotationsForOffset()
1693
1694  private String writeStartTag(Annotation annot, boolean includeFeatures){
1695    return writeStartTag(annot, includeFeatures, true);
1696  } // writeStartTag
1697
1698  /** Returns a string representing a start tag based on the input annot*/
1699  private String writeStartTag(Annotation annot, boolean includeFeatures,
1700                                boolean includeNamespace){
1701    AnnotationSet originalMarkupsAnnotSet =
1702            this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1703
1704    StringBuffer strBuff = new StringBuffer("");
1705    if (annot == null) return strBuff.toString();
1706//    if (!addGatePreserveFormatTag && isRootTag){
1707      if (theRootAnnotation != null && annot.getId().equals(theRootAnnotation.getId())){
1708      //the features are included either if desired or if that's an annotation
1709      //from the original markup of the document. We don't want for example to
1710      //spoil all links in an HTML file!
1711      if (includeFeatures) {
1712        strBuff.append("<");
1713        strBuff.append(annot.getType());
1714        strBuff.append(" ");
1715        if(includeNamespace) {
1716          strBuff.append(" xmlns:gate=\"http://www.gate.ac.uk\"");
1717          strBuff.append(" gate:");
1718        }
1719        strBuff.append("gateId=\"");
1720        strBuff.append(annot.getId());
1721        strBuff.append("\"");
1722        strBuff.append(" ");
1723        if(includeNamespace) {
1724          strBuff.append("gate:");
1725        }
1726        strBuff.append("annotMaxId=\"");
1727        strBuff.append(nextAnnotationId);
1728        strBuff.append("\"");
1729        strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1730        strBuff.append(">");
1731      }
1732      else if (originalMarkupsAnnotSet.contains(annot)) {
1733          strBuff.append("<");
1734          strBuff.append(annot.getType());
1735          strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1736          strBuff.append(">");
1737        }
1738      else {
1739        strBuff.append("<");
1740        strBuff.append(annot.getType());
1741        strBuff.append(">");
1742      }
1743
1744    }else{
1745      //the features are included either if desired or if that's an annotation
1746      //from the original markup of the document. We don't want for example to
1747      //spoil all links in an HTML file!
1748      if (includeFeatures) {
1749        strBuff.append("<");
1750        strBuff.append(annot.getType());
1751        strBuff.append(" ");
1752        if(includeNamespace) {
1753          strBuff.append("gate:");
1754        } // if includeNamespaces
1755        strBuff.append("gateId=\"");
1756        strBuff.append(annot.getId());
1757        strBuff.append("\"");
1758        strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1759        strBuff.append(">");
1760      }
1761      else if (originalMarkupsAnnotSet.contains(annot)) {
1762        strBuff.append("<");
1763        strBuff.append(annot.getType());
1764        strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1765        strBuff.append(">");
1766      }
1767      else {
1768        strBuff.append("<");
1769        strBuff.append(annot.getType());
1770        strBuff.append(">");
1771      }
1772    }// End if
1773    return strBuff.toString();
1774  }// writeStartTag()
1775
1776  /**
1777   * Identifies the root annotations inside an annotation set.
1778   * The root annotation is the one that starts at offset 0, and has the
1779   * greatest span. If there are more than one with this function, then the
1780   * annotation with the smalled ID wil be selected as root.
1781   * If none is identified it will return null.
1782   * @param anAnnotationSet The annotation set possibly containing
1783   *  the root annotation.
1784   * @return The root annotation or null is it fails
1785   */
1786  private Annotation identifyTheRootAnnotation(AnnotationSet anAnnotationSet){
1787    if (anAnnotationSet == null) return null;
1788    // If the starting node of this annotation is not null, then the annotation
1789    // set will not have a root annotation.
1790    Node startNode = anAnnotationSet.firstNode();
1791    Node endNode = anAnnotationSet.lastNode();
1792    // This is placed here just to speed things up. The alghorithm bellow can
1793    // can identity the annotation that span over the entire set and with the
1794    // smallest ID. However the root annotation will have to have the start
1795    // offset equal to 0.
1796    if (startNode.getOffset().longValue() != 0) return null;
1797    // Go anf find the annotation.
1798    Annotation theRootAnnotation = null;
1799    // Check if there are annotations starting at offset 0. If there are, then
1800    // check all of them to see which one has the greatest span. Basically its
1801    // END offset should be the bigest offset from the input annotation set.
1802    long start = startNode.getOffset().longValue();
1803    long end = endNode.getOffset().longValue();
1804    for(Iterator it = anAnnotationSet.iterator(); it.hasNext();){
1805      Annotation currentAnnot = (Annotation) it.next();
1806      // If the currentAnnot has both its Start and End equals to the Start and
1807      // end of the AnnotationSet then check to see if its ID is the smallest.
1808      if (
1809          (start == currentAnnot.getStartNode().getOffset().longValue()) &&
1810          (end   == currentAnnot.getEndNode().getOffset().longValue())
1811         ){
1812          // The currentAnnotation has is a potencial root one.
1813          if (theRootAnnotation == null)
1814            theRootAnnotation = currentAnnot;
1815          else{
1816            // If its ID is greater that the currentAnnot then update the root
1817            if ( theRootAnnotation.getId().intValue() > currentAnnot.getId().intValue())
1818              theRootAnnotation = currentAnnot;
1819          }// End if
1820      }// End if
1821    }// End for
1822    return theRootAnnotation;
1823  }// End identifyTheRootAnnotation()
1824
1825  private Annotation identifyTheRootAnnotation(List anAnnotationList){
1826    if (anAnnotationList == null || anAnnotationList.isEmpty()) return null;
1827    // If the first annotation in the list (which is sorted by start offset)
1828    //does not have an offset = 0, then there's no root tag.
1829    if(((Annotation)anAnnotationList.get(0)).
1830       getStartNode().getOffset().longValue() > 0) return null;
1831
1832    //find the limits
1833    long start = 0; //we know this already
1834    long end = 0; //end = 0  will be improved by the next loop
1835    for(int i = 0; i < anAnnotationList.size(); i++){
1836      Annotation anAnnotation = (Annotation)anAnnotationList.get(i);
1837      long localEnd = anAnnotation.getEndNode().getOffset().longValue();
1838      if(localEnd > end) end = localEnd;
1839    }
1840
1841    // Go and find the annotation.
1842    //look at all annotations that start at 0 and end at end
1843    //if there are several, choose the one with the smallest ID
1844    Annotation theRootAnnotation = null;
1845    for(int i = 0; i < anAnnotationList.size(); i++){
1846      Annotation currentAnnot = (Annotation) anAnnotationList.get(i);
1847      long localStart = currentAnnot.getStartNode().getOffset().longValue();
1848      long localEnd = currentAnnot.getEndNode().getOffset().longValue();
1849      // If the currentAnnot has both its Start and End equals to the Start and
1850      // end of the AnnotationSet then check to see if its ID is the smallest.
1851      if (
1852          (start == localStart) && (end == localEnd)){
1853          // The currentAnnotation has is a potential root one.
1854          if (theRootAnnotation == null) theRootAnnotation = currentAnnot;
1855          else{
1856            // If root's ID is greater that the currentAnnot then update the root
1857            if (theRootAnnotation.getId().intValue() > currentAnnot.getId().intValue())
1858              theRootAnnotation = currentAnnot;
1859          }// End if
1860      }// End if
1861    }// End for
1862    return theRootAnnotation;
1863  }// End identifyTheRootAnnotation()
1864
1865
1866  /** This method takes aScanString and searches for those chars from
1867    * entitiesMap that appear in the string. A tree map(offset2Char) is filled
1868    * using as key the offsets where those Chars appear and the Char.
1869    * If one of the params is null the method simply returns.
1870    */
1871  private void buildEntityMapFromString(String aScanString, TreeMap aMapToFill){
1872    if (aScanString == null || aMapToFill == null) return;
1873    if (entitiesMap == null || entitiesMap.isEmpty()){
1874      Err.prln("WARNING: Entities map was not initialised !");
1875      return;
1876    }// End if
1877    // Fill the Map with the offsets of the special chars
1878    Iterator entitiesMapIterator = entitiesMap.keySet().iterator();
1879    Character c;
1880    int fromIndex;
1881    while(entitiesMapIterator.hasNext()){
1882      c = (Character) entitiesMapIterator.next();
1883      fromIndex = 0;
1884      while (-1 != fromIndex){
1885        fromIndex = aScanString.indexOf(c.charValue(),fromIndex);
1886        if (-1 != fromIndex){
1887          aMapToFill.put(new Long(fromIndex),c);
1888          fromIndex ++;
1889        }// End if
1890      }// End while
1891    }// End while
1892  }//buildEntityMapFromString();
1893
1894  private String writeEmptyTag(Annotation annot){
1895    return writeEmptyTag(annot, true);
1896  } // writeEmptyTag
1897
1898  /** Returns a string representing an empty tag based on the input annot*/
1899  private String writeEmptyTag(Annotation annot, boolean includeNamespace){
1900    StringBuffer strBuff = new StringBuffer("");
1901    if (annot == null) return strBuff.toString();
1902
1903    strBuff.append("<");
1904    strBuff.append(annot.getType());
1905
1906    AnnotationSet originalMarkupsAnnotSet =
1907            this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1908    if (! originalMarkupsAnnotSet.contains(annot)) {
1909      strBuff.append(" gateId=\"");
1910      strBuff.append(annot.getId());
1911      strBuff.append("\"");
1912    }
1913    strBuff.append(writeFeatures(annot.getFeatures(),includeNamespace));
1914    strBuff.append("/>");
1915
1916    return strBuff.toString();
1917  }// writeEmptyTag()
1918
1919  /** Returns a string representing an end tag based on the input annot*/
1920  private String writeEndTag(Annotation annot){
1921    StringBuffer strBuff = new StringBuffer("");
1922    if (annot == null) return strBuff.toString();
1923/*
1924    if (annot.getType().indexOf(" ") != -1)
1925      Out.prln("Warning: Truncating end tag to first word for annot type \""
1926      +annot.getType()+ "\". ");
1927*/
1928    strBuff.append("</"+annot.getType()+">");
1929
1930    return strBuff.toString();
1931  }// writeEndTag()
1932
1933  /** Returns a string representing a FeatureMap serialized as XML attributes*/
1934  private String writeFeatures(FeatureMap feat, boolean includeNamespace){
1935    StringBuffer strBuff = new StringBuffer("");
1936    if (feat == null) return strBuff.toString();
1937    Iterator it = feat.keySet().iterator();
1938    while (it.hasNext()){
1939      Object key = it.next();
1940      Object value = feat.get(key);
1941      if ( (key != null) && (value != null) ){
1942        // Eliminate a feature inserted at reading time and which help to
1943        // take some decissions at saving time
1944        if ("isEmptyAndSpan".equals(key.toString()))
1945          continue;
1946        if( !(String.class.isAssignableFrom(key.getClass()) ||
1947              Number.class.isAssignableFrom(key.getClass()))){
1948
1949            Out.prln("Warning:Found a feature NAME("+key+") that doesn't came"+
1950                             " from String or Number.(feature discarded)");
1951            continue;
1952        }// End if
1953        if ( !(String.class.isAssignableFrom(value.getClass()) ||
1954               Number.class.isAssignableFrom(value.getClass()) ||
1955               java.util.Collection.class.isAssignableFrom(value.getClass()))){
1956
1957            Out.prln("Warning:Found a feature VALUE("+value+") that doesn't came"+
1958                       " from String, Number or Collection.(feature discarded)");
1959            continue;
1960        }// End if
1961        if ("matches".equals(key)) {
1962          strBuff.append(" ");
1963          if(includeNamespace) {
1964            strBuff.append("gate:");
1965          }
1966//          strBuff.append(key);
1967          // replace non XML chars in attribute name
1968          strBuff.append(
1969            filterNonXmlChars(replaceCharsWithEntities(key.toString())));
1970          strBuff.append("=\"");
1971        }
1972        else {
1973          strBuff.append(" ");
1974//          strBuff.append(key);
1975          // replace non XML chars in attribute name
1976          strBuff.append(
1977            filterNonXmlChars(replaceCharsWithEntities(key.toString())));
1978          strBuff.append("=\"");
1979        }
1980        if (java.util.Collection.class.isAssignableFrom(value.getClass())){
1981          Iterator valueIter = ((Collection)value).iterator();
1982          while(valueIter.hasNext()){
1983            Object item = valueIter.next();
1984            if (!(String.class.isAssignableFrom(item.getClass()) ||
1985                  Number.class.isAssignableFrom(item.getClass())))
1986                  continue;
1987//            strBuff.append(item);
1988            // replace non XML chars in collection item
1989            strBuff.append(
1990              filterNonXmlChars(replaceCharsWithEntities(item.toString())));
1991            strBuff.append(";");
1992          }// End while
1993          if (strBuff.charAt(strBuff.length()-1) == ';')
1994            strBuff.deleteCharAt(strBuff.length()-1);
1995        }else{
1996//          strBuff.append(value);
1997          // replace non XML chars in attribute value
1998          strBuff.append(
1999            filterNonXmlChars(replaceCharsWithEntities(value.toString())));
2000        }// End if
2001        strBuff.append("\"");
2002      }// End if
2003    }// End while
2004    return strBuff.toString();
2005  }// writeFeatures()
2006
2007  /** Returns a GateXml document that is a custom XML format for wich there is
2008    * a reader inside GATE called gate.xml.GateFormatXmlHandler.
2009    * What it does is to serialize a GATE document in an XML format.
2010    * @return a string representing a Gate Xml document.
2011    */
2012  public String toXml(){
2013    // Initialize the xmlContent with 3 time the size of the current document.
2014    // This is because of the tags size. This measure is made to increase the
2015    // performance of StringBuffer.
2016    StringBuffer xmlContent = new StringBuffer(
2017         DOC_SIZE_MULTIPLICATION_FACTOR*(getContent().size().intValue()));
2018    // Add xml header
2019    xmlContent.append("<?xml version=\"1.0\" encoding=\"");
2020    xmlContent.append(getEncoding());
2021    xmlContent.append("\" ?>");
2022    xmlContent.append(Strings.getNl());
2023
2024    // Add the root element
2025    xmlContent.append("<GateDocument>\n");
2026    xmlContent.append("<!-- The document's features-->\n\n");
2027    xmlContent.append("<GateDocumentFeatures>\n");
2028
2029    xmlContent.append(featuresToXml(this.getFeatures()));
2030    xmlContent.append("</GateDocumentFeatures>\n");
2031    xmlContent.append("<!-- The document content area with serialized"+
2032                      " nodes -->\n\n");
2033    // Add plain text element
2034    xmlContent.append("<TextWithNodes>");
2035    xmlContent.append(textWithNodes(this.getContent().toString()));
2036    xmlContent.append("</TextWithNodes>\n");
2037    // Serialize as XML all document's annotation sets
2038    // Serialize the default AnnotationSet
2039    StatusListener sListener = (StatusListener)
2040                               gate.gui.MainFrame.getListeners().
2041                               get("gate.event.StatusListener");
2042    if(sListener != null)
2043      sListener.statusChanged("Saving the default annotation set ");
2044    xmlContent.append("<!-- The default annotation set -->\n\n");
2045    xmlContent.append(annotationSetToXml(this.getAnnotations()));
2046    // Serialize all others AnnotationSets
2047    // namedAnnotSets is a Map containing all other named Annotation Sets.
2048    if (namedAnnotSets != null){
2049      Iterator iter = namedAnnotSets.values().iterator();
2050      while(iter.hasNext()){
2051        AnnotationSet annotSet = (AnnotationSet) iter.next();
2052        xmlContent.append("<!-- Named annotation set -->\n\n");
2053        // Serialize it as XML
2054        if(sListener != null) sListener.statusChanged("Saving " +
2055                                                      annotSet.getName()+
2056                                                      " annotation set ");
2057        xmlContent.append(annotationSetToXml(annotSet));
2058      }// End while
2059    }// End if
2060    // Add the end of GateDocument
2061    xmlContent.append("</GateDocument>");
2062    if(sListener != null) sListener.statusChanged("Done !");
2063    // return the XmlGateDocument
2064    return xmlContent.toString();
2065  }// toXml
2066
2067  /** This method filters any non XML char
2068    * see: http://www.w3c.org/TR/2000/REC-xml-20001006#charsets
2069    * All non XML chars will be replaced with 0x20 (space char) This assures
2070    * that the next time the document is loaded there won't be any problems.
2071    * @param aStrBuffer represents the input String that is filtred. If the
2072    * aStrBuffer is null then an empty string will be returend
2073    * @return the "purified" StringBuffer version of the aStrBuffer
2074    */
2075  private StringBuffer filterNonXmlChars(StringBuffer aStrBuffer){
2076    if (aStrBuffer == null) return new StringBuffer("");
2077//    String space = new String(" ");
2078    char space = ' ';
2079    for (int i=aStrBuffer.length()-1;i>=0; i--){
2080      if (!isXmlChar(aStrBuffer.charAt(i)))
2081        aStrBuffer.setCharAt(i, space);
2082    }// End for
2083    return aStrBuffer;
2084  }// filterNonXmlChars()
2085
2086  /** This method decide if a char is a valid XML one or not
2087    * @param ch the char to be tested
2088    * @return true if is a valid XML char and fals if is not.
2089    */
2090  public static boolean isXmlChar(char ch){
2091    if (ch == 0x9 || ch == 0xA || ch ==0xD) return true;
2092    if ((0x20 <= ch) && (ch <= 0xD7FF)) return true;
2093    if ((0xE000 <= ch) && (ch <= 0xFFFD)) return true;
2094    if ((0x10000 <= ch) && (ch <= 0x10FFFF)) return true;
2095    return false;
2096  }// End isXmlChar()
2097
2098  /** This method saves a FeatureMap as XML elements.
2099    * @param aFeatureMap the feature map that has to be saved as XML.
2100    * @return a String like this: <Feature><Name>...</Name>
2101    * <Value>...</Value></Feature><Feature>...</Feature>
2102    */
2103  private String featuresToXml(FeatureMap aFeatureMap){
2104    StringBuffer str = new StringBuffer("");
2105
2106    if (aFeatureMap == null) return str.toString();
2107
2108    Set keySet = aFeatureMap.keySet();
2109    Iterator keyIterator = keySet.iterator();
2110    while(keyIterator.hasNext()){
2111      Object key = keyIterator.next();
2112      Object value = aFeatureMap.get(key);
2113      if ((key != null) && (value != null)){
2114        String keyClassName = null;
2115        String keyItemClassName = null;
2116        String valueClassName = null;
2117        String valueItemClassName = null;
2118        String key2String = key.toString();
2119        String value2String = value.toString();
2120
2121        Object item = null;
2122        // Test key if it is String, Number or Collection
2123        if (key instanceof java.lang.String ||
2124            key instanceof java.lang.Number ||
2125            key instanceof java.util.Collection)
2126          keyClassName = key.getClass().getName();
2127
2128        // Test value if it is String, Number or Collection
2129        if (value instanceof java.lang.String ||
2130            value instanceof java.lang.Number ||
2131            value instanceof java.util.Collection)
2132          valueClassName = value.getClass().getName();
2133
2134        // Features and values that are not Strings, Numbers or collections
2135        // will be discarded.
2136        if (keyClassName == null || valueClassName == null) continue;
2137
2138        // If key is collection serialize the colection in a specific format
2139        if (key instanceof java.util.Collection){
2140          StringBuffer keyStrBuff = new StringBuffer("");
2141          Iterator iter = ((Collection) key).iterator();
2142          if (iter.hasNext()){
2143            item = iter.next();
2144            if (item instanceof java.lang.Number)
2145              keyItemClassName = item.getClass().getName();
2146            else
2147              keyItemClassName = String.class.getName();
2148            keyStrBuff.append(item.toString());
2149          }// End if
2150          while (iter.hasNext()){
2151            item = iter.next();
2152            keyStrBuff.append(";" + item.toString());
2153          }// End while
2154          key2String = keyStrBuff.toString();
2155        }// End if
2156        // If key is collection serialize the colection in a specific format
2157        if (value instanceof java.util.Collection){
2158          StringBuffer valueStrBuff = new StringBuffer("");
2159          Iterator iter = ((Collection) value).iterator();
2160          if (iter.hasNext()){
2161            item = iter.next();
2162            if (item instanceof java.lang.Number)
2163              valueItemClassName = item.getClass().getName();
2164            else
2165              valueItemClassName = String.class.getName();
2166            valueStrBuff.append(item.toString());
2167          }// End if
2168          while (iter.hasNext()){
2169            item = iter.next();
2170            valueStrBuff.append(";" + item.toString());
2171          }// End while
2172          value2String = valueStrBuff.toString();
2173        }// End if
2174        str.append("<Feature>\n  <Name");
2175        if (keyClassName != null)
2176          str.append(" className=\""+keyClassName+"\"");
2177        if (keyItemClassName != null)
2178          str.append(" itemClassName=\""+keyItemClassName+"\"");
2179        str.append(">");
2180        str.append(filterNonXmlChars(replaceCharsWithEntities(key2String)));
2181        str.append("</Name>\n  <Value");
2182        if (valueClassName != null)
2183          str.append(" className=\"" + valueClassName + "\"");
2184        if (valueItemClassName != null)
2185          str.append(" itemClassName=\"" + valueItemClassName + "\"");
2186        str.append(">");
2187        str.append(filterNonXmlChars(replaceCharsWithEntities(value2String)));
2188        str.append("</Value>\n</Feature>\n");
2189      }// End if
2190    }// end While
2191    return str.toString();
2192  }//featuresToXml
2193
  /** Replaces every char of anInputString that has an entry in the
    * entitiesMap with its corresponding entity.
    * @param anInputString the string to be analyzed. If it is null, an empty
    *  buffer is returned
    * @return a buffer holding the input string with the mapped chars replaced
    *  by their entities
    */
2201  private StringBuffer replaceCharsWithEntities(String anInputString){
2202    if (anInputString == null) return new StringBuffer("");
2203    StringBuffer strBuff = new StringBuffer(anInputString);
2204    for (int i=strBuff.length()-1; i>=0; i--){
2205      Character ch = new Character(strBuff.charAt(i));
2206      if (entitiesMap.keySet().contains(ch)){
2207        strBuff.replace(i,i+1,(String) entitiesMap.get(ch));
2208      }// End if
2209    }// End for
2210    return strBuff;
2211  }//replaceCharsWithEntities()
2212
2213  /** This method creates Node XML elements and inserts them at the
2214    * corresponding offset inside the text. Nodes are created from the default
2215    * annotation set, as well as from all existing named annotation sets.
2216    * @param aText The text representing the document's plain text.
2217    * @return The text with empty <Node id="NodeId"/> elements.
2218    */
2219  private String textWithNodes(String aText){
2220    if (aText == null) return new String("");
2221    StringBuffer textWithNodes = filterNonXmlChars(new StringBuffer(aText));
2222
2223    // Construct a map from offsets to Chars
2224    TreeMap offsets2CharsMap = new TreeMap();
2225    if (aText.length()!= 0){
2226      // Fill the offsets2CharsMap with all the indices where special chars appear
2227      buildEntityMapFromString(aText,offsets2CharsMap);
2228    }//End if
2229    // Construct the offsetsSet for all nodes belonging to this document
2230    TreeSet offsetsSet = new TreeSet();
2231    Iterator annotSetIter = this.getAnnotations().iterator();
2232    while (annotSetIter.hasNext()){
2233      Annotation annot = (Annotation) annotSetIter.next();
2234      offsetsSet.add(annot.getStartNode().getOffset());
2235      offsetsSet.add(annot.getEndNode().getOffset());
2236    }// end While
2237    // Get the nodes from all other named annotation sets.
2238    if (namedAnnotSets != null){
2239      Iterator iter = namedAnnotSets.values().iterator();
2240      while(iter.hasNext()){
2241        AnnotationSet annotSet = (AnnotationSet) iter.next();
2242        Iterator iter2 = annotSet.iterator();
2243        while(iter2.hasNext()){
2244          Annotation annotTmp = (Annotation) iter2.next();
2245          offsetsSet.add(annotTmp.getStartNode().getOffset());
2246          offsetsSet.add(annotTmp.getEndNode().getOffset());
2247        }// End while
2248      }// End while
2249    }// End if
2250    // offsetsSet is ordered in ascending order because the structure
2251    // is a TreeSet
2252
2253    if (offsetsSet.isEmpty()){
2254      return replaceCharsWithEntities(aText).toString();
2255    }// End if
2256    // Iterate through all nodes from anAnnotSet and transform them to
2257    // XML elements. Then insert those elements at the node's offset into the
2258    // textWithNodes .
2259    while (!offsetsSet.isEmpty()){
2260      Long offset = (Long) offsetsSet.last();
2261      // Eliminate the offset from the list in order to create more memory space
2262      offsetsSet.remove(offset);
2263      // Use offset
2264      int offsetValue = offset.intValue();
2265      String strNode = "<Node id=\"" + offsetValue + "\"/>";
2266      // Before inserting this string into the textWithNodes, check to see if
2267      // there are any chars to be replaced with their corresponding entities
2268      if (!offsets2CharsMap.isEmpty()){
2269        Long offsChar = (Long) offsets2CharsMap.lastKey();
2270        while( !offsets2CharsMap.isEmpty() &&
2271                       offsChar.intValue() >= offset.intValue()){
2272          // Replace the char at offsChar with its corresponding entity form
2273          // the entitiesMap.
2274          textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1,
2275          (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
2276          // Discard the offsChar after it was used because this offset will
2277          // never appear again
2278          offsets2CharsMap.remove(offsChar);
2279          // Investigate next offsChar
2280          if (!offsets2CharsMap.isEmpty())
2281            offsChar = (Long) offsets2CharsMap.lastKey();
2282        }// End while
2283      }// End if
2284      // Now it is safe to insert the node
2285      textWithNodes.insert(offsetValue,strNode);
2286    }// end while
2287    // Need to replace the entities in the remaining text, if there is any text
2288    // So, if there are any more items in offsets2CharsMap they need to be
2289    // replaced
2290    while (!offsets2CharsMap.isEmpty()){
2291      Long offsChar = (Long) offsets2CharsMap.lastKey();
2292      // Replace the char with its entity
2293      textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1,
2294      (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
2295      // remove the offset from the map
2296      offsets2CharsMap.remove(offsChar);
2297    }// End while
2298    return textWithNodes.toString();
2299  }//textWithNodes()
2300
2301  /** This method saves an AnnotationSet as XML.
2302    * @param anAnnotationSet The annotation set that has to be saved as XML.
2303    * @return a String like this: <AnnotationSet> <Annotation>....
2304    * </AnnotationSet>
2305    */
2306  private String annotationSetToXml(AnnotationSet anAnnotationSet){
2307    StringBuffer str = new StringBuffer("");
2308
2309    if (anAnnotationSet == null){
2310      str.append("<AnnotationSet>\n");
2311      str.append("</AnnotationSet>\n");
2312      return str.toString();
2313    }// End if
2314    if (anAnnotationSet.getName() == null)
2315      str.append("<AnnotationSet>\n");
2316    else str.append("<AnnotationSet Name=\"" + anAnnotationSet.getName()+
2317                                                                    "\" >\n");
2318    // Iterate through AnnotationSet and save each Annotation as XML
2319    Iterator iterator = anAnnotationSet.iterator();
2320    while (iterator.hasNext()){
2321      Annotation annot = (Annotation) iterator.next();
2322      str.append("<Annotation " + "Type=\"" + annot.getType() +
2323                  "\" StartNode=\"" + annot.getStartNode().getOffset() +
2324                   "\" EndNode=\"" + annot.getEndNode().getOffset() + "\">\n");
2325      str.append(featuresToXml(annot.getFeatures()));
2326      str.append("</Annotation>\n");
2327    }// End while
2328
2329    str.append("</AnnotationSet>\n");
2330    return str.toString();
2331  }// annotationSetToXml
2332
  /** Returns a map with the named annotation sets. It returns <code>null</code>
   *  if no named annotation set exists. */
2335  public Map getNamedAnnotationSets() {
2336    return namedAnnotSets;
2337  } // getNamedAnnotationSets
2338
2339  /** Returns a set of all named annotation sets in existence
2340  */
2341  public Set getAnnotationSetNames(){
2342    return namedAnnotSets.keySet();
2343  }
2344
2345
2346  /**
2347   * Removes one of the named annotation sets.
2348   * Note that the default annotation set cannot be removed.
2349   * @param name the name of the annotation set to be removed
2350   */
2351  public void removeAnnotationSet(String name){
2352    Object removed = namedAnnotSets.remove(name);
2353    if(removed != null){
2354      fireAnnotationSetRemoved(
2355        new DocumentEvent(this, DocumentEvent.ANNOTATION_SET_REMOVED, name));
2356    }
2357  }
2358
2359  /** Propagate edit changes to the document content and annotations. */
2360  public void edit(Long start, Long end, DocumentContent replacement)
2361    throws InvalidOffsetException
2362  {
2363    if(! isValidOffsetRange(start, end))
2364      throw new InvalidOffsetException();
2365
2366    if(content != null)
2367      ((DocumentContentImpl) content).edit(start, end, replacement);
2368
2369    if(defaultAnnots != null)
2370      ((AnnotationSetImpl) defaultAnnots).edit(start, end, replacement);
2371
2372    if(namedAnnotSets != null) {
2373      Iterator iter = namedAnnotSets.values().iterator();
2374      while(iter.hasNext())
2375        ((AnnotationSetImpl) iter.next()).edit(start, end, replacement);
2376    }
2377    //let the listeners know
2378    fireContentEdited(new DocumentEvent(this, DocumentEvent.CONTENT_EDITED,
2379            start, end));
2380  } // edit(start,end,replacement)
2381
  /** Check that an offset is valid, i.e. it is non-null, greater than
    * or equal to 0 and less than or equal to the size of the document content.
    */
2385  public boolean isValidOffset(Long offset) {
2386    if(offset == null)
2387      return false;
2388
2389    long o = offset.longValue();
2390    if(o > getContent().size().longValue() || o < 0)
2391      return false;
2392
2393    return true;
2394  } // isValidOffset
2395
  /** Check that both start and end are valid offsets and that
    * they constitute a valid offset range, i.e. start is less
    * than or equal to end.
    */
2400  public boolean isValidOffsetRange(Long start, Long end) {
2401    return
2402      isValidOffset(start) && isValidOffset(end) &&
2403      start.longValue() <= end.longValue();
2404  } // isValidOffsetRange(start,end)
2405
2406  /** Sets the nextAnnotationId */
2407  public void setNextAnnotationId(int aNextAnnotationId){
2408    nextAnnotationId = aNextAnnotationId;
2409  }// setNextAnnotationId();
2410
2411  /** Generate and return the next annotation ID */
2412  public Integer getNextAnnotationId() {
2413    return new Integer(nextAnnotationId++);
2414  } // getNextAnnotationId
2415
2416  /** Generate and return the next node ID */
2417  public Integer getNextNodeId() { return new Integer(nextNodeId++); }
2418
2419  /** Ordering based on URL.toString() and the URL offsets (if any) */
2420  public int compareTo(Object o) throws ClassCastException {
2421    DocumentImpl other = (DocumentImpl) o;
2422    return getOrderingString().compareTo(other.getOrderingString());
2423  } // compareTo
2424
2425  /** Utility method to produce a string for comparison in ordering.
2426    * String is based on the source URL and offsets.
2427    */
2428  protected String getOrderingString() {
2429    if(sourceUrl == null) return toString();
2430
2431    StringBuffer orderingString = new StringBuffer(sourceUrl.toString());
2432    if(sourceUrlStartOffset != null && sourceUrlEndOffset != null) {
2433      orderingString.append(sourceUrlStartOffset.toString());
2434      orderingString.append(sourceUrlEndOffset.toString());
2435    }
2436
2437    return orderingString.toString();
2438  } // getOrderingString()
2439
2440  /** The id of the next new annotation */
  // Starts at 0; returned and then incremented by getNextAnnotationId(), and
  // overwritten by setNextAnnotationId().
  protected int nextAnnotationId = 0;

  /** The id of the next new node. Starts at 0; returned and then incremented
    * by getNextNodeId().
    */
  protected int nextNodeId = 0;
  /** The source URL. When it is null, getOrderingString() falls back to
    * toString().
    */
  protected URL sourceUrl;

  // NOTE(review): a comment reading "The document's URL name." stood here
  // without any field following it - confirm whether a field was removed.

  /** The content of the document. edit() only touches it when it is
    * non-null.
    */
  protected DocumentContent content;

  /** The encoding of the source of the document content */
  protected String encoding = null;
2455
  // Data needed in toXml(AnnotationSet) methods
2457
2458  /** This field indicates whether or not to add the tag
2459    * called GatePreserveFormat to the document. HTML, XML, SGML docs won't
2460    * have this tag added
2461    */
2462//  private boolean addGatePreserveFormatTag = false;
2463
2464  /**
2465   * Used by the XML dump preserving format method
2466   */
  private Annotation theRootAnnotation = null;

  /** This field is used when creating StringBuffers for toXml() methods.
    * The size of the StringBuffer will be the document content size
    * multiplied by this value. It is aimed at improving the performance of
    * the StringBuffer.
    */
  private final int DOC_SIZE_MULTIPLICATION_FACTOR = 2;

  /** Constant used in the inner class AnnotationComparator to order
    * annotations on their start offset
    */
  private final int ORDER_ON_START_OFFSET = 0;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations on their end offset
    */
  private final int ORDER_ON_END_OFFSET = 1;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations on their ID
    */
  private final int ORDER_ON_ANNOT_ID = 2;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations ascending. It is also the comparator's default order type.
    */
  private final int ASC = 3;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations descending. The comparator treats every order type other
    * than ASC as descending.
    */
  private final int DESC = -3;
2495
  /** A map, filled in by the static initializer that follows, containing the
    * chars that need to be replaced by entities in strings
    */
2499  private static Map entitiesMap = null;
2500  // Initialize the entities map use when saving as xml
2501  static{
2502    entitiesMap = new HashMap();
2503    entitiesMap.put(new Character('<'),"&lt;");
2504    entitiesMap.put(new Character('>'),"&gt;");
2505    entitiesMap.put(new Character('&'),"&amp;");
2506    entitiesMap.put(new Character('\''),"&apos;");
2507    entitiesMap.put(new Character('"'),"&quot;");
2508    entitiesMap.put(new Character((char)160),"&#160;");
2509    entitiesMap.put(new Character((char)169),"&#169;");
2510  }//static
2511
2512  /** The range that the content comes from at the source URL
2513    * (or null if none).
2514    */
2515  //protected Long[] sourceUrlOffsets;
2516
2517  /** The start of the range that the content comes from at the source URL
2518    * (or null if none).
2519    */
  // Appended to the ordering string by getOrderingString(), but only when the
  // end offset is non-null as well.
  protected Long sourceUrlStartOffset;

  /** The end of the range that the content comes from at the source URL
    * (or null if none).
    */
  protected Long sourceUrlEndOffset;

  /** The default annotation set. It may be null: edit() checks for that
    * before using it.
    */
  protected AnnotationSet defaultAnnots;

  /** Named sets of annotations: a map from the set name to the AnnotationSet.
    * It may be null: edit() and textWithNodes() check for that before using
    * it.
    */
  protected Map namedAnnotSets;

  /**
   * A property of the document that will be set when the user
   * wants to create the document from a string, as opposed to from
   * a URL. It is read and written through getStringContent() and
   * setStringContent().
   */
  private String stringContent;
2539
2540  /**
2541   * The stringContent of a document is
2542   * a property of the document that will be set when the user
2543   * wants to create the document from a string, as opposed to from
2544   * a URL.
2545   * <B>Use the <TT>getContent</TT> method instead to get the actual document
2546   * content.</B>
2547   */
2548  public String getStringContent() { return stringContent; }
2549
2550  /**
2551   * The stringContent of a document is
2552   * a property of the document that will be set when the user
2553   * wants to create the document from a string, as opposed to from
2554   * a URL.
2555   * <B>Use the <TT>setContent</TT> method instead to update the actual
2556   * document content.</B>
2557   */
2558  public void setStringContent(String stringContent) {
2559    this.stringContent = stringContent;
2560  } // set StringContent
2561
2562  /** Is the document markup-aware? */
  // Held as a Boolean object rather than a primitive; the initial value is
  // false.
  protected Boolean markupAware = new Boolean(false);
2564//  /** Hash code */
2565//  public int hashCode() {
2566//    int code = getContent().hashCode();
2567//    int memberCode = (defaultAnnots == null) ? 0 : defaultAnnots.hashCode();
2568//    code += memberCode;
2569//    memberCode = (encoding == null) ? 0 : encoding.hashCode();
2570//    code += memberCode;
2571//    memberCode = (features == null) ? 0 : features.hashCode();
2572//    code += memberCode;
2573//    code += (markupAware.booleanValue()) ? 0 : 1;
2574//    memberCode = (namedAnnotSets == null) ? 0 : namedAnnotSets.hashCode();
2575//    code += memberCode;
2576//    code += nextAnnotationId;
2577//    code += nextNodeId;
2578//    memberCode = (sourceUrl == null) ? 0 : sourceUrl.hashCode();
2579//    code += memberCode;
2580//    memberCode =
2581//      (sourceUrlStartOffset == null) ? 0 : sourceUrlStartOffset.hashCode();
2582//    code += memberCode;
2583//    memberCode =
2584//      (sourceUrlEndOffset == null) ? 0 : sourceUrlEndOffset.hashCode();
2585//    code += memberCode;
2586//    return code;
2587//  } // hashcode
2588
  /** String representation */
2590  public String toString() {
2591    String n = Strings.getNl();
2592    StringBuffer s = new StringBuffer("DocumentImpl: " + n);
2593    s.append("  content:" + content + n);
2594    s.append("  defaultAnnots:" + defaultAnnots + n);
2595    s.append("  encoding:" + encoding + n);
2596    s.append("  features:" + features + n);
2597    s.append("  markupAware:" + markupAware + n);
2598    s.append("  namedAnnotSets:" + namedAnnotSets + n);
2599    s.append("  nextAnnotationId:" + nextAnnotationId + n);
2600    s.append("  nextNodeId:" + nextNodeId + n);
2601    s.append("  sourceUrl:" + sourceUrl + n);
2602    s.append("  sourceUrlStartOffset:" + sourceUrlStartOffset + n);
2603    s.append("  sourceUrlEndOffset:" + sourceUrlEndOffset + n);
2604    s.append(n);
2605
2606    return s.toString();
2607  } // toString
2608
2609   /** Freeze the serialization UID. */
  // Declared explicitly so that the value does not depend on the one the
  // serialization runtime would otherwise compute from the class.
  static final long serialVersionUID = -8456893608311510260L;
2611
2612  /** Inner class needed to compare annotations*/
2613  class AnnotationComparator implements java.util.Comparator {
2614    int orderOn = -1;
2615    int orderType = ASC;
2616    /** Constructs a comparator according to one of three sorter types:
2617      * ORDER_ON_ANNOT_TYPE, ORDER_ON_END_OFFSET, ORDER_ON_START_OFFSET
2618      */
2619      public AnnotationComparator(int anOrderOn, int anOrderType){
2620        orderOn = anOrderOn;
2621        orderType = anOrderType;
2622      }// AnnotationComparator()
2623
2624      /**This method must be implemented according to Comparator interface */
2625      public int compare(Object o1, Object o2){
2626        Annotation a1 = (Annotation) o1;
2627        Annotation a2 = (Annotation) o2;
2628        // ORDER_ON_START_OFFSET ?
2629        if (orderOn == ORDER_ON_START_OFFSET){
2630          int result = a1.getStartNode().getOffset().compareTo(
2631                                                a2.getStartNode().getOffset());
2632          if (orderType == ASC){
2633            // ASC
2634            // If they are equal then their ID will decide.
2635            if (result == 0)
2636              return a1.getId().compareTo(a2.getId());
2637            return result;
2638          }else{
2639            // DESC
2640            if (result == 0)
2641              return - (a1.getId().compareTo(a2.getId()));
2642            return -result;
2643          }// End if (orderType == ASC)
2644        }// End if (orderOn == ORDER_ON_START_OFFSET)
2645
2646        // ORDER_ON_END_OFFSET ?
2647        if (orderOn == ORDER_ON_END_OFFSET){
2648          int result = a1.getEndNode().getOffset().compareTo(
2649                                                a2.getEndNode().getOffset());
2650          if (orderType == ASC){
2651            // ASC
2652            // If they are equal then their ID will decide.
2653            if (result == 0)
2654              return - (a1.getId().compareTo(a2.getId()));
2655            return result;
2656          }else{
2657            // DESC
2658            // If they are equal then their ID will decide.
2659            if (result == 0)
2660              return a1.getId().compareTo(a2.getId());
2661            return - result;
2662          }// End if (orderType == ASC)
2663        }// End if (orderOn == ORDER_ON_END_OFFSET)
2664
2665        // ORDER_ON_ANNOT_ID ?
2666        if (orderOn == ORDER_ON_ANNOT_ID){
2667          if (orderType == ASC)
2668            return a1.getId().compareTo(a2.getId());
2669          else
2670            return -(a1.getId().compareTo(a2.getId()));
2671        }// End if
2672        return 0;
2673      }//compare()
2674  } // End inner class AnnotationComparator
2675
2676
  // The registered DocumentListeners. There is no initial value, so it is
  // null until addDocumentListener() stores a Vector. add/remove always
  // assign a modified clone instead of changing the current Vector.
  private transient Vector documentListeners;
  // NOTE(review): not referenced in the visible part of this class - confirm
  // whether it is still used.
  private transient Vector gateListeners;
2679
2680  public synchronized void removeDocumentListener(DocumentListener l) {
2681    if (documentListeners != null && documentListeners.contains(l)) {
2682      Vector v = (Vector) documentListeners.clone();
2683      v.removeElement(l);
2684      documentListeners = v;
2685    }
2686  }
2687  public synchronized void addDocumentListener(DocumentListener l) {
2688    Vector v = documentListeners == null ? new Vector(2) : (Vector) documentListeners.clone();
2689    if (!v.contains(l)) {
2690      v.addElement(l);
2691      documentListeners = v;
2692    }
2693  }
2694
2695  protected void fireAnnotationSetAdded(DocumentEvent e) {
2696    if (documentListeners != null) {
2697      Vector listeners = documentListeners;
2698      int count = listeners.size();
2699      for (int i = 0; i < count; i++) {
2700        ((DocumentListener) listeners.elementAt(i)).annotationSetAdded(e);
2701      }
2702    }
2703  }
2704
2705  protected void fireAnnotationSetRemoved(DocumentEvent e) {
2706    if (documentListeners != null) {
2707      Vector listeners = documentListeners;
2708      int count = listeners.size();
2709      for (int i = 0; i < count; i++) {
2710        ((DocumentListener) listeners.elementAt(i)).annotationSetRemoved(e);
2711      }
2712    }
2713  }
2714
2715  protected void fireContentEdited(DocumentEvent e) {
2716    if (documentListeners != null) {
2717      Vector listeners = documentListeners;
2718      int count = listeners.size();
2719      for (int i = 0; i < count; i++) {
2720        ((DocumentListener) listeners.elementAt(i)).contentEdited(e);
2721      }
2722    }
2723  }
2724
  /** Intentionally empty: the document does nothing when a resource is
    * loaded.
    */
  public void resourceLoaded(CreoleEvent e) {
  }
  /** Intentionally empty: the document does nothing when a resource is
    * unloaded.
    */
  public void resourceUnloaded(CreoleEvent e) {
  }
  /** Intentionally empty: the document does nothing when a datastore is
    * opened.
    */
  public void datastoreOpened(CreoleEvent e) {
  }
  /** Intentionally empty: the document does nothing when a datastore is
    * created.
    */
  public void datastoreCreated(CreoleEvent e) {
  }
  /** Intentionally empty: the document does nothing when a resource is
    * renamed.
    */
  public void resourceRenamed(Resource resource, String oldName,
                              String newName){
  }
2736  public void datastoreClosed(CreoleEvent e) {
2737    if (! e.getDatastore().equals(this.getDataStore()))
2738      return;
2739    //close this lr, since it cannot stay open when the DS it comes from
2740    //is closed
2741    Factory.deleteResource(this);
2742  }
2743  public void setLRPersistenceId(Object lrID) {
2744    super.setLRPersistenceId( lrID);
2745    //make persistent documents listen to the creole register
2746    //for events about their DS
2747    Gate.getCreoleRegister().addCreoleListener(this);
2748  }
  /** Intentionally empty: the document does nothing when a resource is
    * adopted.
    */
  public void resourceAdopted(DatastoreEvent evt) {
  }
2751  public void resourceDeleted(DatastoreEvent evt) {
2752    if(! evt.getSource().equals(this.getDataStore()))
2753      return;
2754    //if an open document is deleted from a DS, then
2755    //it must close itself immediately, as is no longer valid
2756    if(evt.getResourceID().equals(this.getLRPersistenceId()))
2757      Factory.deleteResource(this);
2758  }
  /** Intentionally empty: the document does nothing when a resource is
    * written.
    */
  public void resourceWritten(DatastoreEvent evt) {
  }
2761  public void setDataStore(DataStore dataStore) throws gate.persist.PersistenceException {
2762    super.setDataStore( dataStore);
2763    if (this.dataStore != null)
2764      this.dataStore.addDatastoreListener(this);
2765  }
2766
2767  /**
2768   * This method added by Shafirin Andrey, to allow access to
2769   * protected member {@link #defaultAnnots}
2770   * Required for JAPE-Debugger.
2771   * */
2772  public void setDefaultAnnotations(AnnotationSet defaultAnnotations) {
2773      defaultAnnots = defaultAnnotations;
2774  }
2775
2776} // class DocumentImpl
2777