1   /*
2    *  DocumentImpl.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Hamish Cunningham, 11/Feb/2000
12   *
13   *  $Id: DocumentImpl.java,v 1.115 2002/07/12 13:24:28 valyt Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.util.*;
19  import java.net.*;
20  import java.io.*;
21  
22  import gate.*;
23  import gate.annotation.*;
24  import gate.util.*;
25  import gate.creole.*;
26  import gate.gui.*;
27  import gate.event.*;
28  
29  /** Represents the commonalities between all sorts of documents.
30    *
31    * <H2>Editing</H2>
32    *
33    * <P>
34    * The DocumentImpl class implements the Document interface.
35    * The DocumentContentImpl class models the textual or audio-visual
36    * materials which are the source and content of Documents.
37    * The AnnotationSetImpl class supplies annotations on Documents.
38    *
39    * <P>
40    * Abbreviations:
41    *
42    * <UL>
43    * <LI>
44    * DC = DocumentContent
45    * <LI>
46    * D = Document
47    * <LI>
48    * AS = AnnotationSet
49    * </UL>
50    *
51    * <P>
52    * We add an edit method to each of these classes; for DC and AS
53    * the methods are package private; D has the public method.
54    *
55    * <PRE>
56    *   void edit(Long start, Long end, DocumentContent replacement)
57    *   throws InvalidOffsetException;
58    * </PRE>
59    *
60    * <P>
61    * D receives edit requests and forwards them to DC and AS.
62    * On DC, this method makes a change to the content - e.g. replacing
63    * a String range from start to end with replacement. (Deletions
64    * are catered for by having replacement = null.) D then calls
65    * AS.edit on each of its annotation sets.
66    *
67    * <P>
68    * On AS, edit calls replacement.size() (i.e. DC.size()) to
69    * figure out how long the replacement is (0 for null). It then
70    * considers annotations that terminate (start or end) in
71    * the altered or deleted range as invalid; annotations that
72    * terminate after the range have their offsets adjusted.
73    * I.e.:
74    * <UL>
75    * <LI>
76    * the nodes that pointed inside the old modified area are invalid now and
77    * will be deleted along with the connected annotations;
78    * <LI>
79    * the nodes that are before the start of the modified area remain
80    * untouched;
81    * <LI>
82    * the nodes that are after the end of the affected area will have the
83    * offset changed according to the formula below.
84    * </UL>
85    *
86    * <P>
87    * A note re. AS and annotations: annotations no longer have
88    * offsets as in the old model, they now have nodes, and nodes
89    * have offsets.
90    *
91    * <P>
92    * To implement AS.edit, we have several indices:
93    * <PRE>
94    *   HashMap annotsByStartNode, annotsByEndNode;
95    * </PRE>
96    * which map node ids to annotations;
97    * <PRE>
98    *   RBTreeMap nodesByOffset;
99    * </PRE>
100   * which maps offset to Nodes.
101   *
102   * <P>
103   * When we get an edit request, we traverse that part of the
104   * nodesByOffset tree representing the altered or deleted
105   * range of the DC. For each node found, we delete any annotations
106   * that terminate on the node, and then delete the node itself.
107   * We then traverse the rest of the tree, changing the offset
108   * on all remaining nodes by:
109   * <PRE>
110   *   newOffset =
111   *     oldOffset -
112   *     (
113   *       (end - start) -                                     // size of mod
114   *       ( (replacement == null) ? 0 : replacement.size() )  // size of repl
115   *     );
116   * </PRE>
117   * Note that we use the same convention as e.g. java.lang.String: start
118   * offsets are inclusive; end offsets are exclusive. I.e. for string "abcd"
119   * range 1-3 = "bc". Examples, for a node with offset 4:
120   * <PRE>
121   * edit(1, 3, "BC");
122   * newOffset = 4 - ( (3 - 1) - 2 ) = 4
123   *
124   * edit(1, 3, null);
125   * newOffset = 4 - ( (3 - 1) - 0 ) = 2
126   *
127   * edit(1, 3, "BBCC");
128   * newOffset = 4 - ( (3 - 1) - 4 ) = 6
129   * </PRE>
130   */
131 public class DocumentImpl
132 extends AbstractLanguageResource implements TextualDocument, CreoleListener,
133                                             DatastoreListener {
134   /** Debug flag */
135   private static final boolean DEBUG = false;
136 
137   /** If you set this flag to true the original content of the document will
138    *  be kept in the document feature. <br>
139    *  Default value is false to avoid the unnecessary waste of memory */
140   private Boolean preserveOriginalContent = new Boolean(false);
141 
142   /** If you set this flag to true the repositioning information for
143    *  the document will be kept in the document feature. <br>
144    *  Default value is false to avoid the unnecessary waste of time and memory
145    */
146   private Boolean collectRepositioningInfo = new Boolean(false);
147 
148   /**
149    * This is a variable which contains the latest crossed over annotation
150    * found during export with preserving format, i.e., toXml(annotations)
151    * method.
152    */
153   private Annotation crossedOverAnnotation = null;
154 
155   /** Default construction. Content left empty. */
156   public DocumentImpl() {
157     content = new DocumentContentImpl();
158   } // default construction
159 
160   /** Initialise this resource, and return it. */
161   public Resource init() throws ResourceInstantiationException {
162     //make sure we have an encoding
163     if(encoding == null || encoding.length() == 0)
164       encoding = System.getProperty("file.encoding");
165     if(encoding == null || encoding.length() == 0) encoding = "UTF-8";
166 
167     // set up the source URL and create the content
168     if(sourceUrl == null) {
169       if(stringContent == null) {
170         throw new ResourceInstantiationException(
171           "The sourceURL and document's content were null."
172         );
173       }
174 
175       content = new DocumentContentImpl(stringContent);
176       getFeatures().put("gate.SourceURL", "created from String");
177     } else {
178       try {
179         content = new DocumentContentImpl(
180           sourceUrl, encoding, sourceUrlStartOffset, sourceUrlEndOffset);
181         getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm());
182       } catch(IOException e) {
183         e.printStackTrace();
184 //        throw new ResourceInstantiationException("DocumentImpl.init: " + e);
185       }
186 
187       if(preserveOriginalContent.booleanValue() && content != null) {
188         String originalContent = new String(
189           ((DocumentContentImpl) content).getOriginalContent());
190         getFeatures().put(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME,
191                       originalContent);
192       } // if
193     }
194 
195     // set up a DocumentFormat if markup unpacking required
196     if(getMarkupAware().booleanValue()) {
197       DocumentFormat docFormat =
198         DocumentFormat.getDocumentFormat(this, sourceUrl);
199       try {
200         if(docFormat != null){
201           StatusListener sListener = (StatusListener)
202                                       gate.gui.MainFrame.getListeners().
203                                       get("gate.event.StatusListener");
204           if(sListener != null) docFormat.addStatusListener(sListener);
205 
206           // set the flag if true and if the document format support collecting
207           docFormat.setShouldCollectRepositioning(collectRepositioningInfo);
208 
209           if(docFormat.getShouldCollectRepositioning().booleanValue()) {
210             // unpack with collectiong of repositioning information
211             RepositioningInfo info = new RepositioningInfo();
212 
213             String origContent = (String) getFeatures().get(
214                 GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
215 
216             RepositioningInfo ampCodingInfo = new RepositioningInfo();
217             if(origContent != null) {
218               boolean shouldCorrectCR = docFormat instanceof XmlDocumentFormat;
219               collectInformationForAmpCodding(origContent, ampCodingInfo,
220                                               shouldCorrectCR);
221               if(docFormat instanceof HtmlDocumentFormat) {
222                 collectInformationForWS(origContent, ampCodingInfo);
223               } // if
224             } // if
225 
226             docFormat.unpackMarkup(this, info, ampCodingInfo);
227 
228             if(origContent != null
229                 && docFormat instanceof XmlDocumentFormat) {
230               // CRLF correction of RepositioningInfo
231               correctRepositioningForCRLFInXML(origContent, info);
232             } // if
233 
234             getFeatures().put(
235                 GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME, info);
236           }
237           else {
238             // normal old fashioned unpack
239             docFormat.unpackMarkup(this);
240           }
241           docFormat.removeStatusListener(sListener);
242        } //if format != null
243       } catch(DocumentFormatException e) {
244         throw new ResourceInstantiationException(
245           "Couldn't unpack markup in document " + sourceUrl.toExternalForm() +
246           " " + e
247         );
248       }
249     } // if markup aware
250 
251 //try{
252 //  FileWriter fw = new FileWriter("d:/temp/doccontent.txt");
253 //  fw.write(getContent().toString());
254 //  fw.flush();
255 //  fw.close();
256 //}catch(IOException ioe){
257 //  ioe.printStackTrace();
258 //}
259 
260     return this;
261   } // init()
262 
263   /**
264    * Correct repositioning information for substitution of "\r\n" with "\n"
265    */
266   private void correctRepositioningForCRLFInXML(String content,
267                                             RepositioningInfo info) {
268     int index = -1;
269 
270     do {
271       index = content.indexOf("\r\n", index+1);
272       if(index != -1) {
273         info.correctInformationOriginalMove(index, 1);
274       } // if
275     } while(index != -1);
276   } // correctRepositioningForCRLF
277 
278   /**
279    * Collect information for substitution of "&xxx;" with "y"
280    *
281    * It couldn't be collected a position information about
282    * some unicode and &-coded symbols during parsing. The parser "hide" the
283    * information about the position of such kind of parsed text.
284    * So, there is minimal chance to have &-coded symbol inside the covered by
285    * repositioning records area. The new record should be created for every
286    * coded symbol outside the existing records.
287    * <BR>
288    * If <code>shouldCorrectCR</code> flag is <code>true</code> the correction
289    * for CRLF substitution is performed.
290    */
291   private void collectInformationForAmpCodding(String content,
292                                             RepositioningInfo info,
293                                             boolean shouldCorrectCR) {
294 
295     if(content == null || info == null) return;
296 
297     int ampIndex = -1;
298     int semiIndex;
299 
300     do {
301       ampIndex = content.indexOf('&', ampIndex+1);
302       if(ampIndex != -1) {
303         semiIndex = content.indexOf(';', ampIndex+1);
304         // have semicolon and it is near enough for amp codding
305         if(semiIndex != -1 && (semiIndex-ampIndex) < 8) {
306           info.addPositionInfo(ampIndex, semiIndex-ampIndex+1, 0, 1);
307         }
308         else {
309           // no semicolon or it is too far
310           // analyse for amp codding without semicolon
311           int maxEnd = Math.min(ampIndex+8, content.length());
312           String ampCandidate = content.substring(ampIndex, maxEnd);
313           int ampCodingSize = analyseAmpCodding(ampCandidate);
314 
315           if(ampCodingSize != -1) {
316             info.addPositionInfo(ampIndex, ampCodingSize, 0, 1);
317           } // if
318 
319         } // if - semicolon found
320       } // if - ampersand found
321     } while (ampIndex != -1);
322 
323     // correct the collected information to adjust it's positions
324     // with reported by the parser
325     int index = -1;
326 
327     if(shouldCorrectCR) {
328       do {
329         index = content.indexOf("\r\n", index+1);
330         if(index != -1) {
331           info.correctInformationOriginalMove(index, -1);
332         } // if
333       } while(index != -1);
334     } // if
335   } // collectInformationForAmpCodding
336 
337   /**
338    * This function compute size of the ampersand codded sequence when
339    * semicolin is not present.
340    */
341   private int analyseAmpCodding(String content) {
342     int result = -1;
343 
344     try {
345       char ch = content.charAt(1);
346 
347       switch(ch) {
348         case 'l' : // &lt
349         case 'L' : // &lt
350           if(content.charAt(2) == 't' || content.charAt(2) == 'T') {
351             result = 3;
352           } // if
353           break;
354         case 'g' : // &gt
355         case 'G' : // &gt
356           if(content.charAt(2) == 't' || content.charAt(2) == 'T') {
357             result = 3;
358           } // if
359           break;
360         case 'a' : // &amp
361         case 'A' : // &amp
362           if(content.substring(2, 4).equalsIgnoreCase("mp")) {
363             result = 4;
364           } // if
365           break;
366         case 'q' : // &quot
367         case 'Q' : // &quot
368           if(content.substring(2, 5).equalsIgnoreCase("uot")) {
369             result = 5;
370           } // if
371           break;
372         case '#' : // #number (example &#145, &#x4C38)
373           int endIndex = 2;
374           boolean hexCoded = false;
375           if(content.charAt(2) == 'x' || content.charAt(2) == 'X') {
376             // Hex codding
377             ++endIndex;
378             hexCoded = true;
379           } // if
380 
381           while (endIndex < 8
382                   && isNumber(content.charAt(endIndex), hexCoded) ) {
383             ++endIndex;
384           } // while
385           result = endIndex;
386           break;
387       } // switch
388     } catch (StringIndexOutOfBoundsException ex) {
389       // do nothing
390     } // catch
391 
392     return result;
393   } // analyseAmpCodding
394 
395   /** Check for numeric range. If hex is true the A..F range is included */
396   private boolean isNumber(char ch, boolean hex) {
397     if(ch >= '0' && ch <= '9') return true;
398 
399     if(hex) {
400       if(ch >= 'A' && ch <= 'F') return true;
401       if(ch >= 'a' && ch <= 'f') return true;
402     } // if
403 
404     return false;
405   } // isNumber
406 
407   /** HTML parser perform substitution of multiple whitespaces (WS) with
408    *  a single WS. To create correct repositioning information structure we
409    *  should keep the information for such multiple WS.
410    *  <BR>
411    *  The criteria for WS is <code>(ch <= ' ')</code>.
412    */
413   private void collectInformationForWS(String content, RepositioningInfo info) {
414 
415     if(content == null || info == null) return;
416 
417     // analyse the content and correct the repositioning information
418     char ch;
419     int startWS, endWS;
420 
421     startWS = endWS = -1;
422     int contentLength = content.length();
423 
424     for(int i=0; i<contentLength; ++i) {
425       ch = content.charAt(i);
426 
427       // is whitespace
428       if(ch <= ' ') {
429         if(startWS == -1) {
430           startWS = i;
431         } // if
432         endWS = i;
433       }
434       else {
435         if(endWS - startWS > 0) {
436           // put the repositioning information about the WS substitution
437           info.addPositionInfo(
438             (long)startWS, (long)(endWS - startWS + 1), 0, 1);
439         } // if
440         // clear positions
441         startWS = endWS = -1;
442       }// if
443     } // for
444   } // collectInformationForWS
445 
446   /** Clear all the data members of the object. */
447   public void cleanup() {
448 
449     defaultAnnots = null;
450     if ( (namedAnnotSets != null) && (!namedAnnotSets.isEmpty()))
451         namedAnnotSets.clear();
452     if (DEBUG) Out.prln("Document cleanup called");
453     if (this.lrPersistentId != null)
454       Gate.getCreoleRegister().removeCreoleListener(this);
455     if(this.getDataStore() != null)
456       this.getDataStore().removeDatastoreListener(this);
457   } // cleanup()
458 
459 
460   /** Documents are identified by URLs */
461   public URL getSourceUrl() { return sourceUrl; }
462 
463   /** Set method for the document's URL */
464   public void setSourceUrl(URL sourceUrl) {
465     this.sourceUrl = sourceUrl;
466   } // setSourceUrl
467 
468   /** Documents may be packed within files; in this case an optional pair of
469     * offsets refer to the location of the document.
470     */
471   public Long[] getSourceUrlOffsets() {
472     Long[] sourceUrlOffsets = new Long[2];
473     sourceUrlOffsets[0] = sourceUrlStartOffset;
474     sourceUrlOffsets[1] = sourceUrlEndOffset;
475     return sourceUrlOffsets;
476   } // getSourceUrlOffsets
477 
478   /**
479    * Allow/disallow preserving of the original document content.
480    * If is <B>true</B> the original content will be retrieved from
481    * the DocumentContent object and preserved as document feature.
482    */
483   public void setPreserveOriginalContent(Boolean b) {
484     preserveOriginalContent = b;
485   } // setPreserveOriginalContent
486 
487   /** Get the preserving of content status of the Document.
488    *
489    *  @return whether the Document should preserve it's original content.
490    */
491   public Boolean getPreserveOriginalContent() {
492     return preserveOriginalContent;
493   } // getPreserveOriginalContent
494 
495   /**
496    *  Allow/disallow collecting of repositioning information.
497    *  If is <B>true</B> information will be retrieved and preserved
498    *  as document feature.<BR>
499    *  Preserving of repositioning information give the possibilities
500    *  for converting of coordinates between the original document content and
501    *  extracted from the document text.
502    */
503   public void setCollectRepositioningInfo(Boolean b) {
504     collectRepositioningInfo = b;
505   } // setCollectRepositioningInfo
506 
507   /** Get the collectiong and preserving of repositioning information
508    *  for the Document. <BR>
509    *  Preserving of repositioning information give the possibilities
510    *  for converting of coordinates between the original document content and
511    *  extracted from the document text.
512    *
513    *  @return whether the Document should collect and preserve information.
514    */
515   public Boolean getCollectRepositioningInfo() {
516     return collectRepositioningInfo;
517   } // getCollectRepositioningInfo
518 
519   /** Documents may be packed within files; in this case an optional pair of
520     * offsets refer to the location of the document. This method gets the
521     * start offset.
522     */
523   public Long getSourceUrlStartOffset() { return sourceUrlStartOffset; }
524 
525   /** Documents may be packed within files; in this case an optional pair of
526     * offsets refer to the location of the document. This method sets the
527     * start offset.
528     */
529   public void setSourceUrlStartOffset(Long sourceUrlStartOffset) {
530     this.sourceUrlStartOffset = sourceUrlStartOffset;
531   } // setSourceUrlStartOffset
532 
533   /** Documents may be packed within files; in this case an optional pair of
534     * offsets refer to the location of the document. This method gets the
535     * end offset.
536     */
537   public Long getSourceUrlEndOffset() { return sourceUrlEndOffset; }
538 
539   /** Documents may be packed within files; in this case an optional pair of
540     * offsets refer to the location of the document. This method sets the
541     * end offset.
542     */
543   public void setSourceUrlEndOffset(Long sourceUrlEndOffset) {
544     this.sourceUrlEndOffset = sourceUrlEndOffset;
545   } // setSourceUrlStartOffset
546 
547   /** The content of the document: a String for text; MPEG for video; etc. */
548   public DocumentContent getContent() { return content; }
549 
550   /** Set method for the document content */
551   public void setContent(DocumentContent content) { this.content = content; }
552 
553   /** Get the encoding of the document content source */
554   public String getEncoding() { return encoding; }
555 
556   /** Set the encoding of the document content source */
557   public void setEncoding(String encoding) { this.encoding = encoding; }
558 
559   /** Get the default set of annotations. The set is created if it
560     * doesn't exist yet.
561     */
562   public AnnotationSet getAnnotations() {
563     if(defaultAnnots == null){
564       defaultAnnots = new AnnotationSetImpl(this);
565       fireAnnotationSetAdded(new DocumentEvent(
566            this, DocumentEvent.ANNOTATION_SET_ADDED, null));
567     }//if
568     return defaultAnnots;
569   } // getAnnotations()
570 
571   /** Get a named set of annotations. Creates a new set if one with this
572     * name doesn't exist yet.
573     * If the provided name is null then it returns the default annotation set.
574     */
575   public AnnotationSet getAnnotations(String name) {
576     if(name == null) return getAnnotations();
577     if(namedAnnotSets == null)
578       namedAnnotSets = new HashMap();
579     AnnotationSet namedSet = (AnnotationSet) namedAnnotSets.get(name);
580 
581     if(namedSet == null) {
582       namedSet = new AnnotationSetImpl(this, name);
583       namedAnnotSets.put(name, namedSet);
584 
585       DocumentEvent evt = new DocumentEvent(
586         this, DocumentEvent.ANNOTATION_SET_ADDED, name
587       );
588       fireAnnotationSetAdded(evt);
589     }
590     return namedSet;
591   } // getAnnotations(name)
592 
593   /** Make the document markup-aware. This will trigger the creation
594    *  of a DocumentFormat object at Document initialisation time; the
595    *  DocumentFormat object will unpack the markup in the Document and
596    *  add it as annotations. Documents are <B>not</B> markup-aware by default.
597    *
598    *  @param b markup awareness status.
599    */
600   public void setMarkupAware(Boolean newMarkupAware) {
601       this.markupAware = newMarkupAware;
602   }
603 
604   /** Get the markup awareness status of the Document.
605    *  <B>Documents are markup-aware by default.</B>
606    *  @return whether the Document is markup aware.
607    */
608   public Boolean getMarkupAware() { return markupAware; }
609 
610   /** Returns an XML document aming to preserve the original markups(
611     * the original markup will be in the same place and format as it was
612     * before processing the document) and include (if possible)
613     * the annotations specified in the aSourceAnnotationSet.
614     * It is equivalent to toXml(aSourceAnnotationSet, true).
615     */
616   public String toXml(Set aSourceAnnotationSet){
617     return toXml(aSourceAnnotationSet, true);
618   }
619 
620   /** Returns an XML document aming to preserve the original markups(
621     * the original markup will be in the same place and format as it was
622     * before processing the document) and include (if possible)
623     * the annotations specified in the aSourceAnnotationSet.
624     * <b>Warning:</b> Annotations from the aSourceAnnotationSet will be lost
625     * if they will cause a crosed over situation.
626     * @param aSourceAnnotationSet is an annotation set containing all the
627     * annotations that will be combined with the original marup set. If the
628     * param is <code>null</code> it will only dump the original markups.
629     * @param includeFeatures is a boolean that controls whether the annotation
630     * features should be included or not. If false, only the annotation type
631     * is included in the tag.
632     * @return a string representing an XML document containing the original
633     * markup + dumped annotations form the aSourceAnnotationSet
634     */
635   public String toXml(Set aSourceAnnotationSet, boolean includeFeatures){
636 
637     if(hasOriginalContentFeatures()) {
638       return saveAnnotationSetAsXmlInOrig(aSourceAnnotationSet,includeFeatures);
639     } // if
640 
641     AnnotationSet originalMarkupsAnnotSet =
642             this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
643 
644     // Create a dumping annotation set on the document. It will be used for
645     // dumping annotations...
646     AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this);
647 
648     // This set will be constructed inside this method. If is not empty, the
649     // annotation contained will be lost.
650     if (!dumpingSet.isEmpty()){
651       Out.prln("WARNING: The dumping annotation set was not empty."+
652       "All annotation it contained were lost.");
653       dumpingSet.clear();
654     }// End if
655 
656     StatusListener sListener = (StatusListener)
657                                gate.gui.MainFrame.getListeners().
658                                get("gate.event.StatusListener");
659     // Construct the dumping set in that way that all annotations will verify
660     // the condition that there are not annotations which are crossed.
661     // First add all annotation from the original markups
662     if(sListener != null)
663       sListener.statusChanged("Constructing the dumping annotation set.");
664     dumpingSet.addAll(originalMarkupsAnnotSet);
665     // Then take all the annotations from aSourceAnnotationSet and verify if
666     // they can be inserted safely into the dumpingSet. Where not possible,
667     // report.
668     if (aSourceAnnotationSet != null){
669       Iterator iter = aSourceAnnotationSet.iterator();
670       while (iter.hasNext()){
671         Annotation currentAnnot = (Annotation) iter.next();
672         if(insertsSafety(dumpingSet,currentAnnot)){
673           dumpingSet.add(currentAnnot);
674         }else if (crossedOverAnnotation != null){
675           try {
676             Out.prln("Warning: Annotations were found to violate the " +
677             "crossed over condition: \n" +
678             "1. [" +
679             getContent().getContent(
680                            crossedOverAnnotation.getStartNode().getOffset(),
681                            crossedOverAnnotation.getEndNode().getOffset()) +
682             " (" + crossedOverAnnotation.getType() + ": " +
683             crossedOverAnnotation.getStartNode().getOffset() +
684             ";" + crossedOverAnnotation.getEndNode().getOffset() +
685             ")]\n" +
686             "2. [" +
687             getContent().getContent(
688                            currentAnnot.getStartNode().getOffset(),
689                            currentAnnot.getEndNode().getOffset()) +
690             " (" + currentAnnot.getType() + ": " +
691             currentAnnot.getStartNode().getOffset() +
692             ";" + currentAnnot.getEndNode().getOffset() +
693             ")]\nThe second one will be discarded.\n"  );
694           } catch (gate.util.InvalidOffsetException ex) {
695             throw new GateRuntimeException(ex.getMessage());
696           }
697         }// End if
698       }// End while
699     }// End if
700 
701     // The dumpingSet is ready to be exported as XML
702     // Here we go.
703     if(sListener != null) sListener.statusChanged("Dumping annotations as XML");
704     StringBuffer xmlDoc = new StringBuffer(
705           DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
706 
707     // Add xml header if original format was xml
708     String mimeType = getFeatures() == null ?
709                       null :
710                       (String)getFeatures().get("MimeType");
711     boolean wasXML = mimeType != null && mimeType.equalsIgnoreCase("text/xml");
712 //    boolean needsRootTag = false;
713     if(wasXML){
714       String defaultEncoding = System.getProperty("file.encoding");
715       if(defaultEncoding == null) defaultEncoding = "UTF-8";
716       xmlDoc.append("<?xml version=\"1.0\" encoding=\"" +
717                     (encoding == null ? defaultEncoding : encoding) +
718                     "\" ?>" + Strings.getNl());
719       // Add the root start element if not already there
720 //      AnnotationSet aType = dumpingSet.get("GatePreserveFormat");
721 //      if(aType == null || aType.isEmpty()){
722 //        needsRootTag = true;
723 //        xmlDoc.append("<GatePreserveFormat " +
724 //                      "xmlns:gate=\"http://www.gate.ac.uk\" " +
725 //                      "gate:annotMaxId=\"" + getNextAnnotationId() + "\">");
726 //      }
727     }
728 
729     xmlDoc.append(saveAnnotationSetAsXml(dumpingSet, includeFeatures));
730 
731     xmlDoc.append(rootEnd);
732 //    if(wasXML && needsRootTag){
733 //      xmlDoc.append("</GatePreserveFormat>");
734 //    }
735     if(sListener != null) sListener.statusChanged("Done.");
736     return xmlDoc.toString();
737   }//End toXml()
738 
739   /** This method verifies if aSourceAnnotation can ve inserted safety into the
740     * aTargetAnnotSet. Safety means that it doesn't violate the crossed over
741     * contition with any annotation from the aTargetAnnotSet.
742     * @param aTargetAnnotSet the annotation set to include the aSourceAnnotation
743     * @param aSourceAnnotation the annotation to be inserted into the
744     * aTargetAnnotSet
745     * @return true if the annotation inserts safety, or false otherwise.
746     */
747   private boolean insertsSafety(AnnotationSet aTargetAnnotSet,
748                                                 Annotation aSourceAnnotation){
749 
750     if (aTargetAnnotSet == null || aSourceAnnotation == null) {
751       this.crossedOverAnnotation = null;
752       return false;
753     }
754     if (aSourceAnnotation.getStartNode() == null ||
755         aSourceAnnotation.getStartNode().getOffset()== null) {
756       this.crossedOverAnnotation = null;
757       return false;
758     }
759     if (aSourceAnnotation.getEndNode() == null ||
760         aSourceAnnotation.getEndNode().getOffset()== null) {
761       this.crossedOverAnnotation = null;
762       return false;
763     }
764 
765     // Get the start and end offsets
766     Long start = aSourceAnnotation.getStartNode().getOffset();
767     Long end =   aSourceAnnotation.getEndNode().getOffset();
768     // Read aSourceAnnotation offsets long
769     long s2 = start.longValue();
770     long e2 = end.longValue();
771 
772     // Obtain a set with all annotations annotations that overlap
773     // totaly or partially with the interval defined by the two provided offsets
774     AnnotationSet as = aTargetAnnotSet.get(start,end);
775 
776     // Investigate all the annotations from as to see if there is one that
777     // comes in conflict with aSourceAnnotation
778     Iterator it = as.iterator();
779     while(it.hasNext()){
780       Annotation ann = (Annotation) it.next();
781       // Read ann offsets
782       long s1 = ann.getStartNode().getOffset().longValue();
783       long e1 = ann.getEndNode().getOffset().longValue();
784 
785       if (s1<s2 && s2<e1 && e1<e2) {
786         this.crossedOverAnnotation = ann;
787         return false;
788       }
789       if (s2<s1 && s1<e2 && e2<e1) {
790         this.crossedOverAnnotation = ann;
791         return false;
792       }
793     }// End while
794     return true;
795   }// insertsSafety()
796 
797   /** This method saves all the annotations from aDumpAnnotSet and combines
798     * them with the document content.
799     * @param aDumpAnnotationSet is a GATE annotation set prepared to be used
800     * on the raw text from document content. If aDumpAnnotSet is <b>null<b>
801     * then an empty string will be returned.
802     * @param includeFeatures is a boolean, which controls whether the annotation
803     * features and gate ID are included or not.
804     * @return The XML document obtained from raw text + the information from
805     * the dump annotation set.
806     */
807   private String saveAnnotationSetAsXml(AnnotationSet aDumpAnnotSet,
808                                         boolean includeFeatures){
809     String content = null;
810     if (this.getContent()== null)
811       content = new String("");
812     else
813       content = this.getContent().toString();
814     StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
815     if (aDumpAnnotSet == null)   return docContStrBuff.toString();
816 
817     TreeMap offsets2CharsMap = new TreeMap();
818     if (this.getContent().size().longValue() != 0){
819       // Fill the offsets2CharsMap with all the indices where
820       // special chars appear
821       buildEntityMapFromString(content,offsets2CharsMap);
822     }//End if
823     // The saving alghorithm is as follows:
824     ///////////////////////////////////////////
825     // Construct a set of annot with all IDs in asc order.
826     // All annotations that end at that offset swap their place in descending
827     // order. For each node write all the tags from left to right.
828 
829     // Construct the node set
830     TreeSet offsets = new TreeSet();
831     Iterator iter = aDumpAnnotSet.iterator();
832     while (iter.hasNext()){
833       Annotation annot = (Annotation) iter.next();
834       offsets.add(annot.getStartNode().getOffset());
835       offsets.add(annot.getEndNode().getOffset());
836       //compute the smallest ID
837       if(smallestAnnotationID == null ||
838          smallestAnnotationID.compareTo(annot.getId()) > 0){
839         smallestAnnotationID = annot.getId();
840       }
841     }// End while
842 
843     // ofsets is sorted in ascending order.
844     // Iterate this set in descending order and remove an offset at each
845     // iteration
846     while (!offsets.isEmpty()){
847       Long offset = (Long)offsets.last();
848       // Remove the offset from the set
849       offsets.remove(offset);
850       // Now, use it.
851       // Returns a list with annotations that needs to be serialized in that
852       // offset.
853       List annotations = getAnnotationsForOffset(aDumpAnnotSet,offset);
854       // Attention: the annotation are serialized from left to right
855       StringBuffer tmpBuff = new StringBuffer("");
856       Stack stack = new Stack();
857       // Iterate through all these annotations and serialize them
858       Iterator it = annotations.iterator();
859       while(it.hasNext()){
860         Annotation a = (Annotation) it.next();
861         it.remove();
862         // Test if a Ends at offset
863         if ( offset.equals(a.getEndNode().getOffset()) ){
864           // Test if a Starts at offset
865           if ( offset.equals(a.getStartNode().getOffset()) ){
866             // Here, the annotation a Starts and Ends at the offset
867             if ( null != a.getFeatures().get("isEmptyAndSpan") &&
868                  "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
869 
870               // Assert: annotation a with start == end and isEmptyAndSpan
871               tmpBuff.append(writeStartTag(a, includeFeatures));
872               stack.push(a);
873             }else{
874               // Assert annotation a with start == end and an empty tag
875               tmpBuff.append(writeEmptyTag(a));
876               // The annotation is removed from dumped set
877               aDumpAnnotSet.remove(a);
878             }// End if
879           }else{
880             // Here the annotation a Ends at the offset.
881             // In this case empty the stack and write the end tag
882             if (!stack.isEmpty()){
883               while(!stack.isEmpty()){
884                 Annotation a1 = (Annotation)stack.pop();
885                 tmpBuff.append(writeEndTag(a1));
886               }// End while
887             }// End if
888             tmpBuff.append(writeEndTag(a));
889           }// End if
890         }else{
891           // The annotation a does NOT end at the offset. Let's see if it starts
892           // at the offset
893           if ( offset.equals(a.getStartNode().getOffset()) ){
894             // The annotation a starts at the offset.
895             // In this case empty the stack and write the end tag
896             if (!stack.isEmpty()){
897               while(!stack.isEmpty()){
898                 Annotation a1 = (Annotation)stack.pop();
899                 tmpBuff.append(writeEndTag(a1));
900               }// End while
901             }// End if
902             tmpBuff.append(writeStartTag(a, includeFeatures));
903             // The annotation is removed from dumped set
904             aDumpAnnotSet.remove(a);
905           }// End if ( offset.equals(a.getStartNode().getOffset()) )
906         }// End if ( offset.equals(a.getEndNode().getOffset()) )
907       }// End while(it.hasNext()){
908 
909       // In this case empty the stack and write the end tag
910       if (!stack.isEmpty()){
911         while(!stack.isEmpty()){
912           Annotation a1 = (Annotation)stack.pop();
913           tmpBuff.append(writeEndTag(a1));
914         }// End while
915       }// End if
916 
917       // Before inserting tmpBuff into docContStrBuff we need to check
918       // if there are chars to be replaced and if there are, they would be
919       // replaced.
920       if (!offsets2CharsMap.isEmpty()){
921         Integer offsChar = (Integer) offsets2CharsMap.lastKey();
922         while( !offsets2CharsMap.isEmpty() &&
923                        offsChar.intValue() >= offset.intValue()){
924           // Replace the char at offsChar with its corresponding entity form
925           // the entitiesMap.
926           docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
927           (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
928           // Discard the offsChar after it was used.
929           offsets2CharsMap.remove(offsChar);
930           // Investigate next offsChar
931           if (!offsets2CharsMap.isEmpty())
932             offsChar = (Integer) offsets2CharsMap.lastKey();
933         }// End while
934       }// End if
935       // Insert tmpBuff to the location where it belongs in docContStrBuff
936       docContStrBuff.insert(offset.intValue(),tmpBuff.toString());
937     }// End while(!offsets.isEmpty())
938     // Need to replace the entities in the remaining text, if there is any text
939     // So, if there are any more items in offsets2CharsMap they need to be
940     // replaced
941     while (!offsets2CharsMap.isEmpty()){
942       Integer offsChar = (Integer) offsets2CharsMap.lastKey();
943       // Replace the char with its entity
944       docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
945       (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
946       // remove the offset from the map
947       offsets2CharsMap.remove(offsChar);
948     }// End while
949     return docContStrBuff.toString();
950   }// saveAnnotationSetAsXml()
951 
952   /**
953    *  Return true only if the document has features for original content and
954    *  repositioning information.
955    */
956   private boolean hasOriginalContentFeatures() {
957     FeatureMap features = getFeatures();
958     boolean result = false;
959 
960     result =
961     (features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME) != null)
962       &&
963     (features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME)
964       != null);
965 
966     return result;
967   } // hasOriginalContentFeatures
968 
969   /** This method saves all the annotations from aDumpAnnotSet and combines
970     * them with the original document content, if preserved as feature.
971     * @param aDumpAnnotationSet is a GATE annotation set prepared to be used
972     * on the raw text from document content. If aDumpAnnotSet is <b>null<b>
973     * then an empty string will be returned.
974     * @param includeFeatures is a boolean, which controls whether the annotation
975     * features and gate ID are included or not.
976     * @return The XML document obtained from raw text + the information from
977     * the dump annotation set.
978     */
979   private String saveAnnotationSetAsXmlInOrig(Set aSourceAnnotationSet,
980                                         boolean includeFeatures){
981     StringBuffer docContStrBuff;
982 
983     String origContent;
984 
985     origContent =
986      (String)features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
987     if(origContent == null) {
988       origContent = "";
989     } // if
990 
991     long originalContentSize = origContent.length();
992 
993     RepositioningInfo repositioning = (RepositioningInfo)
994       getFeatures().get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);
995 
996     docContStrBuff = new StringBuffer(origContent);
997     if (aSourceAnnotationSet == null) return docContStrBuff.toString();
998 
999     StatusListener sListener = (StatusListener)
1000                               gate.gui.MainFrame.getListeners().
1001                               get("gate.event.StatusListener");
1002
1003    AnnotationSet originalMarkupsAnnotSet =
1004            this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1005    // Create a dumping annotation set on the document. It will be used for
1006    // dumping annotations...
1007    AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this);
1008    if(sListener != null)
1009      sListener.statusChanged("Constructing the dumping annotation set.");
1010    // Then take all the annotations from aSourceAnnotationSet and verify if
1011    // they can be inserted safely into the dumpingSet. Where not possible,
1012    // report.
1013    if (aSourceAnnotationSet != null){
1014      Iterator iter = aSourceAnnotationSet.iterator();
1015      Annotation currentAnnot;
1016      while (iter.hasNext()){
1017        currentAnnot = (Annotation) iter.next();
1018        if(insertsSafety(originalMarkupsAnnotSet, currentAnnot)
1019            && insertsSafety(dumpingSet, currentAnnot)){
1020          dumpingSet.add(currentAnnot);
1021        }else{
1022          Out.prln("Warning: Annotation with ID=" + currentAnnot.getId() +
1023          ", startOffset=" + currentAnnot.getStartNode().getOffset() +
1024          ", endOffset=" + currentAnnot.getEndNode().getOffset() +
1025          ", type=" + currentAnnot.getType()+ " was found to violate the" +
1026          " crossed over condition. It will be discarded");
1027        }// End if
1028      }// End while
1029    }// End if
1030
1031    // The dumpingSet is ready to be exported as XML
1032    // Here we go.
1033    if(sListener != null) sListener.statusChanged("Dumping annotations as XML");
1034
1035    ///////////////////////////////////////////
1036    // Construct a set of annot with all IDs in asc order.
1037    // All annotations that end at that offset swap their place in descending
1038    // order. For each node write all the tags from left to right.
1039
1040    // Construct the node set
1041    TreeSet offsets = new TreeSet();
1042    Iterator iter = aSourceAnnotationSet.iterator();
1043    while (iter.hasNext()){
1044      Annotation annot = (Annotation) iter.next();
1045      offsets.add(annot.getStartNode().getOffset());
1046      offsets.add(annot.getEndNode().getOffset());
1047    }// End while
1048
1049    // ofsets is sorted in ascending order.
1050    // Iterate this set in descending order and remove an offset at each
1051    // iteration
1052    while (!offsets.isEmpty()){
1053      Long offset = (Long)offsets.last();
1054      // Remove the offset from the set
1055      offsets.remove(offset);
1056      // Now, use it.
1057      // Returns a list with annotations that needs to be serialized in that
1058      // offset.
1059      List annotations = getAnnotationsForOffset(aSourceAnnotationSet,offset);
1060      // Attention: the annotation are serialized from left to right
1061      StringBuffer tmpBuff = new StringBuffer("");
1062      Stack stack = new Stack();
1063      // Iterate through all these annotations and serialize them
1064      Iterator it = annotations.iterator();
1065      Annotation a = null;
1066      while(it.hasNext()) {
1067        a = (Annotation) it.next();
1068        it.remove();
1069        // Test if a Ends at offset
1070        if ( offset.equals(a.getEndNode().getOffset()) ){
1071          // Test if a Starts at offset
1072          if ( offset.equals(a.getStartNode().getOffset()) ){
1073            // Here, the annotation a Starts and Ends at the offset
1074            if ( null != a.getFeatures().get("isEmptyAndSpan") &&
1075                 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
1076
1077              // Assert: annotation a with start == end and isEmptyAndSpan
1078              tmpBuff.append(writeStartTag(a, includeFeatures, false));
1079              stack.push(a);
1080            }else{
1081              // Assert annotation a with start == end and an empty tag
1082              tmpBuff.append(writeEmptyTag(a, false));
1083              // The annotation is removed from dumped set
1084              aSourceAnnotationSet.remove(a);
1085            }// End if
1086          }else{
1087            // Here the annotation a Ends at the offset.
1088            // In this case empty the stack and write the end tag
1089            while(!stack.isEmpty()){
1090              Annotation a1 = (Annotation)stack.pop();
1091              tmpBuff.append(writeEndTag(a1));
1092            }// End while
1093            tmpBuff.append(writeEndTag(a));
1094          }// End if
1095        }else{
1096          // The annotation a does NOT end at the offset. Let's see if it starts
1097          // at the offset
1098          if ( offset.equals(a.getStartNode().getOffset()) ){
1099            // The annotation a starts at the offset.
1100            // In this case empty the stack and write the end tag
1101            while(!stack.isEmpty()){
1102              Annotation a1 = (Annotation)stack.pop();
1103              tmpBuff.append(writeEndTag(a1));
1104            }// End while
1105
1106            tmpBuff.append(writeStartTag(a, includeFeatures, false));
1107            // The annotation is removed from dumped set
1108            aSourceAnnotationSet.remove(a);
1109          }// End if ( offset.equals(a.getStartNode().getOffset()) )
1110        }// End if ( offset.equals(a.getEndNode().getOffset()) )
1111      }// End while(it.hasNext()){
1112
1113      // In this case empty the stack and write the end tag
1114      while(!stack.isEmpty()){
1115        Annotation a1 = (Annotation)stack.pop();
1116        tmpBuff.append(writeEndTag(a1));
1117      }// End while
1118
1119      long originalPosition = -1;
1120      boolean backPositioning =
1121        a != null && offset.equals(a.getEndNode().getOffset());
1122      if ( backPositioning ) {
1123        // end of the annotation correction
1124        originalPosition =
1125          repositioning.getOriginalPos(offset.intValue(), true);
1126      } // if
1127
1128      if(originalPosition == -1) {
1129        originalPosition = repositioning.getOriginalPos(offset.intValue());
1130      } // if
1131
1132      // Insert tmpBuff to the location where it belongs in docContStrBuff
1133      if(originalPosition != -1 && originalPosition <= originalContentSize ) {
1134        docContStrBuff.insert((int) originalPosition, tmpBuff.toString());
1135      }
1136      else {
1137        Out.prln("Error in the repositioning. The offset ("+offset.intValue()
1138        +") could not be positioned in the original document. \n"
1139        +"Calculated position is: "+originalPosition
1140        +" placed back: "+backPositioning);
1141      } // if
1142
1143    }// End while(!offsets.isEmpty())
1144    docContStrBuff.append(rootEnd);
1145    return docContStrBuff.toString();
1146  } // saveAnnotationSetAsXml()
1147
1148  /** This method returns a list with annotations ordered that way that
1149    * they can be serialized from left to right, at the offset. If one of the
1150    * params is null then an empty list will be returned.
1151    * @param aDumpAnnotSet is a set containing all annotations that will be
1152    * dumped.
1153    * @param offset represent the offset at witch the annotation must start
1154    * AND/OR end.
1155    * @return a list with those annotations that need to be serialized.
1156    */
1157  private List getAnnotationsForOffset(Set aDumpAnnotSet, Long offset){
1158    List annotationList = new LinkedList();
1159    if (aDumpAnnotSet == null || offset == null) return annotationList;
1160    Set annotThatStartAtOffset = new TreeSet(
1161                          new AnnotationComparator(ORDER_ON_END_OFFSET,DESC));
1162    Set annotThatEndAtOffset = new TreeSet(
1163                          new AnnotationComparator(ORDER_ON_START_OFFSET,DESC));
1164    Set annotThatStartAndEndAtOffset = new TreeSet(
1165                          new AnnotationComparator(ORDER_ON_ANNOT_ID,ASC));
1166
1167    // Fill these tree lists with annotation tat start, end or start and
1168    // end at the offset.
1169    Iterator iter = aDumpAnnotSet.iterator();
1170    while(iter.hasNext()){
1171      Annotation ann = (Annotation) iter.next();
1172      if (offset.equals(ann.getStartNode().getOffset())){
1173        if (offset.equals(ann.getEndNode().getOffset()))
1174          annotThatStartAndEndAtOffset.add(ann);
1175        else
1176          annotThatStartAtOffset.add(ann);
1177      }else{
1178        if (offset.equals(ann.getEndNode().getOffset()))
1179          annotThatEndAtOffset.add(ann);
1180      }// End if
1181    }// End while
1182    annotationList.addAll(annotThatEndAtOffset);
1183    annotThatEndAtOffset = null;
1184    annotationList.addAll(annotThatStartAtOffset);
1185    annotThatStartAtOffset = null;
1186    iter = annotThatStartAndEndAtOffset.iterator();
1187    while(iter.hasNext()){
1188      Annotation ann = (Annotation) iter.next();
1189      Iterator it = annotationList.iterator();
1190      boolean breaked = false;
1191      while (it.hasNext()){
1192        Annotation annFromList = (Annotation) it.next();
1193        if (annFromList.getId().intValue() > ann.getId().intValue()){
1194          annotationList.add(annotationList.indexOf(annFromList),ann);
1195          breaked = true;
1196          break;
1197        }// End if
1198      }// End while
1199      if (!breaked)
1200        annotationList.add(ann);
1201      iter.remove();
1202    }// End while
1203    return annotationList;
1204  }// getAnnotationsForOffset()
1205
1206  private String writeStartTag(Annotation annot, boolean includeFeatures){
1207    return writeStartTag(annot, includeFeatures, true);
1208  } // writeStartTag
1209
1210  /** Returns a string representing a start tag based on the input annot*/
1211  private String writeStartTag(Annotation annot, boolean includeFeatures,
1212                                boolean includeNamespace){
1213    AnnotationSet originalMarkupsAnnotSet =
1214            this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1215
1216    StringBuffer strBuff = new StringBuffer("");
1217    if (annot == null) return strBuff.toString();
1218//    if (!addGatePreserveFormatTag && isRootTag){
1219      if (annot.getId().equals(smallestAnnotationID)){
1220      //the features are included either if desired or if that's an annotation
1221      //from the original markup of the document. We don't want for example to
1222      //spoil all links in an HTML file!
1223      if (includeFeatures) {
1224        strBuff.append("<");
1225        strBuff.append(annot.getType());
1226        strBuff.append(" ");
1227        if(includeNamespace) {
1228          strBuff.append(" xmlns:gate=\"http://www.gate.ac.uk\"");
1229          strBuff.append(" gate:");
1230        }
1231        strBuff.append("gateId=\"");
1232        strBuff.append(annot.getId());
1233        strBuff.append("\"");
1234        strBuff.append(" ");
1235        if(includeNamespace) {
1236          strBuff.append("gate:");
1237        }
1238        strBuff.append("annotMaxId=\"");
1239        strBuff.append(nextAnnotationId);
1240        strBuff.append("\"");
1241        strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1242        strBuff.append(">");
1243      }
1244      else if (originalMarkupsAnnotSet.contains(annot)) {
1245          strBuff.append("<");
1246          strBuff.append(annot.getType());
1247          strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1248          strBuff.append(">");
1249        }
1250      else {
1251        strBuff.append("<");
1252        strBuff.append(annot.getType());
1253        strBuff.append(">");
1254      }
1255
1256    }else{
1257      //the features are included either if desired or if that's an annotation
1258      //from the original markup of the document. We don't want for example to
1259      //spoil all links in an HTML file!
1260      if (includeFeatures) {
1261        strBuff.append("<");
1262        strBuff.append(annot.getType());
1263        strBuff.append(" ");
1264        if(includeNamespace) {
1265          strBuff.append("gate:");
1266        } // if includeNamespaces
1267        strBuff.append("gateId=\"");
1268        strBuff.append(annot.getId());
1269        strBuff.append("\"");
1270        strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1271        strBuff.append(">");
1272      }
1273      else if (originalMarkupsAnnotSet.contains(annot)) {
1274        strBuff.append("<");
1275        strBuff.append(annot.getType());
1276        strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1277        strBuff.append(">");
1278      }
1279      else {
1280        strBuff.append("<");
1281        strBuff.append(annot.getType());
1282        strBuff.append(">");
1283      }
1284    }// End if
1285    return strBuff.toString();
1286  }// writeStartTag()
1287
1288  /** This method takes aScanString and searches for those chars from
1289    * entitiesMap that appear in the string. A tree map(offset2Char) is filled
1290    * using as key the offsets where those Chars appear and the Char.
1291    * If one of the params is null the method simply returns.
1292    */
1293  private void buildEntityMapFromString(String aScanString, TreeMap aMapToFill){
1294    if (aScanString == null || aMapToFill == null) return;
1295    if (entitiesMap == null || entitiesMap.isEmpty()){
1296      Err.prln("WARNING: Entities map was not initialised !");
1297      return;
1298    }// End if
1299    // Fill the Map with the offsets of the special chars
1300    Iterator entitiesMapIterator = entitiesMap.keySet().iterator();
1301    while(entitiesMapIterator.hasNext()){
1302      Character c = (Character) entitiesMapIterator.next();
1303      int fromIndex = 0;
1304      while (-1 != fromIndex){
1305        fromIndex = aScanString.indexOf(c.charValue(),fromIndex);
1306        if (-1 != fromIndex){
1307          aMapToFill.put(new Integer(fromIndex),c);
1308          fromIndex ++;
1309        }// End if
1310      }// End while
1311    }// End while
1312  }//buildEntityMapFromString();
1313
1314  private String writeEmptyTag(Annotation annot){
1315    return writeEmptyTag(annot, true);
1316  } // writeEmptyTag
1317
1318  /** Returns a string representing an empty tag based on the input annot*/
1319  private String writeEmptyTag(Annotation annot, boolean includeNamespace){
1320    StringBuffer strBuff = new StringBuffer("");
1321    if (annot == null) return strBuff.toString();
1322
1323    strBuff.append("<");
1324    strBuff.append(annot.getType());
1325
1326    AnnotationSet originalMarkupsAnnotSet =
1327            this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1328    if (! originalMarkupsAnnotSet.contains(annot)) {
1329      strBuff.append(" gateId=\"");
1330      strBuff.append(annot.getId());
1331      strBuff.append("\"");
1332    }
1333    strBuff.append(writeFeatures(annot.getFeatures(),includeNamespace));
1334    strBuff.append("/>");
1335
1336    return strBuff.toString();
1337  }// writeEmptyTag()
1338
1339  /** Returns a string representing an end tag based on the input annot*/
1340  private String writeEndTag(Annotation annot){
1341    StringBuffer strBuff = new StringBuffer("");
1342    if (annot == null) return strBuff.toString();
1343/*
1344    if (annot.getType().indexOf(" ") != -1)
1345      Out.prln("Warning: Truncating end tag to first word for annot type \""
1346      +annot.getType()+ "\". ");
1347*/
1348    strBuff.append("</"+annot.getType()+">");
1349
1350    //don't write the end for the root element as it will be added
1351    //automatically at the end.
1352    if(annot.getId().equals(smallestAnnotationID)){
1353      rootEnd = strBuff.toString();
1354      return "";
1355    }
1356    return strBuff.toString();
1357  }// writeEndTag()
1358
1359  /** Returns a string representing a FeatureMap serialized as XML attributes*/
1360  private String writeFeatures(FeatureMap feat, boolean includeNamespace){
1361    StringBuffer strBuff = new StringBuffer("");
1362    if (feat == null) return strBuff.toString();
1363    Iterator it = feat.keySet().iterator();
1364    while (it.hasNext()){
1365      Object key = it.next();
1366      Object value = feat.get(key);
1367      if ( (key != null) && (value != null) ){
1368        // Eliminate a feature inserted at reading time and which help to
1369        // take some decissions at saving time
1370        if ("isEmptyAndSpan".equals(key.toString()))
1371          continue;
1372        if( !(String.class.isAssignableFrom(key.getClass()) ||
1373              Number.class.isAssignableFrom(key.getClass()))){
1374
1375            Out.prln("Warning:Found a feature NAME("+key+") that doesn't came"+
1376                             " from String or Number.(feature discarded)");
1377            continue;
1378        }// End if
1379        if ( !(String.class.isAssignableFrom(value.getClass()) ||
1380               Number.class.isAssignableFrom(value.getClass()) ||
1381               java.util.Collection.class.isAssignableFrom(value.getClass()))){
1382
1383            Out.prln("Warning:Found a feature VALUE("+value+") that doesn't came"+
1384                       " from String, Number or Collection.(feature discarded)");
1385            continue;
1386        }// End if
1387        if ("matches".equals(key)) {
1388          strBuff.append(" ");
1389          if(includeNamespace) {
1390            strBuff.append("gate:");
1391          }
1392//          strBuff.append(key);
1393          // replace non XML chars in attribute name
1394          strBuff.append(
1395            filterNonXmlChars(replaceCharsWithEntities(key.toString())));
1396          strBuff.append("=\"");
1397        }
1398        else {
1399          strBuff.append(" ");
1400//          strBuff.append(key);
1401          // replace non XML chars in attribute name
1402          strBuff.append(
1403            filterNonXmlChars(replaceCharsWithEntities(key.toString())));
1404          strBuff.append("=\"");
1405        }
1406        if (java.util.Collection.class.isAssignableFrom(value.getClass())){
1407          Iterator valueIter = ((Collection)value).iterator();
1408          while(valueIter.hasNext()){
1409            Object item = valueIter.next();
1410            if (!(String.class.isAssignableFrom(item.getClass()) ||
1411                  Number.class.isAssignableFrom(item.getClass())))
1412                  continue;
1413//            strBuff.append(item);
1414            // replace non XML chars in collection item
1415            strBuff.append(
1416              filterNonXmlChars(replaceCharsWithEntities(item.toString())));
1417            strBuff.append(";");
1418          }// End while
1419          if (strBuff.charAt(strBuff.length()-1) == ';')
1420            strBuff.deleteCharAt(strBuff.length()-1);
1421        }else{
1422//          strBuff.append(value);
1423          // replace non XML chars in attribute value
1424          strBuff.append(
1425            filterNonXmlChars(replaceCharsWithEntities(value.toString())));
1426        }// End if
1427        strBuff.append("\"");
1428      }// End if
1429    }// End while
1430    return strBuff.toString();
1431  }// writeFeatures()
1432
1433  /** Returns a GateXml document that is a custom XML format for wich there is
1434    * a reader inside GATE called gate.xml.GateFormatXmlHandler.
1435    * What it does is to serialize a GATE document in an XML format.
1436    * @return a string representing a Gate Xml document. If saved in a file,this
1437    * string must be written using the UTF-8 encoding because the first line
1438    * in the generated xml document is <?xml version="1.0" encoding="UTF-8" ?>
1439    */
1440  public String toXml(){
1441    // Initialize the xmlContent with 3 time the size of the current document.
1442    // This is because of the tags size. This measure is made to increase the
1443    // performance of StringBuffer.
1444    StringBuffer xmlContent = new StringBuffer(
1445         DOC_SIZE_MULTIPLICATION_FACTOR*(getContent().size().intValue()));
1446    // Add xml header
1447    xmlContent.append("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n");
1448    // Add the root element
1449    xmlContent.append("<GateDocument>\n");
1450    xmlContent.append("<!-- The document's features-->\n\n");
1451    xmlContent.append("<GateDocumentFeatures>\n");
1452
1453    xmlContent.append(featuresToXml(this.getFeatures()));
1454    xmlContent.append("</GateDocumentFeatures>\n");
1455    xmlContent.append("<!-- The document content area with serialized"+
1456                      " nodes -->\n\n");
1457    // Add plain text element
1458    xmlContent.append("<TextWithNodes>");
1459    xmlContent.append(textWithNodes(this.getContent().toString()));
1460    xmlContent.append("</TextWithNodes>\n");
1461    // Serialize as XML all document's annotation sets
1462    // Serialize the default AnnotationSet
1463    StatusListener sListener = (StatusListener)
1464                               gate.gui.MainFrame.getListeners().
1465                               get("gate.event.StatusListener");
1466    if(sListener != null)
1467      sListener.statusChanged("Saving the default annotation set ");
1468    xmlContent.append("<!-- The default annotation set -->\n\n");
1469    xmlContent.append(annotationSetToXml(this.getAnnotations()));
1470    // Serialize all others AnnotationSets
1471    // namedAnnotSets is a Map containing all other named Annotation Sets.
1472    if (namedAnnotSets != null){
1473      Iterator iter = namedAnnotSets.values().iterator();
1474      while(iter.hasNext()){
1475        AnnotationSet annotSet = (AnnotationSet) iter.next();
1476        xmlContent.append("<!-- Named annotation set -->\n\n");
1477        // Serialize it as XML
1478        if(sListener != null) sListener.statusChanged("Saving " +
1479                                                      annotSet.getName()+
1480                                                      " annotation set ");
1481        xmlContent.append(annotationSetToXml(annotSet));
1482      }// End while
1483    }// End if
1484    // Add the end of GateDocument
1485    xmlContent.append("</GateDocument>");
1486    if(sListener != null) sListener.statusChanged("Done !");
1487    // return the XmlGateDocument
1488    return xmlContent.toString();
1489  }// toXml
1490
1491  /** This method filters any non XML char
1492    * see: http://www.w3c.org/TR/2000/REC-xml-20001006#charsets
1493    * All non XML chars will be replaced with 0x20 (space char) This assures
1494    * that the next time the document is loaded there won't be any problems.
1495    * @param aStrBuffer represents the input String that is filtred. If the
1496    * aStrBuffer is null then an empty string will be returend
1497    * @return the "purified" StringBuffer version of the aStrBuffer
1498    */
1499  private StringBuffer filterNonXmlChars(StringBuffer aStrBuffer){
1500    if (aStrBuffer == null) return new StringBuffer("");
1501    String space = new String(" ");
1502    for (int i=aStrBuffer.length()-1;i>=0; i--){
1503      if (!isXmlChar(aStrBuffer.charAt(i)))
1504        aStrBuffer.replace(i,i+1,space);
1505    }// End for
1506    return aStrBuffer;
1507  }// filterNonXmlChars()
1508
1509  /** This method decide if a char is a valid XML one or not
1510    * @param ch the char to be tested
1511    * @return true if is a valid XML char and fals if is not.
1512    */
1513  public static boolean isXmlChar(char ch){
1514    if (ch == 0x9 || ch == 0xA || ch ==0xD) return true;
1515    if ((0x20 <= ch) && (ch <= 0xD7FF)) return true;
1516    if ((0xE000 <= ch) && (ch <= 0xFFFD)) return true;
1517    if ((0x10000 <= ch) && (ch <= 0x10FFFF)) return true;
1518    return false;
1519  }// End isXmlChar()
1520
1521  /** This method saves a FeatureMap as XML elements.
1522    * @ param aFeatureMap the feature map that has to be saved as XML.
1523    * @ return a String like this: <Feature><Name>...</Name>
1524    * <Value>...</Value></Feature><Feature>...</Feature>
1525    */
1526  private String featuresToXml(FeatureMap aFeatureMap){
1527    StringBuffer str = new StringBuffer("");
1528
1529    if (aFeatureMap == null) return str.toString();
1530
1531    Set keySet = aFeatureMap.keySet();
1532    Iterator keyIterator = keySet.iterator();
1533    while(keyIterator.hasNext()){
1534      Object key = keyIterator.next();
1535      Object value = aFeatureMap.get(key);
1536      if ((key != null) && (value != null)){
1537        String keyClassName = null;
1538        String keyItemClassName = null;
1539        String valueClassName = null;
1540        String valueItemClassName = null;
1541        String key2String = key.toString();
1542        String value2String = value.toString();
1543
1544        Object item = null;
1545        // Test key if it is String, Number or Collection
1546        if (key instanceof java.lang.String ||
1547            key instanceof java.lang.Number ||
1548            key instanceof java.util.Collection)
1549          keyClassName = key.getClass().getName();
1550
1551        // Test value if it is String, Number or Collection
1552        if (value instanceof java.lang.String ||
1553            value instanceof java.lang.Number ||
1554            value instanceof java.util.Collection)
1555          valueClassName = value.getClass().getName();
1556
1557        // Features and values that are not Strings, Numbers or collections
1558        // will be discarded.
1559        if (keyClassName == null || valueClassName == null) continue;
1560
1561        // If key is collection serialize the colection in a specific format
1562        if (key instanceof java.util.Collection){
1563          StringBuffer keyStrBuff = new StringBuffer("");
1564          Iterator iter = ((Collection) key).iterator();
1565          if (iter.hasNext()){
1566            item = iter.next();
1567            if (item instanceof java.lang.Number)
1568              keyItemClassName = item.getClass().getName();
1569            else
1570              keyItemClassName = String.class.getName();
1571            keyStrBuff.append(item.toString());
1572          }// End if
1573          while (iter.hasNext()){
1574            item = iter.next();
1575            keyStrBuff.append(";" + item.toString());
1576          }// End while
1577          key2String = keyStrBuff.toString();
1578        }// End if
1579        // If key is collection serialize the colection in a specific format
1580        if (value instanceof java.util.Collection){
1581          StringBuffer valueStrBuff = new StringBuffer("");
1582          Iterator iter = ((Collection) value).iterator();
1583          if (iter.hasNext()){
1584            item = iter.next();
1585            if (item instanceof java.lang.Number)
1586              valueItemClassName = item.getClass().getName();
1587            else
1588              valueItemClassName = String.class.getName();
1589            valueStrBuff.append(item.toString());
1590          }// End if
1591          while (iter.hasNext()){
1592            item = iter.next();
1593            valueStrBuff.append(";" + item.toString());
1594          }// End while
1595          value2String = valueStrBuff.toString();
1596        }// End if
1597        str.append("<Feature>\n  <Name");
1598        if (keyClassName != null)
1599          str.append(" className=\""+keyClassName+"\"");
1600        if (keyItemClassName != null)
1601          str.append(" itemClassName=\""+keyItemClassName+"\"");
1602        str.append(">");
1603        str.append(filterNonXmlChars(replaceCharsWithEntities(key2String)));
1604        str.append("</Name>\n  <Value");
1605        if (valueClassName != null)
1606          str.append(" className=\"" + valueClassName + "\"");
1607        if (valueItemClassName != null)
1608          str.append(" itemClassName=\"" + valueItemClassName + "\"");
1609        str.append(">");
1610        str.append(filterNonXmlChars(replaceCharsWithEntities(value2String)));
1611        str.append("</Value>\n</Feature>\n");
1612      }// End if
1613    }// end While
1614    return str.toString();
1615  }//featuresToXml
1616
1617  /** This method replace all chars that appears in the anInputString and also
1618    * that are in the entitiesMap with their corresponding entity
1619    * @param anInputString the string analyzed. If it is null then returns the
1620    *  empty string
1621    * @return a string representing the input string with chars replaced with
1622    *  entities
1623    */
1624  private StringBuffer replaceCharsWithEntities(String anInputString){
1625    if (anInputString == null) return new StringBuffer("");
1626    StringBuffer strBuff = new StringBuffer(anInputString);
1627    for (int i=strBuff.length()-1; i>=0; i--){
1628      Character ch = new Character(strBuff.charAt(i));
1629      if (entitiesMap.keySet().contains(ch)){
1630        strBuff.replace(i,i+1,(String) entitiesMap.get(ch));
1631      }// End if
1632    }// End for
1633    return strBuff;
1634  }//replaceCharsWithEntities()
1635
1636  /** This method creates Node XML elements and inserts them at the
1637    * corresponding offset inside the text. Nodes are created from the default
1638    * annotation set, as well as from all existing named annotation sets.
1639    * @param aText The text representing the document's plain text.
1640    * @return The text with empty <Node id="NodeId"/> elements.
1641    */
1642  private String textWithNodes(String aText){
1643    if (aText == null) return new String("");
1644    StringBuffer textWithNodes = filterNonXmlChars(new StringBuffer(aText));
1645
1646    // Construct a map from offsets to Chars
1647    TreeMap offsets2CharsMap = new TreeMap();
1648    if (aText.length()!= 0){
1649      // Fill the offsets2CharsMap with all the indices where special chars appear
1650      buildEntityMapFromString(aText,offsets2CharsMap);
1651    }//End if
1652    // Construct the offsetsSet for all nodes belonging to this document
1653    TreeSet offsetsSet = new TreeSet();
1654    Iterator annotSetIter = this.getAnnotations().iterator();
1655    while (annotSetIter.hasNext()){
1656      Annotation annot = (Annotation) annotSetIter.next();
1657      offsetsSet.add(annot.getStartNode().getOffset());
1658      offsetsSet.add(annot.getEndNode().getOffset());
1659    }// end While
1660    // Get the nodes from all other named annotation sets.
1661    if (namedAnnotSets != null){
1662      Iterator iter = namedAnnotSets.values().iterator();
1663      while(iter.hasNext()){
1664        AnnotationSet annotSet = (AnnotationSet) iter.next();
1665        Iterator iter2 = annotSet.iterator();
1666        while(iter2.hasNext()){
1667          Annotation annotTmp = (Annotation) iter2.next();
1668          offsetsSet.add(annotTmp.getStartNode().getOffset());
1669          offsetsSet.add(annotTmp.getEndNode().getOffset());
1670        }// End while
1671      }// End while
1672    }// End if
1673    // offsetsSet is ordered in ascending order because the structure
1674    // is a TreeSet
1675
1676    if (offsetsSet.isEmpty()){
1677      return replaceCharsWithEntities(aText).toString();
1678    }// End if
1679    // Iterate through all nodes from anAnnotSet and transform them to
1680    // XML elements. Then insert those elements at the node's offset into the
1681    // textWithNodes .
1682    while (!offsetsSet.isEmpty()){
1683      Long offset = (Long) offsetsSet.last();
1684      // Eliminate the offset from the list in order to create more memory space
1685      offsetsSet.remove(offset);
1686      // Use offset
1687      int offsetValue = offset.intValue();
1688      String strNode = "<Node id=\"" + offsetValue + "\"/>";
1689      // Before inserting this string into the textWithNodes, check to see if
1690      // there are any chars to be replaced with their corresponding entities
1691      if (!offsets2CharsMap.isEmpty()){
1692        Integer offsChar = (Integer) offsets2CharsMap.lastKey();
1693        while( !offsets2CharsMap.isEmpty() &&
1694                       offsChar.intValue() >= offset.intValue()){
1695          // Replace the char at offsChar with its corresponding entity form
1696          // the entitiesMap.
1697          textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1,
1698          (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1699          // Discard the offsChar after it was used because this offset will
1700          // never appear again
1701          offsets2CharsMap.remove(offsChar);
1702          // Investigate next offsChar
1703          if (!offsets2CharsMap.isEmpty())
1704            offsChar = (Integer) offsets2CharsMap.lastKey();
1705        }// End while
1706      }// End if
1707      // Now it is safe to insert the node
1708      textWithNodes.insert(offsetValue,strNode);
1709    }// end while
1710    // Need to replace the entities in the remaining text, if there is any text
1711    // So, if there are any more items in offsets2CharsMap they need to be
1712    // replaced
1713    while (!offsets2CharsMap.isEmpty()){
1714      Integer offsChar = (Integer) offsets2CharsMap.lastKey();
1715      // Replace the char with its entity
1716      textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1,
1717      (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1718      // remove the offset from the map
1719      offsets2CharsMap.remove(offsChar);
1720    }// End while
1721    return textWithNodes.toString();
1722  }//textWithNodes()
1723
1724  /** This method saves an AnnotationSet as XML.
1725    * @param anAnnotationSet The annotation set that has to be saved as XML.
1726    * @return a String like this: <AnnotationSet> <Annotation>....
1727    * </AnnotationSet>
1728    */
1729  private String annotationSetToXml(AnnotationSet anAnnotationSet){
1730    StringBuffer str = new StringBuffer("");
1731
1732    if (anAnnotationSet == null){
1733      str.append("<AnnotationSet>\n");
1734      str.append("</AnnotationSet>\n");
1735      return str.toString();
1736    }// End if
1737    if (anAnnotationSet.getName() == null)
1738      str.append("<AnnotationSet>\n");
1739    else str.append("<AnnotationSet Name=\"" + anAnnotationSet.getName()+
1740                                                                    "\" >\n");
1741    // Iterate through AnnotationSet and save each Annotation as XML
1742    Iterator iterator = anAnnotationSet.iterator();
1743    while (iterator.hasNext()){
1744      Annotation annot = (Annotation) iterator.next();
1745      str.append("<Annotation " + "Type=\"" + annot.getType() +
1746                  "\" StartNode=\"" + annot.getStartNode().getOffset() +
1747                   "\" EndNode=\"" + annot.getEndNode().getOffset() + "\">\n");
1748      str.append(featuresToXml(annot.getFeatures()));
1749      str.append("</Annotation>\n");
1750    }// End while
1751
1752    str.append("</AnnotationSet>\n");
1753    return str.toString();
1754  }// annotationSetToXml
1755
  /** Returns the map holding the named annotation sets. The internal map
   *  itself is handed out, not a copy. The result is <code>null</code> when
   *  the map has not been created, i.e. no named annotation set exists.
   *  @return the map of named annotation sets, possibly <code>null</code>
   */
  public Map getNamedAnnotationSets() {
    return namedAnnotSets;
  } // getNamedAnnotationSets
1761
1762  /**
1763   * Removes one of the named annotation sets.
1764   * Note that the default annotation set cannot be removed.
1765   * @param name the name of the annotation set to be removed
1766   */
1767  public void removeAnnotationSet(String name){
1768    Object removed = namedAnnotSets.remove(name);
1769    if(removed != null){
1770      fireAnnotationSetRemoved(
1771        new DocumentEvent(this, DocumentEvent.ANNOTATION_SET_REMOVED, name));
1772    }
1773  }
1774
1775  /** Propagate edit changes to the document content and annotations. */
1776  public void edit(Long start, Long end, DocumentContent replacement)
1777    throws InvalidOffsetException
1778  {
1779    if(! isValidOffsetRange(start, end))
1780      throw new InvalidOffsetException();
1781
1782    if(content != null)
1783      ((DocumentContentImpl) content).edit(start, end, replacement);
1784
1785    if(defaultAnnots != null)
1786      ((AnnotationSetImpl) defaultAnnots).edit(start, end, replacement);
1787
1788    if(namedAnnotSets != null) {
1789      Iterator iter = namedAnnotSets.values().iterator();
1790      while(iter.hasNext())
1791        ((AnnotationSetImpl) iter.next()).edit(start, end, replacement);
1792    }
1793
1794  } // edit(start,end,replacement)
1795
1796  /** Check that an offset is valid, i.e. it is non-null, greater than
1797    * or equal to 0 and less than the size of the document content.
1798    */
1799  public boolean isValidOffset(Long offset) {
1800    if(offset == null)
1801      return false;
1802
1803    long o = offset.longValue();
1804    if(o > getContent().size().longValue() || o < 0)
1805      return false;
1806
1807    return true;
1808  } // isValidOffset
1809
1810  /** Check that both start and end are valid offsets and that
1811    * they constitute a valid offset range, i.e. start is greater
1812    * than or equal to long.
1813    */
1814  public boolean isValidOffsetRange(Long start, Long end) {
1815    return
1816      isValidOffset(start) && isValidOffset(end) &&
1817      start.longValue() <= end.longValue();
1818  } // isValidOffsetRange(start,end)
1819
  /** Sets the nextAnnotationId, i.e. the value that the next call to
    * getNextAnnotationId() will hand out.
    * @param aNextAnnotationId the new value of the counter
    */
  public void setNextAnnotationId(int aNextAnnotationId){
    nextAnnotationId = aNextAnnotationId;
  }// setNextAnnotationId();
1824
  /** Generate and return the next annotation ID. The current value of the
    * counter is returned and the counter is then incremented by one.
    * @return the annotation ID, wrapped in an Integer
    */
  public Integer getNextAnnotationId() {
    return new Integer(nextAnnotationId++);
  } // getNextAnnotationId
1829
  /** Generate and return the next node ID. The current value of the
    * counter is returned and the counter is then incremented by one.
    * @return the node ID, wrapped in an Integer
    */
  public Integer getNextNodeId() { return new Integer(nextNodeId++); }
1832
1833  /** Ordering based on URL.toString() and the URL offsets (if any) */
1834  public int compareTo(Object o) throws ClassCastException {
1835    DocumentImpl other = (DocumentImpl) o;
1836    return getOrderingString().compareTo(other.getOrderingString());
1837  } // compareTo
1838
1839  /** Utility method to produce a string for comparison in ordering.
1840    * String is based on the source URL and offsets.
1841    */
1842  protected String getOrderingString() {
1843    if(sourceUrl == null) return toString();
1844
1845    StringBuffer orderingString = new StringBuffer(sourceUrl.toString());
1846    if(sourceUrlStartOffset != null && sourceUrlEndOffset != null) {
1847      orderingString.append(sourceUrlStartOffset.toString());
1848      orderingString.append(sourceUrlEndOffset.toString());
1849    }
1850
1851    return orderingString.toString();
1852  } // getOrderingString()
1853
  /** The id of the next new annotation; handed out and incremented by
    * getNextAnnotationId().
    */
  protected int nextAnnotationId = 0;

  /** The id of the next new node; handed out and incremented by
    * getNextNodeId().
    */
  protected int nextNodeId = 0;
  /** The source URL. May be null. */
  protected URL sourceUrl;

  // NOTE(review): a comment reading "The document's URL name." stood here
  // with no field following it - confirm whether a field was removed.

  /** The content of the document */
  protected DocumentContent content;

  /** The encoding of the source of the document content; initialised
    * to "UTF-8".
    */
  protected String encoding = "UTF-8";
1869
  // Data needed in the toXml(AnnotationSet) methods
1871
1872  /** This field indicates whether or not to add the tag
1873    * called GatePreserveFormat to the document. HTML, XML, SGML docs won't
1874    * have this tag added
1875    */
1876//  private boolean addGatePreserveFormatTag = false;
1877
1878  /**
1879   * Used by the XML dump preserving format method to remember the smallest
1880   * annoation ID as a marker for the XML document root.
1881   */
1882  private Integer smallestAnnotationID = null;
1883
1884  /**
1885   * The closing tag for the document root.
1886   */
1887  private String rootEnd;
1888
1889  /** This field is used when creating StringBuffers for toXml() methods.
1890    * The size of the StringBuffer will be docDonctent.size() multiplied by this
1891    * value. It is aimed to improve the performance of StringBuffer
1892    */
1893  private final int DOC_SIZE_MULTIPLICATION_FACTOR = 1;
1894
1895  /** Constant used in the inner class AnnotationComparator to order
1896    * annotations on their start offset
1897    */
1898  private final int ORDER_ON_START_OFFSET = 0;
1899  /** Constant used in the inner class AnnotationComparator to order
1900    * annotations on their end offset
1901    */
1902  private final int ORDER_ON_END_OFFSET = 1;
1903  /** Constant used in the inner class AnnotationComparator to order
1904    * annotations on their ID
1905    */
1906  private final int ORDER_ON_ANNOT_ID = 2;
1907  /** Constant used in the inner class AnnotationComparator to order
1908    * annotations ascending
1909    */
1910  private final int ASC = 3;
1911  /** Constant used in the inner class AnnotationComparator to order
1912    * annotations descending
1913    */
1914  private final int DESC = -3;
1915
  /** A map, filled by the static initializer below, containing the chars
    * that need to be replaced in strings when saving as XML. Keys are
    * Character objects and values are the entity Strings replacing them.
    */
  private static Map entitiesMap = null;
  // Initialize the entities map used when saving as xml: the five characters
  // '<', '>', '&', apostrophe and double quote, plus chars 160 and 169,
  // which are written as numeric character references.
  static{
    entitiesMap = new HashMap();
    entitiesMap.put(new Character('<'),"&lt;");
    entitiesMap.put(new Character('>'),"&gt;");
    entitiesMap.put(new Character('&'),"&amp;");
    entitiesMap.put(new Character('\''),"&apos;");
    entitiesMap.put(new Character('"'),"&quot;");
    entitiesMap.put(new Character((char)160),"&#160;");
    entitiesMap.put(new Character((char)169),"&#169;");
  }//static
1931
1932  /** The range that the content comes from at the source URL
1933    * (or null if none).
1934    */
1935  //protected Long[] sourceUrlOffsets;
1936
  /** The start of the range that the content comes from at the source URL
    * (or null if none).
    */
  protected Long sourceUrlStartOffset;

  /** The end of the range that the content comes from at the source URL
    * (or null if none).
    */
  protected Long sourceUrlEndOffset;

  /** The default annotation set */
  protected AnnotationSet defaultAnnots;

  /** Named sets of annotations. May be null: the methods of this class
    * check for null before using it.
    */
  protected Map namedAnnotSets;

  /**
   * A property of the document that will be set when the user
   * wants to create the document from a string, as opposed to from
   * a URL.
   */
  private String stringContent;
1959
1960  /**
1961   * The stringContent of a document is
1962   * a property of the document that will be set when the user
1963   * wants to create the document from a string, as opposed to from
1964   * a URL.
1965   * <B>Use the <TT>getContent</TT> method instead to get the actual document
1966   * content.</B>
1967   */
1968  public String getStringContent() { return stringContent; }
1969
1970  /**
1971   * The stringContent of a document is
1972   * a property of the document that will be set when the user
1973   * wants to create the document from a string, as opposed to from
1974   * a URL.
1975   * <B>Use the <TT>setContent</TT> method instead to update the actual
1976   * document content.</B>
1977   */
1978  public void setStringContent(String stringContent) {
1979    this.stringContent = stringContent;
1980  } // set StringContent
1981
  /** Is the document markup-aware? Initialised to false. */
  protected Boolean markupAware = new Boolean(false);
1984
1985//  /** Hash code */
1986//  public int hashCode() {
1987//    int code = getContent().hashCode();
1988//    int memberCode = (defaultAnnots == null) ? 0 : defaultAnnots.hashCode();
1989//    code += memberCode;
1990//    memberCode = (encoding == null) ? 0 : encoding.hashCode();
1991//    code += memberCode;
1992//    memberCode = (features == null) ? 0 : features.hashCode();
1993//    code += memberCode;
1994//    code += (markupAware.booleanValue()) ? 0 : 1;
1995//    memberCode = (namedAnnotSets == null) ? 0 : namedAnnotSets.hashCode();
1996//    code += memberCode;
1997//    code += nextAnnotationId;
1998//    code += nextNodeId;
1999//    memberCode = (sourceUrl == null) ? 0 : sourceUrl.hashCode();
2000//    code += memberCode;
2001//    memberCode =
2002//      (sourceUrlStartOffset == null) ? 0 : sourceUrlStartOffset.hashCode();
2003//    code += memberCode;
2004//    memberCode =
2005//      (sourceUrlEndOffset == null) ? 0 : sourceUrlEndOffset.hashCode();
2006//    code += memberCode;
2007//    return code;
2008//  } // hashcode
2009
2010  /** String respresentation */
2011  public String toString() {
2012    String n = Strings.getNl();
2013    StringBuffer s = new StringBuffer("DocumentImpl: " + n);
2014    s.append("  content:" + content + n);
2015    s.append("  defaultAnnots:" + defaultAnnots + n);
2016    s.append("  encoding:" + encoding + n);
2017    s.append("  features:" + features + n);
2018    s.append("  markupAware:" + markupAware + n);
2019    s.append("  namedAnnotSets:" + namedAnnotSets + n);
2020    s.append("  nextAnnotationId:" + nextAnnotationId + n);
2021    s.append("  nextNodeId:" + nextNodeId + n);
2022    s.append("  sourceUrl:" + sourceUrl + n);
2023    s.append("  sourceUrlStartOffset:" + sourceUrlStartOffset + n);
2024    s.append("  sourceUrlEndOffset:" + sourceUrlEndOffset + n);
2025    s.append(n);
2026
2027    return s.toString();
2028  } // toString
2029
   /** Freeze the serialization UID: the value is declared explicitly here
     * and must not be changed.
     */
  static final long serialVersionUID = -8456893608311510260L;
2032
2033  /** Inner class needed to compare annotations*/
2034  class AnnotationComparator implements java.util.Comparator {
2035    int orderOn = -1;
2036    int orderType = ASC;
2037    /** Constructs a comparator according to one of three sorter types:
2038      * ORDER_ON_ANNOT_TYPE, ORDER_ON_END_OFFSET, ORDER_ON_START_OFFSET
2039      */
2040      public AnnotationComparator(int anOrderOn, int anOrderType){
2041        orderOn = anOrderOn;
2042        orderType = anOrderType;
2043      }// AnnotationComparator()
2044
2045      /**This method must be implemented according to Comparator interface */
2046      public int compare(Object o1, Object o2){
2047        Annotation a1 = (Annotation) o1;
2048        Annotation a2 = (Annotation) o2;
2049        // ORDER_ON_START_OFFSET ?
2050        if (orderOn == ORDER_ON_START_OFFSET){
2051          int result = a1.getStartNode().getOffset().compareTo(
2052                                                a2.getStartNode().getOffset());
2053          if (orderType == ASC){
2054            // ASC
2055            // If they are equal then their ID will decide.
2056            if (result == 0)
2057              return a1.getId().compareTo(a2.getId());
2058            return result;
2059          }else{
2060            // DESC
2061            if (result == 0)
2062              return - (a1.getId().compareTo(a2.getId()));
2063            return -result;
2064          }// End if (orderType == ASC)
2065        }// End if (orderOn == ORDER_ON_START_OFFSET)
2066
2067        // ORDER_ON_END_OFFSET ?
2068        if (orderOn == ORDER_ON_END_OFFSET){
2069          int result = a1.getEndNode().getOffset().compareTo(
2070                                                a2.getEndNode().getOffset());
2071          if (orderType == ASC){
2072            // ASC
2073            // If they are equal then their ID will decide.
2074            if (result == 0)
2075              return - (a1.getId().compareTo(a2.getId()));
2076            return result;
2077          }else{
2078            // DESC
2079            // If they are equal then their ID will decide.
2080            if (result == 0)
2081              return a1.getId().compareTo(a2.getId());
2082            return - result;
2083          }// End if (orderType == ASC)
2084        }// End if (orderOn == ORDER_ON_END_OFFSET)
2085
2086        // ORDER_ON_ANNOT_ID ?
2087        if (orderOn == ORDER_ON_ANNOT_ID){
2088          if (orderType == ASC)
2089            return a1.getId().compareTo(a2.getId());
2090          else
2091            return -(a1.getId().compareTo(a2.getId()));
2092        }// End if
2093        return 0;
2094      }//compare()
2095  } // End inner class AnnotationComparator
2096
2097
  /** The registered DocumentListeners. Null until a listener is added;
    * addDocumentListener and removeDocumentListener replace the whole
    * vector with a modified clone instead of changing it in place.
    */
  private transient Vector documentListeners;
  // NOTE(review): gateListeners is not referenced in the part of the class
  // visible here - confirm whether it is still used.
  private transient Vector gateListeners;
2100
2101  public synchronized void removeDocumentListener(DocumentListener l) {
2102    if (documentListeners != null && documentListeners.contains(l)) {
2103      Vector v = (Vector) documentListeners.clone();
2104      v.removeElement(l);
2105      documentListeners = v;
2106    }
2107  }
2108  public synchronized void addDocumentListener(DocumentListener l) {
2109    Vector v = documentListeners == null ? new Vector(2) : (Vector) documentListeners.clone();
2110    if (!v.contains(l)) {
2111      v.addElement(l);
2112      documentListeners = v;
2113    }
2114  }
2115
2116  protected void fireAnnotationSetAdded(DocumentEvent e) {
2117    if (documentListeners != null) {
2118      Vector listeners = documentListeners;
2119      int count = listeners.size();
2120      for (int i = 0; i < count; i++) {
2121        ((DocumentListener) listeners.elementAt(i)).annotationSetAdded(e);
2122      }
2123    }
2124  }
2125
2126  protected void fireAnnotationSetRemoved(DocumentEvent e) {
2127    if (documentListeners != null) {
2128      Vector listeners = documentListeners;
2129      int count = listeners.size();
2130      for (int i = 0; i < count; i++) {
2131        ((DocumentListener) listeners.elementAt(i)).annotationSetRemoved(e);
2132      }
2133    }
2134  }
  /** Callback taking a CreoleEvent; this implementation does nothing. */
  public void resourceLoaded(CreoleEvent e) {
  }
  /** Callback taking a CreoleEvent; this implementation does nothing. */
  public void resourceUnloaded(CreoleEvent e) {
  }
  /** Callback taking a CreoleEvent; this implementation does nothing. */
  public void datastoreOpened(CreoleEvent e) {
  }
  /** Callback taking a CreoleEvent; this implementation does nothing. */
  public void datastoreCreated(CreoleEvent e) {
  }
  /** Callback taking a resource together with an old and a new name; this
    * implementation does nothing.
    */
  public void resourceRenamed(Resource resource, String oldName,
                              String newName){
  }
2146  public void datastoreClosed(CreoleEvent e) {
2147    if (! e.getDatastore().equals(this.getDataStore()))
2148      return;
2149    //close this lr, since it cannot stay open when the DS it comes from
2150    //is closed
2151    Factory.deleteResource(this);
2152  }
  /** Passes the persistence ID on to the superclass and then registers
    * this document as a listener with the creole register.
    * @param lrID the persistence ID handed to the superclass
    */
  public void setLRPersistenceId(Object lrID) {
    super.setLRPersistenceId( lrID);
    //make persistent documents listen to the creole register
    //for events about their DS
    Gate.getCreoleRegister().addCreoleListener(this);
  }
  /** Callback taking a DatastoreEvent; this implementation does nothing. */
  public void resourceAdopted(DatastoreEvent evt) {
  }
2161  public void resourceDeleted(DatastoreEvent evt) {
2162    if(! evt.getSource().equals(this.getDataStore()))
2163      return;
2164    //if an open document is deleted from a DS, then
2165    //it must close itself immediately, as is no longer valid
2166    if(evt.getResourceID().equals(this.getLRPersistenceId()))
2167      Factory.deleteResource(this);
2168  }
  /** Callback taking a DatastoreEvent; this implementation does nothing. */
  public void resourceWritten(DatastoreEvent evt) {
  }
  /** Passes the datastore on to the superclass and then, if the dataStore
    * field is non-null afterwards, registers this document with it as a
    * datastore listener.
    * @param dataStore the datastore handed to the superclass
    * @throws gate.persist.PersistenceException declared by this method;
    * presumably raised by the superclass call - confirm
    */
  public void setDataStore(DataStore dataStore) throws gate.persist.PersistenceException {
    super.setDataStore( dataStore);
    // Only listen when a datastore is actually set
    if (this.dataStore != null)
      this.dataStore.addDatastoreListener(this);
  }
2176
2177} // class DocumentImpl
2178