|
DocumentImpl |
|
1 /* 2 * DocumentImpl.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Hamish Cunningham, 11/Feb/2000 12 * 13 * $Id: DocumentImpl.java,v 1.124 2003/07/29 11:36:02 valyt Exp $ 14 */ 15 16 package gate.corpora; 17 18 import java.util.*; 19 import java.net.*; 20 import java.io.*; 21 22 import gate.*; 23 import gate.annotation.*; 24 import gate.util.*; 25 import gate.creole.*; 26 import gate.gui.*; 27 import gate.event.*; 28 29 /** Represents the commonalities between all sorts of documents. 30 * 31 * <H2>Editing</H2> 32 * 33 * <P> 34 * The DocumentImpl class implements the Document interface. 35 * The DocumentContentImpl class models the textual or audio-visual 36 * materials which are the source and content of Documents. 37 * The AnnotationSetImpl class supplies annotations on Documents. 38 * 39 * <P> 40 * Abbreviations: 41 * 42 * <UL> 43 * <LI> 44 * DC = DocumentContent 45 * <LI> 46 * D = Document 47 * <LI> 48 * AS = AnnotationSet 49 * </UL> 50 * 51 * <P> 52 * We add an edit method to each of these classes; for DC and AS 53 * the methods are package private; D has the public method. 54 * 55 * <PRE> 56 * void edit(Long start, Long end, DocumentContent replacement) 57 * throws InvalidOffsetException; 58 * </PRE> 59 * 60 * <P> 61 * D receives edit requests and forwards them to DC and AS. 62 * On DC, this method makes a change to the content - e.g. replacing 63 * a String range from start to end with replacement. (Deletions 64 * are catered for by having replacement = null.) D then calls 65 * AS.edit on each of its annotation sets. 66 * 67 * <P> 68 * On AS, edit calls replacement.size() (i.e. DC.size()) to 69 * figure out how long the replacement is (0 for null). It then 70 * considers annotations that terminate (start or end) in 71 * the altered or deleted range as invalid; annotations that 72 * terminate after the range have their offsets adjusted. 73 * I.e.: 74 * <UL> 75 * <LI> 76 * the nodes that pointed inside the old modified area are invalid now and 77 * will be deleted along with the connected annotations; 78 * <LI> 79 * the nodes that are before the start of the modified area remain 80 * untouched; 81 * <LI> 82 * the nodes that are after the end of the affected area will have the 83 * offset changed according to the formula below. 84 * </UL> 85 * 86 * <P> 87 * A note re. AS and annotations: annotations no longer have 88 * offsets as in the old model, they now have nodes, and nodes 89 * have offsets. 90 * 91 * <P> 92 * To implement AS.edit, we have several indices: 93 * <PRE> 94 * HashMap annotsByStartNode, annotsByEndNode; 95 * </PRE> 96 * which map node ids to annotations; 97 * <PRE> 98 * RBTreeMap nodesByOffset; 99 * </PRE> 100 * which maps offset to Nodes. 101 * 102 * <P> 103 * When we get an edit request, we traverse that part of the 104 * nodesByOffset tree representing the altered or deleted 105 * range of the DC. For each node found, we delete any annotations 106 * that terminate on the node, and then delete the node itself. 107 * We then traverse the rest of the tree, changing the offset 108 * on all remaining nodes by: 109 * <PRE> 110 * newOffset = 111 * oldOffset - 112 * ( 113 * (end - start) - // size of mod 114 * ( (replacement == null) ? 0 : replacement.size() ) // size of repl 115 * ); 116 * </PRE> 117 * Note that we use the same convention as e.g. java.lang.String: start 118 * offsets are inclusive; end offsets are exclusive. I.e. for string "abcd" 119 * range 1-3 = "bc". Examples, for a node with offset 4: 120 * <PRE> 121 * edit(1, 3, "BC"); 122 * newOffset = 4 - ( (3 - 1) - 2 ) = 4 123 * 124 * edit(1, 3, null); 125 * newOffset = 4 - ( (3 - 1) - 0 ) = 2 126 * 127 * edit(1, 3, "BBCC"); 128 * newOffset = 4 - ( (3 - 1) - 4 ) = 6 129 * </PRE> 130 */ 131 public class DocumentImpl 132 extends AbstractLanguageResource implements TextualDocument, CreoleListener, 133 DatastoreListener { 134 /** Debug flag */ 135 private static final boolean DEBUG = false; 136 137 /** If you set this flag to true the original content of the document will 138 * be kept in the document feature. <br> 139 * Default value is false to avoid the unnecessary waste of memory */ 140 private Boolean preserveOriginalContent = new Boolean(false); 141 142 /** If you set this flag to true the repositioning information for 143 * the document will be kept in the document feature. <br> 144 * Default value is false to avoid the unnecessary waste of time and memory 145 */ 146 private Boolean collectRepositioningInfo = new Boolean(false); 147 148 /** 149 * This is a variable which contains the latest crossed over annotation 150 * found during export with preserving format, i.e., toXml(annotations) 151 * method. 152 */ 153 private Annotation crossedOverAnnotation = null; 154 155 /** Default construction. Content left empty. */ 156 public DocumentImpl() { 157 content = new DocumentContentImpl(); 158 } // default construction 159 160 /** Initialise this resource, and return it. */ 161 public Resource init() throws ResourceInstantiationException { 162 // set up the source URL and create the content 163 if(sourceUrl == null) { 164 if(stringContent == null) { 165 throw new ResourceInstantiationException( 166 "The sourceURL and document's content were null." 167 ); 168 } 169 170 content = new DocumentContentImpl(stringContent); 171 getFeatures().put("gate.SourceURL", "created from String"); 172 } else { 173 try { 174 content = new DocumentContentImpl( 175 sourceUrl, getEncoding(), sourceUrlStartOffset, sourceUrlEndOffset); 176 getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm()); 177 } catch(IOException e) { 178 e.printStackTrace(); 179 throw new ResourceInstantiationException("DocumentImpl.init: " + e); 180 } 181 182 if(preserveOriginalContent.booleanValue() && content != null) { 183 String originalContent = new String( 184 ((DocumentContentImpl) content).getOriginalContent()); 185 getFeatures().put(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME, 186 originalContent); 187 } // if 188 } 189 190 // set up a DocumentFormat if markup unpacking required 191 if(getMarkupAware().booleanValue()) { 192 DocumentFormat docFormat = 193 DocumentFormat.getDocumentFormat(this, sourceUrl); 194 try { 195 if(docFormat != null){ 196 StatusListener sListener = (StatusListener) 197 gate.gui.MainFrame.getListeners(). 198 get("gate.event.StatusListener"); 199 if(sListener != null) docFormat.addStatusListener(sListener); 200 201 // set the flag if true and if the document format support collecting 202 docFormat.setShouldCollectRepositioning(collectRepositioningInfo); 203 204 if(docFormat.getShouldCollectRepositioning().booleanValue()) { 205 // unpack with collectiong of repositioning information 206 RepositioningInfo info = new RepositioningInfo(); 207 208 String origContent = (String) getFeatures().get( 209 GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME); 210 211 RepositioningInfo ampCodingInfo = new RepositioningInfo(); 212 if(origContent != null) { 213 boolean shouldCorrectCR = docFormat instanceof XmlDocumentFormat; 214 collectInformationForAmpCodding(origContent, ampCodingInfo, 215 shouldCorrectCR); 216 if(docFormat instanceof HtmlDocumentFormat) { 217 collectInformationForWS(origContent, ampCodingInfo); 218 } // if 219 } // if 220 221 docFormat.unpackMarkup(this, info, ampCodingInfo); 222 223 if(origContent != null 224 && docFormat instanceof XmlDocumentFormat) { 225 // CRLF correction of RepositioningInfo 226 correctRepositioningForCRLFInXML(origContent, info); 227 } // if 228 229 getFeatures().put( 230 GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME, info); 231 } 232 else { 233 // normal old fashioned unpack 234 docFormat.unpackMarkup(this); 235 } 236 docFormat.removeStatusListener(sListener); 237 } //if format != null 238 } catch(DocumentFormatException e) { 239 throw new ResourceInstantiationException( 240 "Couldn't unpack markup in document " + sourceUrl.toExternalForm() + 241 " " + e 242 ); 243 } 244 } // if markup aware 245 246 //try{ 247 // FileWriter fw = new FileWriter("d:/temp/doccontent.txt"); 248 // fw.write(getContent().toString()); 249 // fw.flush(); 250 // fw.close(); 251 //}catch(IOException ioe){ 252 // ioe.printStackTrace(); 253 //} 254 255 return this; 256 } // init() 257 258 /** 259 * Correct repositioning information for substitution of "\r\n" with "\n" 260 */ 261 private void correctRepositioningForCRLFInXML(String content, 262 RepositioningInfo info) { 263 int index = -1; 264 265 do { 266 index = content.indexOf("\r\n", index+1); 267 if(index != -1) { 268 info.correctInformationOriginalMove(index, 1); 269 } // if 270 } while(index != -1); 271 } // correctRepositioningForCRLF 272 273 /** 274 * Collect information for substitution of "&xxx;" with "y" 275 * 276 * It couldn't be collected a position information about 277 * some unicode and &-coded symbols during parsing. The parser "hide" the 278 * information about the position of such kind of parsed text. 279 * So, there is minimal chance to have &-coded symbol inside the covered by 280 * repositioning records area. The new record should be created for every 281 * coded symbol outside the existing records. 282 * <BR> 283 * If <code>shouldCorrectCR</code> flag is <code>true</code> the correction 284 * for CRLF substitution is performed. 285 */ 286 private void collectInformationForAmpCodding(String content, 287 RepositioningInfo info, 288 boolean shouldCorrectCR) { 289 290 if(content == null || info == null) return; 291 292 int ampIndex = -1; 293 int semiIndex; 294 295 do { 296 ampIndex = content.indexOf('&', ampIndex+1); 297 if(ampIndex != -1) { 298 semiIndex = content.indexOf(';', ampIndex+1); 299 // have semicolon and it is near enough for amp codding 300 if(semiIndex != -1 && (semiIndex-ampIndex) < 8) { 301 info.addPositionInfo(ampIndex, semiIndex-ampIndex+1, 0, 1); 302 } 303 else { 304 // no semicolon or it is too far 305 // analyse for amp codding without semicolon 306 int maxEnd = Math.min(ampIndex+8, content.length()); 307 String ampCandidate = content.substring(ampIndex, maxEnd); 308 int ampCodingSize = analyseAmpCodding(ampCandidate); 309 310 if(ampCodingSize != -1) { 311 info.addPositionInfo(ampIndex, ampCodingSize, 0, 1); 312 } // if 313 314 } // if - semicolon found 315 } // if - ampersand found 316 } while (ampIndex != -1); 317 318 // correct the collected information to adjust it's positions 319 // with reported by the parser 320 int index = -1; 321 322 if(shouldCorrectCR) { 323 do { 324 index = content.indexOf("\r\n", index+1); 325 if(index != -1) { 326 info.correctInformationOriginalMove(index, -1); 327 } // if 328 } while(index != -1); 329 } // if 330 } // collectInformationForAmpCodding 331 332 /** 333 * This function compute size of the ampersand codded sequence when 334 * semicolin is not present. 335 */ 336 private int analyseAmpCodding(String content) { 337 int result = -1; 338 339 try { 340 char ch = content.charAt(1); 341 342 switch(ch) { 343 case 'l' : // < 344 case 'L' : // < 345 if(content.charAt(2) == 't' || content.charAt(2) == 'T') { 346 result = 3; 347 } // if 348 break; 349 case 'g' : // > 350 case 'G' : // > 351 if(content.charAt(2) == 't' || content.charAt(2) == 'T') { 352 result = 3; 353 } // if 354 break; 355 case 'a' : // & 356 case 'A' : // & 357 if(content.substring(2, 4).equalsIgnoreCase("mp")) { 358 result = 4; 359 } // if 360 break; 361 case 'q' : // " 362 case 'Q' : // " 363 if(content.substring(2, 5).equalsIgnoreCase("uot")) { 364 result = 5; 365 } // if 366 break; 367 case '#' : // #number (example ‘, 䰸) 368 int endIndex = 2; 369 boolean hexCoded = false; 370 if(content.charAt(2) == 'x' || content.charAt(2) == 'X') { 371 // Hex codding 372 ++endIndex; 373 hexCoded = true; 374 } // if 375 376 while (endIndex < 8 377 && isNumber(content.charAt(endIndex), hexCoded) ) { 378 ++endIndex; 379 } // while 380 result = endIndex; 381 break; 382 } // switch 383 } catch (StringIndexOutOfBoundsException ex) { 384 // do nothing 385 } // catch 386 387 return result; 388 } // analyseAmpCodding 389 390 /** Check for numeric range. If hex is true the A..F range is included */ 391 private boolean isNumber(char ch, boolean hex) { 392 if(ch >= '0' && ch <= '9') return true; 393 394 if(hex) { 395 if(ch >= 'A' && ch <= 'F') return true; 396 if(ch >= 'a' && ch <= 'f') return true; 397 } // if 398 399 return false; 400 } // isNumber 401 402 /** HTML parser perform substitution of multiple whitespaces (WS) with 403 * a single WS. To create correct repositioning information structure we 404 * should keep the information for such multiple WS. 405 * <BR> 406 * The criteria for WS is <code>(ch <= ' ')</code>. 407 */ 408 private void collectInformationForWS(String content, RepositioningInfo info) { 409 410 if(content == null || info == null) return; 411 412 // analyse the content and correct the repositioning information 413 char ch; 414 int startWS, endWS; 415 416 startWS = endWS = -1; 417 int contentLength = content.length(); 418 419 for(int i=0; i<contentLength; ++i) { 420 ch = content.charAt(i); 421 422 // is whitespace 423 if(ch <= ' ') { 424 if(startWS == -1) { 425 startWS = i; 426 } // if 427 endWS = i; 428 } 429 else { 430 if(endWS - startWS > 0) { 431 // put the repositioning information about the WS substitution 432 info.addPositionInfo( 433 (long)startWS, (long)(endWS - startWS + 1), 0, 1); 434 } // if 435 // clear positions 436 startWS = endWS = -1; 437 }// if 438 } // for 439 } // collectInformationForWS 440 441 /** Clear all the data members of the object. */ 442 public void cleanup() { 443 444 defaultAnnots = null; 445 if ( (namedAnnotSets != null) && (!namedAnnotSets.isEmpty())) 446 namedAnnotSets.clear(); 447 if (DEBUG) Out.prln("Document cleanup called"); 448 if (this.lrPersistentId != null) 449 Gate.getCreoleRegister().removeCreoleListener(this); 450 if(this.getDataStore() != null) 451 this.getDataStore().removeDatastoreListener(this); 452 } // cleanup() 453 454 455 /** Documents are identified by URLs */ 456 public URL getSourceUrl() { return sourceUrl; } 457 458 /** Set method for the document's URL */ 459 public void setSourceUrl(URL sourceUrl) { 460 this.sourceUrl = sourceUrl; 461 } // setSourceUrl 462 463 /** Documents may be packed within files; in this case an optional pair of 464 * offsets refer to the location of the document. 465 */ 466 public Long[] getSourceUrlOffsets() { 467 Long[] sourceUrlOffsets = new Long[2]; 468 sourceUrlOffsets[0] = sourceUrlStartOffset; 469 sourceUrlOffsets[1] = sourceUrlEndOffset; 470 return sourceUrlOffsets; 471 } // getSourceUrlOffsets 472 473 /** 474 * Allow/disallow preserving of the original document content. 475 * If is <B>true</B> the original content will be retrieved from 476 * the DocumentContent object and preserved as document feature. 477 */ 478 public void setPreserveOriginalContent(Boolean b) { 479 preserveOriginalContent = b; 480 } // setPreserveOriginalContent 481 482 /** Get the preserving of content status of the Document. 483 * 484 * @return whether the Document should preserve it's original content. 485 */ 486 public Boolean getPreserveOriginalContent() { 487 return preserveOriginalContent; 488 } // getPreserveOriginalContent 489 490 /** 491 * Allow/disallow collecting of repositioning information. 492 * If is <B>true</B> information will be retrieved and preserved 493 * as document feature.<BR> 494 * Preserving of repositioning information give the possibilities 495 * for converting of coordinates between the original document content and 496 * extracted from the document text. 497 */ 498 public void setCollectRepositioningInfo(Boolean b) { 499 collectRepositioningInfo = b; 500 } // setCollectRepositioningInfo 501 502 /** Get the collectiong and preserving of repositioning information 503 * for the Document. <BR> 504 * Preserving of repositioning information give the possibilities 505 * for converting of coordinates between the original document content and 506 * extracted from the document text. 507 * 508 * @return whether the Document should collect and preserve information. 509 */ 510 public Boolean getCollectRepositioningInfo() { 511 return collectRepositioningInfo; 512 } // getCollectRepositioningInfo 513 514 /** Documents may be packed within files; in this case an optional pair of 515 * offsets refer to the location of the document. This method gets the 516 * start offset. 517 */ 518 public Long getSourceUrlStartOffset() { return sourceUrlStartOffset; } 519 520 /** Documents may be packed within files; in this case an optional pair of 521 * offsets refer to the location of the document. This method sets the 522 * start offset. 523 */ 524 public void setSourceUrlStartOffset(Long sourceUrlStartOffset) { 525 this.sourceUrlStartOffset = sourceUrlStartOffset; 526 } // setSourceUrlStartOffset 527 528 /** Documents may be packed within files; in this case an optional pair of 529 * offsets refer to the location of the document. This method gets the 530 * end offset. 531 */ 532 public Long getSourceUrlEndOffset() { return sourceUrlEndOffset; } 533 534 /** Documents may be packed within files; in this case an optional pair of 535 * offsets refer to the location of the document. This method sets the 536 * end offset. 537 */ 538 public void setSourceUrlEndOffset(Long sourceUrlEndOffset) { 539 this.sourceUrlEndOffset = sourceUrlEndOffset; 540 } // setSourceUrlStartOffset 541 542 /** The content of the document: a String for text; MPEG for video; etc. */ 543 public DocumentContent getContent() { return content; } 544 545 /** Set method for the document content */ 546 public void setContent(DocumentContent content) { this.content = content; } 547 548 /** Get the encoding of the document content source */ 549 public String getEncoding() { 550 //we need to make sure we ALWAYS have an encoding 551 if(encoding == null || encoding.trim().length() == 0){ 552 //no encoding definded: use the platform default 553 encoding = java.nio.charset.Charset.forName( 554 System.getProperty("file.encoding")).name(); 555 } 556 return encoding; 557 } 558 559 /** Set the encoding of the document content source */ 560 public void setEncoding(String encoding) { this.encoding = encoding; } 561 562 /** Get the default set of annotations. The set is created if it 563 * doesn't exist yet. 564 */ 565 public AnnotationSet getAnnotations() { 566 if(defaultAnnots == null){ 567 defaultAnnots = new AnnotationSetImpl(this); 568 fireAnnotationSetAdded(new DocumentEvent( 569 this, DocumentEvent.ANNOTATION_SET_ADDED, null)); 570 }//if 571 return defaultAnnots; 572 } // getAnnotations() 573 574 /** Get a named set of annotations. Creates a new set if one with this 575 * name doesn't exist yet. 576 * If the provided name is null then it returns the default annotation set. 577 */ 578 public AnnotationSet getAnnotations(String name) { 579 if(name == null) return getAnnotations(); 580 if(namedAnnotSets == null) 581 namedAnnotSets = new HashMap(); 582 AnnotationSet namedSet = (AnnotationSet) namedAnnotSets.get(name); 583 584 if(namedSet == null) { 585 namedSet = new AnnotationSetImpl(this, name); 586 namedAnnotSets.put(name, namedSet); 587 588 DocumentEvent evt = new DocumentEvent( 589 this, DocumentEvent.ANNOTATION_SET_ADDED, name 590 ); 591 fireAnnotationSetAdded(evt); 592 } 593 return namedSet; 594 } // getAnnotations(name) 595 596 /** Make the document markup-aware. This will trigger the creation 597 * of a DocumentFormat object at Document initialisation time; the 598 * DocumentFormat object will unpack the markup in the Document and 599 * add it as annotations. Documents are <B>not</B> markup-aware by default. 600 * 601 * @param b markup awareness status. 602 */ 603 public void setMarkupAware(Boolean newMarkupAware) { 604 this.markupAware = newMarkupAware; 605 } 606 607 /** Get the markup awareness status of the Document. 608 * <B>Documents are markup-aware by default.</B> 609 * @return whether the Document is markup aware. 610 */ 611 public Boolean getMarkupAware() { return markupAware; } 612 613 /** Returns an XML document aming to preserve the original markups( 614 * the original markup will be in the same place and format as it was 615 * before processing the document) and include (if possible) 616 * the annotations specified in the aSourceAnnotationSet. 617 * It is equivalent to toXml(aSourceAnnotationSet, true). 618 */ 619 public String toXml(Set aSourceAnnotationSet){ 620 return toXml(aSourceAnnotationSet, true); 621 } 622 623 /** Returns an XML document aming to preserve the original markups( 624 * the original markup will be in the same place and format as it was 625 * before processing the document) and include (if possible) 626 * the annotations specified in the aSourceAnnotationSet. 627 * <b>Warning:</b> Annotations from the aSourceAnnotationSet will be lost 628 * if they will cause a crosed over situation. 629 * @param aSourceAnnotationSet is an annotation set containing all the 630 * annotations that will be combined with the original marup set. If the 631 * param is <code>null</code> it will only dump the original markups. 632 * @param includeFeatures is a boolean that controls whether the annotation 633 * features should be included or not. If false, only the annotation type 634 * is included in the tag. 635 * @return a string representing an XML document containing the original 636 * markup + dumped annotations form the aSourceAnnotationSet 637 */ 638 public String toXml(Set aSourceAnnotationSet, boolean includeFeatures){ 639 640 if(hasOriginalContentFeatures()) { 641 return saveAnnotationSetAsXmlInOrig(aSourceAnnotationSet,includeFeatures); 642 } // if 643 644 AnnotationSet originalMarkupsAnnotSet = 645 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 646 647 // Create a dumping annotation set on the document. It will be used for 648 // dumping annotations... 649 // AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this); 650 List dumpingList = new ArrayList(originalMarkupsAnnotSet.size()); 651 652 // This set will be constructed inside this method. If is not empty, the 653 // annotation contained will be lost. 654 /* if (!dumpingSet.isEmpty()){ 655 Out.prln("WARNING: The dumping annotation set was not empty."+ 656 "All annotation it contained were lost."); 657 dumpingSet.clear(); 658 }// End if 659 */ 660 StatusListener sListener = (StatusListener) 661 gate.gui.MainFrame.getListeners(). 662 get("gate.event.StatusListener"); 663 // Construct the dumping set in that way that all annotations will verify 664 // the condition that there are not annotations which are crossed. 665 // First add all annotation from the original markups 666 if(sListener != null) 667 sListener.statusChanged("Constructing the dumping annotation set."); 668 // dumpingSet.addAll(originalMarkupsAnnotSet); 669 dumpingList.addAll(originalMarkupsAnnotSet); 670 // Then take all the annotations from aSourceAnnotationSet and verify if 671 // they can be inserted safely into the dumpingSet. Where not possible, 672 // report. 673 if (aSourceAnnotationSet != null){ 674 Iterator iter = aSourceAnnotationSet.iterator(); 675 while (iter.hasNext()){ 676 Annotation currentAnnot = (Annotation) iter.next(); 677 if(insertsSafety(dumpingList,currentAnnot)){ 678 // dumpingSet.add(currentAnnot); 679 dumpingList.add(currentAnnot); 680 }else if (crossedOverAnnotation != null && DEBUG){ 681 try { 682 Out.prln("Warning: Annotations were found to violate the " + 683 "crossed over condition: \n" + 684 "1. [" + 685 getContent().getContent( 686 crossedOverAnnotation.getStartNode().getOffset(), 687 crossedOverAnnotation.getEndNode().getOffset()) + 688 " (" + crossedOverAnnotation.getType() + ": " + 689 crossedOverAnnotation.getStartNode().getOffset() + 690 ";" + crossedOverAnnotation.getEndNode().getOffset() + 691 ")]\n" + 692 "2. [" + 693 getContent().getContent( 694 currentAnnot.getStartNode().getOffset(), 695 currentAnnot.getEndNode().getOffset()) + 696 " (" + currentAnnot.getType() + ": " + 697 currentAnnot.getStartNode().getOffset() + 698 ";" + currentAnnot.getEndNode().getOffset() + 699 ")]\nThe second one will be discarded.\n" ); 700 } catch (gate.util.InvalidOffsetException ex) { 701 throw new GateRuntimeException(ex.getMessage()); 702 } 703 }// End if 704 }// End while 705 }// End if 706 707 //kalina: order the dumping list by start offset 708 Collections.sort(dumpingList, new gate.util.OffsetComparator()); 709 710 // The dumpingSet is ready to be exported as XML 711 // Here we go. 712 if(sListener != null) sListener.statusChanged("Dumping annotations as XML"); 713 StringBuffer xmlDoc = new StringBuffer( 714 DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue())); 715 716 // Add xml header if original format was xml 717 String mimeType = getFeatures() == null ? 718 null : 719 (String)getFeatures().get("MimeType"); 720 boolean wasXML = mimeType != null && mimeType.equalsIgnoreCase("text/xml"); 721 722 if(wasXML){ 723 xmlDoc.append("<?xml version=\"1.0\" encoding=\""); 724 xmlDoc.append(getEncoding()); 725 xmlDoc.append("\" ?>"); 726 xmlDoc.append(Strings.getNl()); 727 }// ENd if 728 // Identify and extract the root annotation from the dumpingSet. 729 theRootAnnotation = identifyTheRootAnnotation(dumpingList); 730 // If a root annotation has been identified then add it eplicitley at the 731 // beginning of the document 732 if (theRootAnnotation != null){ 733 dumpingList.remove(theRootAnnotation); 734 xmlDoc.append(writeStartTag(theRootAnnotation,includeFeatures)); 735 }// End if 736 // Construct and append the rest of the document 737 xmlDoc.append(saveAnnotationSetAsXml(dumpingList, includeFeatures)); 738 // If a root annotation has been identified then add it eplicitley at the 739 // end of the document 740 if (theRootAnnotation != null){ 741 xmlDoc.append(writeEndTag(theRootAnnotation)); 742 }// End if 743 744 if(sListener != null) sListener.statusChanged("Done."); 745 return xmlDoc.toString(); 746 }//End toXml() 747 748 /** This method verifies if aSourceAnnotation can ve inserted safety into the 749 * aTargetAnnotSet. Safety means that it doesn't violate the crossed over 750 * contition with any annotation from the aTargetAnnotSet. 751 * @param aTargetAnnotSet the annotation set to include the aSourceAnnotation 752 * @param aSourceAnnotation the annotation to be inserted into the 753 * aTargetAnnotSet 754 * @return true if the annotation inserts safety, or false otherwise. 755 */ 756 private boolean insertsSafety(AnnotationSet aTargetAnnotSet, 757 Annotation aSourceAnnotation){ 758 759 if (aTargetAnnotSet == null || aSourceAnnotation == null) { 760 this.crossedOverAnnotation = null; 761 return false; 762 } 763 if (aSourceAnnotation.getStartNode() == null || 764 aSourceAnnotation.getStartNode().getOffset()== null) { 765 this.crossedOverAnnotation = null; 766 return false; 767 } 768 if (aSourceAnnotation.getEndNode() == null || 769 aSourceAnnotation.getEndNode().getOffset()== null) { 770 this.crossedOverAnnotation = null; 771 return false; 772 } 773 774 // Get the start and end offsets 775 Long start = aSourceAnnotation.getStartNode().getOffset(); 776 Long end = aSourceAnnotation.getEndNode().getOffset(); 777 // Read aSourceAnnotation offsets long 778 long s2 = start.longValue(); 779 long e2 = end.longValue(); 780 781 // Obtain a set with all annotations annotations that overlap 782 // totaly or partially with the interval defined by the two provided offsets 783 AnnotationSet as = aTargetAnnotSet.get(start,end); 784 785 // Investigate all the annotations from as to see if there is one that 786 // comes in conflict with aSourceAnnotation 787 Iterator it = as.iterator(); 788 while(it.hasNext()){ 789 Annotation ann = (Annotation) it.next(); 790 // Read ann offsets 791 long s1 = ann.getStartNode().getOffset().longValue(); 792 long e1 = ann.getEndNode().getOffset().longValue(); 793 794 if (s1<s2 && s2<e1 && e1<e2) { 795 this.crossedOverAnnotation = ann; 796 return false; 797 } 798 if (s2<s1 && s1<e2 && e2<e1) { 799 this.crossedOverAnnotation = ann; 800 return false; 801 } 802 }// End while 803 return true; 804 }// insertsSafety() 805 806 private boolean insertsSafety(List aTargetAnnotList, 807 Annotation aSourceAnnotation){ 808 809 if (aTargetAnnotList == null || aSourceAnnotation == null) { 810 this.crossedOverAnnotation = null; 811 return false; 812 } 813 if (aSourceAnnotation.getStartNode() == null || 814 aSourceAnnotation.getStartNode().getOffset()== null) { 815 this.crossedOverAnnotation = null; 816 return false; 817 } 818 if (aSourceAnnotation.getEndNode() == null || 819 aSourceAnnotation.getEndNode().getOffset()== null) { 820 this.crossedOverAnnotation = null; 821 return false; 822 } 823 824 // Get the start and end offsets 825 Long start = aSourceAnnotation.getStartNode().getOffset(); 826 Long end = aSourceAnnotation.getEndNode().getOffset(); 827 // Read aSourceAnnotation offsets long 828 long s2 = start.longValue(); 829 long e2 = end.longValue(); 830 831 // Obtain a set with all annotations annotations that overlap 832 // totaly or partially with the interval defined by the two provided offsets 833 List as = new ArrayList(); 834 for (int i=0; i < aTargetAnnotList.size(); i++) { 835 Annotation annot = (Annotation) aTargetAnnotList.get(i); 836 if (annot.getStartNode().getOffset().longValue() >= s2 837 && 838 annot.getStartNode().getOffset().longValue() <= e2) 839 as.add(annot); 840 else if (annot.getEndNode().getOffset().longValue() >= s2 841 && 842 annot.getEndNode().getOffset().longValue() <= e2) 843 as.add(annot); 844 } 845 846 // Investigate all the annotations from as to see if there is one that 847 // comes in conflict with aSourceAnnotation 848 Iterator it = as.iterator(); 849 while(it.hasNext()){ 850 Annotation ann = (Annotation) it.next(); 851 // Read ann offsets 852 long s1 = ann.getStartNode().getOffset().longValue(); 853 long e1 = ann.getEndNode().getOffset().longValue(); 854 855 if (s1<s2 && s2<e1 && e1<e2) { 856 this.crossedOverAnnotation = ann; 857 return false; 858 } 859 if (s2<s1 && s1<e2 && e2<e1) { 860 this.crossedOverAnnotation = ann; 861 return false; 862 } 863 }// End while 864 return true; 865 }// insertsSafety() 866 867 /** This method saves all the annotations from aDumpAnnotSet and combines 868 * them with the document content. 869 * @param aDumpAnnotationSet is a GATE annotation set prepared to be used 870 * on the raw text from document content. If aDumpAnnotSet is <b>null<b> 871 * then an empty string will be returned. 872 * @param includeFeatures is a boolean, which controls whether the annotation 873 * features and gate ID are included or not. 874 * @return The XML document obtained from raw text + the information from 875 * the dump annotation set. 876 */ 877 private String saveAnnotationSetAsXml(AnnotationSet aDumpAnnotSet, 878 boolean includeFeatures){ 879 String content = null; 880 if (this.getContent()== null) 881 content = new String(""); 882 else 883 content = this.getContent().toString(); 884 StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content)); 885 if (aDumpAnnotSet == null) return docContStrBuff.toString(); 886 887 TreeMap offsets2CharsMap = new TreeMap(); 888 if (this.getContent().size().longValue() != 0){ 889 // Fill the offsets2CharsMap with all the indices where 890 // special chars appear 891 buildEntityMapFromString(content,offsets2CharsMap); 892 }//End if 893 // The saving alghorithm is as follows: 894 /////////////////////////////////////////// 895 // Construct a set of annot with all IDs in asc order. 896 // All annotations that end at that offset swap their place in descending 897 // order. For each node write all the tags from left to right. 898 899 // Construct the node set 900 TreeSet offsets = new TreeSet(); 901 Iterator iter = aDumpAnnotSet.iterator(); 902 while (iter.hasNext()){ 903 Annotation annot = (Annotation) iter.next(); 904 offsets.add(annot.getStartNode().getOffset()); 905 offsets.add(annot.getEndNode().getOffset()); 906 }// End while 907 908 // ofsets is sorted in ascending order. 909 // Iterate this set in descending order and remove an offset at each 910 // iteration 911 while (!offsets.isEmpty()){ 912 Long offset = (Long)offsets.last(); 913 // Remove the offset from the set 914 offsets.remove(offset); 915 // Now, use it. 916 // Returns a list with annotations that needs to be serialized in that 917 // offset. 918 List annotations = getAnnotationsForOffset(aDumpAnnotSet,offset); 919 // Attention: the annotation are serialized from left to right 920 // StringBuffer tmpBuff = new StringBuffer(""); 921 StringBuffer tmpBuff = new StringBuffer( 922 DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue())); 923 Stack stack = new Stack(); 924 // Iterate through all these annotations and serialize them 925 Iterator it = annotations.iterator(); 926 while(it.hasNext()){ 927 Annotation a = (Annotation) it.next(); 928 it.remove(); 929 // Test if a Ends at offset 930 if ( offset.equals(a.getEndNode().getOffset()) ){ 931 // Test if a Starts at offset 932 if ( offset.equals(a.getStartNode().getOffset()) ){ 933 // Here, the annotation a Starts and Ends at the offset 934 if ( null != a.getFeatures().get("isEmptyAndSpan") && 935 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){ 936 937 // Assert: annotation a with start == end and isEmptyAndSpan 938 tmpBuff.append(writeStartTag(a, includeFeatures)); 939 stack.push(a); 940 }else{ 941 // Assert annotation a with start == end and an empty tag 942 tmpBuff.append(writeEmptyTag(a)); 943 // The annotation is removed from dumped set 944 aDumpAnnotSet.remove(a); 945 }// End if 946 }else{ 947 // Here the annotation a Ends at the offset. 948 // In this case empty the stack and write the end tag 949 if (!stack.isEmpty()){ 950 while(!stack.isEmpty()){ 951 Annotation a1 = (Annotation)stack.pop(); 952 tmpBuff.append(writeEndTag(a1)); 953 }// End while 954 }// End if 955 tmpBuff.append(writeEndTag(a)); 956 }// End if 957 }else{ 958 // The annotation a does NOT end at the offset. Let's see if it starts 959 // at the offset 960 if ( offset.equals(a.getStartNode().getOffset()) ){ 961 // The annotation a starts at the offset. 962 // In this case empty the stack and write the end tag 963 if (!stack.isEmpty()){ 964 while(!stack.isEmpty()){ 965 Annotation a1 = (Annotation)stack.pop(); 966 tmpBuff.append(writeEndTag(a1)); 967 }// End while 968 }// End if 969 tmpBuff.append(writeStartTag(a, includeFeatures)); 970 // The annotation is removed from dumped set 971 aDumpAnnotSet.remove(a); 972 }// End if ( offset.equals(a.getStartNode().getOffset()) ) 973 }// End if ( offset.equals(a.getEndNode().getOffset()) ) 974 }// End while(it.hasNext()){ 975 976 // In this case empty the stack and write the end tag 977 if (!stack.isEmpty()){ 978 while(!stack.isEmpty()){ 979 Annotation a1 = (Annotation)stack.pop(); 980 tmpBuff.append(writeEndTag(a1)); 981 }// End while 982 }// End if 983 984 // Before inserting tmpBuff into docContStrBuff we need to check 985 // if there are chars to be replaced and if there are, they would be 986 // replaced. 987 if (!offsets2CharsMap.isEmpty()){ 988 Long offsChar = (Long) offsets2CharsMap.lastKey(); 989 while( !offsets2CharsMap.isEmpty() && 990 offsChar.intValue() >= offset.intValue()){ 991 // Replace the char at offsChar with its corresponding entity form 992 // the entitiesMap. 993 docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1, 994 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); 995 // Discard the offsChar after it was used. 996 offsets2CharsMap.remove(offsChar); 997 // Investigate next offsChar 998 if (!offsets2CharsMap.isEmpty()) 999 offsChar = (Long) offsets2CharsMap.lastKey(); 1000 }// End while 1001 }// End if 1002 // Insert tmpBuff to the location where it belongs in docContStrBuff 1003 docContStrBuff.insert(offset.intValue(),tmpBuff.toString()); 1004 }// End while(!offsets.isEmpty()) 1005 // Need to replace the entities in the remaining text, if there is any text 1006 // So, if there are any more items in offsets2CharsMap they need to be 1007 // replaced 1008 while (!offsets2CharsMap.isEmpty()){ 1009 Long offsChar = (Long) offsets2CharsMap.lastKey(); 1010 // Replace the char with its entity 1011 docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1, 1012 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); 1013 // remove the offset from the map 1014 offsets2CharsMap.remove(offsChar); 1015 }// End while 1016 return docContStrBuff.toString(); 1017 }// saveAnnotationSetAsXml() 1018 1019 private String saveAnnotationSetAsXml(List aDumpAnnotList, 1020 boolean includeFeatures){ 1021 String content = null; 1022 if (this.getContent()== null) 1023 content = new String(""); 1024 else 1025 content = this.getContent().toString(); 1026 StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content)); 1027 if (aDumpAnnotList == null) return docContStrBuff.toString(); 1028 1029 StringBuffer resultStrBuff = new StringBuffer( 1030 DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue())); 1031 // last offset position used to extract portions of text 1032 Long lastOffset = new Long(0); 1033 1034 TreeMap offsets2CharsMap = new TreeMap(); 1035 HashMap annotsForOffset = new HashMap(100); 1036 if (this.getContent().size().longValue() != 0){ 1037 // Fill the offsets2CharsMap with all the indices where 1038 // special chars appear 1039 buildEntityMapFromString(content,offsets2CharsMap); 1040 }//End if 1041 // The saving alghorithm is as follows: 1042 /////////////////////////////////////////// 1043 // Construct a set of annot with all IDs in asc order. 1044 // All annotations that end at that offset swap their place in descending 1045 // order. For each node write all the tags from left to right. 1046 1047 // Construct the node set 1048 TreeSet offsets = new TreeSet(); 1049 Iterator iter = aDumpAnnotList.iterator(); 1050 Annotation annot; 1051 Long start; 1052 Long end; 1053 while (iter.hasNext()){ 1054 annot = (Annotation) iter.next(); 1055 start = annot.getStartNode().getOffset(); 1056 end = annot.getEndNode().getOffset(); 1057 offsets.add(start); 1058 offsets.add(end); 1059 if (annotsForOffset.containsKey(start)) { 1060 ((List) annotsForOffset.get(start)).add(annot); 1061 } else { 1062 List newList = new ArrayList(10); 1063 newList.add(annot); 1064 annotsForOffset.put(start, newList); 1065 } 1066 if (annotsForOffset.containsKey(end)) { 1067 ((List) annotsForOffset.get(end)).add(annot); 1068 } else { 1069 List newList = new ArrayList(10); 1070 newList.add(annot); 1071 annotsForOffset.put(end, newList); 1072 } 1073 }// End while 1074 1075 // ofsets is sorted in ascending order. 1076 // Iterate this set in descending order and remove an offset at each 1077 // iteration 1078 Iterator offsetIt = offsets.iterator(); 1079 Long offset; 1080 List annotations; 1081 // This don't have to be a large buffer - just for tags 1082 StringBuffer tmpBuff = new StringBuffer(255); 1083 Stack stack = new Stack(); 1084 while (offsetIt.hasNext()){ 1085 offset = (Long)offsetIt.next(); 1086 // Now, use it. 1087 // Returns a list with annotations that needs to be serialized in that 1088 // offset. 1089 annotations = (List) annotsForOffset.get(offset); 1090 // order annotations in list for offset to print tags in correct order 1091 annotations = getAnnotationsForOffset(annotations, offset); 1092 // clear structures 1093 tmpBuff.setLength(0); 1094 stack.clear(); 1095 1096 // Iterate through all these annotations and serialize them 1097 Iterator it = annotations.iterator(); 1098 Annotation a; 1099 Annotation annStack; 1100 while(it.hasNext()){ 1101 a = (Annotation) it.next(); 1102 // Test if a Ends at offset 1103 if ( offset.equals(a.getEndNode().getOffset()) ){ 1104 // Test if a Starts at offset 1105 if ( offset.equals(a.getStartNode().getOffset()) ){ 1106 // Here, the annotation a Starts and Ends at the offset 1107 if ( null != a.getFeatures().get("isEmptyAndSpan") && 1108 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){ 1109 1110 // Assert: annotation a with start == end and isEmptyAndSpan 1111 tmpBuff.append(writeStartTag(a, includeFeatures)); 1112 stack.push(a); 1113 }else{ 1114 // Assert annotation a with start == end and an empty tag 1115 tmpBuff.append(writeEmptyTag(a)); 1116 // The annotation is removed from dumped set 1117 aDumpAnnotList.remove(a); 1118 }// End if 1119 }else{ 1120 // Here the annotation a Ends at the offset. 1121 // In this case empty the stack and write the end tag 1122 if (!stack.isEmpty()){ 1123 while(!stack.isEmpty()){ 1124 annStack = (Annotation)stack.pop(); 1125 tmpBuff.append(writeEndTag(annStack)); 1126 }// End while 1127 }// End if 1128 tmpBuff.append(writeEndTag(a)); 1129 }// End if 1130 }else{ 1131 // The annotation a does NOT end at the offset. Let's see if it starts 1132 // at the offset 1133 if ( offset.equals(a.getStartNode().getOffset()) ){ 1134 // The annotation a starts at the offset. 1135 // In this case empty the stack and write the end tag 1136 if (!stack.isEmpty()){ 1137 while(!stack.isEmpty()){ 1138 annStack = (Annotation)stack.pop(); 1139 tmpBuff.append(writeEndTag(annStack)); 1140 }// End while 1141 }// End if 1142 tmpBuff.append(writeStartTag(a, includeFeatures)); 1143 // The annotation is removed from dumped set 1144 }// End if ( offset.equals(a.getStartNode().getOffset()) ) 1145 }// End if ( offset.equals(a.getEndNode().getOffset()) ) 1146 }// End while(it.hasNext()){ 1147 1148 // In this case empty the stack and write the end tag 1149 if (!stack.isEmpty()){ 1150 while(!stack.isEmpty()){ 1151 annStack = (Annotation)stack.pop(); 1152 tmpBuff.append(writeEndTag(annStack)); 1153 }// End while 1154 }// End if 1155 1156 // extract text from content and replace spec chars 1157 StringBuffer partText = new StringBuffer(); 1158 SortedMap offsetsInRange = 1159 offsets2CharsMap.subMap(lastOffset, offset); 1160 Long tmpOffset; 1161 Long tmpLastOffset = lastOffset; 1162 String replacement; 1163 1164 // Before inserting tmpBuff into the buffer we need to check 1165 // if there are chars to be replaced in range 1166 if(!offsetsInRange.isEmpty()) { 1167 tmpOffset = (Long) offsetsInRange.firstKey(); 1168 replacement = 1169 (String)entitiesMap.get((Character)offsets2CharsMap.get(tmpOffset)); 1170 partText.append(docContStrBuff.substring(tmpLastOffset.intValue(), 1171 tmpOffset.intValue())); 1172 partText.append(replacement); 1173 tmpLastOffset = new Long(tmpOffset.longValue()+1); 1174 } 1175 partText.append(docContStrBuff.substring(tmpLastOffset.intValue(), 1176 offset.intValue())); 1177 resultStrBuff.append(partText); 1178 // Insert tmpBuff to the result string 1179 resultStrBuff.append(tmpBuff.toString()); 1180 lastOffset = offset; 1181 }// End while(!offsets.isEmpty()) 1182 1183 // get text to the end of content 1184 // extract text from content and replace spec chars 1185 StringBuffer partText = new StringBuffer(); 1186 SortedMap offsetsInRange = 1187 offsets2CharsMap.subMap(lastOffset, new Long(docContStrBuff.length())); 1188 Long tmpOffset; 1189 Long tmpLastOffset = lastOffset; 1190 String replacement; 1191 1192 // Need to replace the entities in the remaining text, if there is any text 1193 // So, if there are any more items in offsets2CharsMap for remaining text 1194 // they need to be replaced 1195 if(!offsetsInRange.isEmpty()) { 1196 tmpOffset = (Long) offsetsInRange.firstKey(); 1197 replacement = 1198 (String)entitiesMap.get((Character)offsets2CharsMap.get(tmpOffset)); 1199 partText.append(docContStrBuff.substring(tmpLastOffset.intValue(), 1200 tmpOffset.intValue())); 1201 partText.append(replacement); 1202 tmpLastOffset = new Long(tmpOffset.longValue()+1); 1203 } 1204 partText.append(docContStrBuff.substring(tmpLastOffset.intValue(), 1205 docContStrBuff.length())); 1206 resultStrBuff.append(partText); 1207 1208 return resultStrBuff.toString(); 1209 }// saveAnnotationSetAsXml() 1210 1211/* Old method created by Cristian. Create content backward. 1212 1213 private String saveAnnotationSetAsXml(List aDumpAnnotList, 1214 boolean includeFeatures){ 1215 String content = null; 1216 if (this.getContent()== null) 1217 content = new String(""); 1218 else 1219 content = this.getContent().toString(); 1220 StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content)); 1221 if (aDumpAnnotList == null) return docContStrBuff.toString(); 1222 1223 TreeMap offsets2CharsMap = new TreeMap(); 1224 HashMap annotsForOffset = new HashMap(100); 1225 if (this.getContent().size().longValue() != 0){ 1226 // Fill the offsets2CharsMap with all the indices where 1227 // special chars appear 1228 buildEntityMapFromString(content,offsets2CharsMap); 1229 }//End if 1230 // The saving alghorithm is as follows: 1231 /////////////////////////////////////////// 1232 // Construct a set of annot with all IDs in asc order. 1233 // All annotations that end at that offset swap their place in descending 1234 // order. For each node write all the tags from left to right. 1235 1236 // Construct the node set 1237 TreeSet offsets = new TreeSet(); 1238 Iterator iter = aDumpAnnotList.iterator(); 1239 while (iter.hasNext()){ 1240 Annotation annot = (Annotation) iter.next(); 1241 offsets.add(annot.getStartNode().getOffset()); 1242 offsets.add(annot.getEndNode().getOffset()); 1243 if (annotsForOffset.containsKey(annot.getStartNode().getOffset())) { 1244 ((List) annotsForOffset.get(annot.getStartNode().getOffset())).add(annot); 1245 } else { 1246 List newList = new ArrayList(10); 1247 newList.add(annot); 1248 annotsForOffset.put(annot.getStartNode().getOffset(), newList); 1249 } 1250 if (annotsForOffset.containsKey(annot.getEndNode().getOffset())) { 1251 ((List) annotsForOffset.get(annot.getEndNode().getOffset())).add(annot); 1252 } else { 1253 List newList = new ArrayList(10); 1254 newList.add(annot); 1255 annotsForOffset.put(annot.getEndNode().getOffset(), newList); 1256 } 1257 }// End while 1258 1259 // ofsets is sorted in ascending order. 1260 // Iterate this set in descending order and remove an offset at each 1261 // iteration 1262 while (!offsets.isEmpty()){ 1263 Long offset = (Long)offsets.last(); 1264 // Remove the offset from the set 1265 offsets.remove(offset); 1266 // Now, use it. 1267 // Returns a list with annotations that needs to be serialized in that 1268 // offset. 1269// List annotations = getAnnotationsForOffset(aDumpAnnotList,offset); 1270 List annotations = (List) annotsForOffset.get(offset); 1271 annotations = getAnnotationsForOffset(annotations,offset); 1272 // Attention: the annotation are serialized from left to right 1273// StringBuffer tmpBuff = new StringBuffer(""); 1274 StringBuffer tmpBuff = new StringBuffer( 1275 DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue())); 1276 Stack stack = new Stack(); 1277 // Iterate through all these annotations and serialize them 1278 Iterator it = annotations.iterator(); 1279 while(it.hasNext()){ 1280 Annotation a = (Annotation) it.next(); 1281 it.remove(); 1282 // Test if a Ends at offset 1283 if ( offset.equals(a.getEndNode().getOffset()) ){ 1284 // Test if a Starts at offset 1285 if ( offset.equals(a.getStartNode().getOffset()) ){ 1286 // Here, the annotation a Starts and Ends at the offset 1287 if ( null != a.getFeatures().get("isEmptyAndSpan") && 1288 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){ 1289 1290 // Assert: annotation a with start == end and isEmptyAndSpan 1291 tmpBuff.append(writeStartTag(a, includeFeatures)); 1292 stack.push(a); 1293 }else{ 1294 // Assert annotation a with start == end and an empty tag 1295 tmpBuff.append(writeEmptyTag(a)); 1296 // The annotation is removed from dumped set 1297 aDumpAnnotList.remove(a); 1298 }// End if 1299 }else{ 1300 // Here the annotation a Ends at the offset. 1301 // In this case empty the stack and write the end tag 1302 if (!stack.isEmpty()){ 1303 while(!stack.isEmpty()){ 1304 Annotation a1 = (Annotation)stack.pop(); 1305 tmpBuff.append(writeEndTag(a1)); 1306 }// End while 1307 }// End if 1308 tmpBuff.append(writeEndTag(a)); 1309 }// End if 1310 }else{ 1311 // The annotation a does NOT end at the offset. Let's see if it starts 1312 // at the offset 1313 if ( offset.equals(a.getStartNode().getOffset()) ){ 1314 // The annotation a starts at the offset. 1315 // In this case empty the stack and write the end tag 1316 if (!stack.isEmpty()){ 1317 while(!stack.isEmpty()){ 1318 Annotation a1 = (Annotation)stack.pop(); 1319 tmpBuff.append(writeEndTag(a1)); 1320 }// End while 1321 }// End if 1322 tmpBuff.append(writeStartTag(a, includeFeatures)); 1323 // The annotation is removed from dumped set 1324 aDumpAnnotList.remove(a); 1325 }// End if ( offset.equals(a.getStartNode().getOffset()) ) 1326 }// End if ( offset.equals(a.getEndNode().getOffset()) ) 1327 }// End while(it.hasNext()){ 1328 1329 // In this case empty the stack and write the end tag 1330 if (!stack.isEmpty()){ 1331 while(!stack.isEmpty()){ 1332 Annotation a1 = (Annotation)stack.pop(); 1333 tmpBuff.append(writeEndTag(a1)); 1334 }// End while 1335 }// End if 1336 1337 // Before inserting tmpBuff into docContStrBuff we need to check 1338 // if there are chars to be replaced and if there are, they would be 1339 // replaced. 1340 if (!offsets2CharsMap.isEmpty()){ 1341 Long offsChar = (Long) offsets2CharsMap.lastKey(); 1342 while( !offsets2CharsMap.isEmpty() && 1343 offsChar.intValue() >= offset.intValue()){ 1344 // Replace the char at offsChar with its corresponding entity form 1345 // the entitiesMap. 1346 docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1, 1347 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); 1348 // Discard the offsChar after it was used. 1349 offsets2CharsMap.remove(offsChar); 1350 // Investigate next offsChar 1351 if (!offsets2CharsMap.isEmpty()) 1352 offsChar = (Long) offsets2CharsMap.lastKey(); 1353 }// End while 1354 }// End if 1355 // Insert tmpBuff to the location where it belongs in docContStrBuff 1356 docContStrBuff.insert(offset.intValue(),tmpBuff.toString()); 1357 }// End while(!offsets.isEmpty()) 1358 // Need to replace the entities in the remaining text, if there is any text 1359 // So, if there are any more items in offsets2CharsMap they need to be 1360 // replaced 1361 while (!offsets2CharsMap.isEmpty()){ 1362 Long offsChar = (Long) offsets2CharsMap.lastKey(); 1363 // Replace the char with its entity 1364 docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1, 1365 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); 1366 // remove the offset from the map 1367 offsets2CharsMap.remove(offsChar); 1368 }// End while 1369 return docContStrBuff.toString(); 1370 }// saveAnnotationSetAsXml() 1371*/ 1372 1373 /** 1374 * Return true only if the document has features for original content and 1375 * repositioning information. 1376 */ 1377 private boolean hasOriginalContentFeatures() { 1378 FeatureMap features = getFeatures(); 1379 boolean result = false; 1380 1381 result = 1382 (features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME) != null) 1383 && 1384 (features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME) 1385 != null); 1386 1387 return result; 1388 } // hasOriginalContentFeatures 1389 1390 /** This method saves all the annotations from aDumpAnnotSet and combines 1391 * them with the original document content, if preserved as feature. 1392 * @param aDumpAnnotationSet is a GATE annotation set prepared to be used 1393 * on the raw text from document content. If aDumpAnnotSet is <b>null<b> 1394 * then an empty string will be returned. 1395 * @param includeFeatures is a boolean, which controls whether the annotation 1396 * features and gate ID are included or not. 1397 * @return The XML document obtained from raw text + the information from 1398 * the dump annotation set. 1399 */ 1400 private String saveAnnotationSetAsXmlInOrig(Set aSourceAnnotationSet, 1401 boolean includeFeatures){ 1402 StringBuffer docContStrBuff; 1403 1404 String origContent; 1405 1406 origContent = 1407 (String)features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME); 1408 if(origContent == null) { 1409 origContent = ""; 1410 } // if 1411 1412 long originalContentSize = origContent.length(); 1413 1414 RepositioningInfo repositioning = (RepositioningInfo) 1415 getFeatures().get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME); 1416 1417 docContStrBuff = new StringBuffer(origContent); 1418 if (aSourceAnnotationSet == null) return docContStrBuff.toString(); 1419 1420 StatusListener sListener = (StatusListener) 1421 gate.gui.MainFrame.getListeners(). 1422 get("gate.event.StatusListener"); 1423 1424 AnnotationSet originalMarkupsAnnotSet = 1425 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 1426 // Create a dumping annotation set on the document. It will be used for 1427 // dumping annotations... 1428 AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this); 1429 if(sListener != null) 1430 sListener.statusChanged("Constructing the dumping annotation set."); 1431 // Then take all the annotations from aSourceAnnotationSet and verify if 1432 // they can be inserted safely into the dumpingSet. Where not possible, 1433 // report. 1434 if (aSourceAnnotationSet != null){ 1435 Iterator iter = aSourceAnnotationSet.iterator(); 1436 Annotation currentAnnot; 1437 while (iter.hasNext()){ 1438 currentAnnot = (Annotation) iter.next(); 1439 if(insertsSafety(originalMarkupsAnnotSet, currentAnnot) 1440 && insertsSafety(dumpingSet, currentAnnot)){ 1441 dumpingSet.add(currentAnnot); 1442 }else{ 1443 Out.prln("Warning: Annotation with ID=" + currentAnnot.getId() + 1444 ", startOffset=" + currentAnnot.getStartNode().getOffset() + 1445 ", endOffset=" + currentAnnot.getEndNode().getOffset() + 1446 ", type=" + currentAnnot.getType()+ " was found to violate the" + 1447 " crossed over condition. It will be discarded"); 1448 }// End if 1449 }// End while 1450 }// End if 1451 1452 // The dumpingSet is ready to be exported as XML 1453 // Here we go. 1454 if(sListener != null) sListener.statusChanged("Dumping annotations as XML"); 1455 1456 /////////////////////////////////////////// 1457 // Construct a set of annot with all IDs in asc order. 1458 // All annotations that end at that offset swap their place in descending 1459 // order. For each node write all the tags from left to right. 1460 1461 // Construct the node set 1462 TreeSet offsets = new TreeSet(); 1463 Iterator iter = aSourceAnnotationSet.iterator(); 1464 while (iter.hasNext()){ 1465 Annotation annot = (Annotation) iter.next(); 1466 offsets.add(annot.getStartNode().getOffset()); 1467 offsets.add(annot.getEndNode().getOffset()); 1468 }// End while 1469 1470 // ofsets is sorted in ascending order. 1471 // Iterate this set in descending order and remove an offset at each 1472 // iteration 1473 while (!offsets.isEmpty()){ 1474 Long offset = (Long)offsets.last(); 1475 // Remove the offset from the set 1476 offsets.remove(offset); 1477 // Now, use it. 1478 // Returns a list with annotations that needs to be serialized in that 1479 // offset. 1480 List annotations = getAnnotationsForOffset(aSourceAnnotationSet,offset); 1481 // Attention: the annotation are serialized from left to right 1482 StringBuffer tmpBuff = new StringBuffer(""); 1483 Stack stack = new Stack(); 1484 // Iterate through all these annotations and serialize them 1485 Iterator it = annotations.iterator(); 1486 Annotation a = null; 1487 while(it.hasNext()) { 1488 a = (Annotation) it.next(); 1489 it.remove(); 1490 // Test if a Ends at offset 1491 if ( offset.equals(a.getEndNode().getOffset()) ){ 1492 // Test if a Starts at offset 1493 if ( offset.equals(a.getStartNode().getOffset()) ){ 1494 // Here, the annotation a Starts and Ends at the offset 1495 if ( null != a.getFeatures().get("isEmptyAndSpan") && 1496 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){ 1497 1498 // Assert: annotation a with start == end and isEmptyAndSpan 1499 tmpBuff.append(writeStartTag(a, includeFeatures, false)); 1500 stack.push(a); 1501 }else{ 1502 // Assert annotation a with start == end and an empty tag 1503 tmpBuff.append(writeEmptyTag(a, false)); 1504 // The annotation is removed from dumped set 1505 aSourceAnnotationSet.remove(a); 1506 }// End if 1507 }else{ 1508 // Here the annotation a Ends at the offset. 1509 // In this case empty the stack and write the end tag 1510 while(!stack.isEmpty()){ 1511 Annotation a1 = (Annotation)stack.pop(); 1512 tmpBuff.append(writeEndTag(a1)); 1513 }// End while 1514 tmpBuff.append(writeEndTag(a)); 1515 }// End if 1516 }else{ 1517 // The annotation a does NOT end at the offset. Let's see if it starts 1518 // at the offset 1519 if ( offset.equals(a.getStartNode().getOffset()) ){ 1520 // The annotation a starts at the offset. 1521 // In this case empty the stack and write the end tag 1522 while(!stack.isEmpty()){ 1523 Annotation a1 = (Annotation)stack.pop(); 1524 tmpBuff.append(writeEndTag(a1)); 1525 }// End while 1526 1527 tmpBuff.append(writeStartTag(a, includeFeatures, false)); 1528 // The annotation is removed from dumped set 1529 aSourceAnnotationSet.remove(a); 1530 }// End if ( offset.equals(a.getStartNode().getOffset()) ) 1531 }// End if ( offset.equals(a.getEndNode().getOffset()) ) 1532 }// End while(it.hasNext()){ 1533 1534 // In this case empty the stack and write the end tag 1535 while(!stack.isEmpty()){ 1536 Annotation a1 = (Annotation)stack.pop(); 1537 tmpBuff.append(writeEndTag(a1)); 1538 }// End while 1539 1540 long originalPosition = -1; 1541 boolean backPositioning = 1542 a != null && offset.equals(a.getEndNode().getOffset()); 1543 if ( backPositioning ) { 1544 // end of the annotation correction 1545 originalPosition = 1546 repositioning.getOriginalPos(offset.intValue(), true); 1547 } // if 1548 1549 if(originalPosition == -1) { 1550 originalPosition = repositioning.getOriginalPos(offset.intValue()); 1551 } // if 1552 1553 // Insert tmpBuff to the location where it belongs in docContStrBuff 1554 if(originalPosition != -1 && originalPosition <= originalContentSize ) { 1555 docContStrBuff.insert((int) originalPosition, tmpBuff.toString()); 1556 } 1557 else { 1558 Out.prln("Error in the repositioning. The offset ("+offset.intValue() 1559 +") could not be positioned in the original document. \n" 1560 +"Calculated position is: "+originalPosition 1561 +" placed back: "+backPositioning); 1562 } // if 1563 1564 }// End while(!offsets.isEmpty()) 1565 if (theRootAnnotation != null) 1566 docContStrBuff.append(writeEndTag(theRootAnnotation)); 1567 return docContStrBuff.toString(); 1568 } // saveAnnotationSetAsXmlInOrig() 1569 1570 /** This method returns a list with annotations ordered that way that 1571 * they can be serialized from left to right, at the offset. If one of the 1572 * params is null then an empty list will be returned. 1573 * @param aDumpAnnotSet is a set containing all annotations that will be 1574 * dumped. 1575 * @param offset represent the offset at witch the annotation must start 1576 * AND/OR end. 1577 * @return a list with those annotations that need to be serialized. 1578 */ 1579 private List getAnnotationsForOffset(Set aDumpAnnotSet, Long offset){ 1580 List annotationList = new LinkedList(); 1581 if (aDumpAnnotSet == null || offset == null) return annotationList; 1582 Set annotThatStartAtOffset = new TreeSet( 1583 new AnnotationComparator(ORDER_ON_END_OFFSET,DESC)); 1584 Set annotThatEndAtOffset = new TreeSet( 1585 new AnnotationComparator(ORDER_ON_START_OFFSET,DESC)); 1586 Set annotThatStartAndEndAtOffset = new TreeSet( 1587 new AnnotationComparator(ORDER_ON_ANNOT_ID,ASC)); 1588 1589 // Fill these tree lists with annotation tat start, end or start and 1590 // end at the offset. 1591 Iterator iter = aDumpAnnotSet.iterator(); 1592 while(iter.hasNext()){ 1593 Annotation ann = (Annotation) iter.next(); 1594 if (offset.equals(ann.getStartNode().getOffset())){ 1595 if (offset.equals(ann.getEndNode().getOffset())) 1596 annotThatStartAndEndAtOffset.add(ann); 1597 else 1598 annotThatStartAtOffset.add(ann); 1599 }else{ 1600 if (offset.equals(ann.getEndNode().getOffset())) 1601 annotThatEndAtOffset.add(ann); 1602 }// End if 1603 }// End while 1604 annotationList.addAll(annotThatEndAtOffset); 1605 annotThatEndAtOffset = null; 1606 annotationList.addAll(annotThatStartAtOffset); 1607 annotThatStartAtOffset = null; 1608 iter = annotThatStartAndEndAtOffset.iterator(); 1609 while(iter.hasNext()){ 1610 Annotation ann = (Annotation) iter.next(); 1611 Iterator it = annotationList.iterator(); 1612 boolean breaked = false; 1613 while (it.hasNext()){ 1614 Annotation annFromList = (Annotation) it.next(); 1615 if (annFromList.getId().intValue() > ann.getId().intValue()){ 1616 annotationList.add(annotationList.indexOf(annFromList),ann); 1617 breaked = true; 1618 break; 1619 }// End if 1620 }// End while 1621 if (!breaked) 1622 annotationList.add(ann); 1623 iter.remove(); 1624 }// End while 1625 return annotationList; 1626 }// getAnnotationsForOffset() 1627 1628 private List getAnnotationsForOffset(List aDumpAnnotList, Long offset){ 1629 List annotationList = new ArrayList(); 1630 if (aDumpAnnotList == null || offset == null) return annotationList; 1631 Set annotThatStartAtOffset; 1632 Set annotThatEndAtOffset; 1633 Set annotThatStartAndEndAtOffset; 1634 annotThatStartAtOffset = new TreeSet( 1635 new AnnotationComparator(ORDER_ON_END_OFFSET, DESC)); 1636 annotThatEndAtOffset = new TreeSet( 1637 new AnnotationComparator(ORDER_ON_START_OFFSET, DESC)); 1638 annotThatStartAndEndAtOffset = new TreeSet( 1639 new AnnotationComparator(ORDER_ON_ANNOT_ID, ASC)); 1640 1641 // Fill these tree lists with annotation tat start, end or start and 1642 // end at the offset. 1643 Iterator iter = aDumpAnnotList.iterator(); 1644 while(iter.hasNext()){ 1645 Annotation ann = (Annotation) iter.next(); 1646 if (offset.equals(ann.getStartNode().getOffset())){ 1647 if (offset.equals(ann.getEndNode().getOffset())) 1648 annotThatStartAndEndAtOffset.add(ann); 1649 else 1650 annotThatStartAtOffset.add(ann); 1651 }else{ 1652 if (offset.equals(ann.getEndNode().getOffset())) 1653 annotThatEndAtOffset.add(ann); 1654 }// End if 1655 }// End while 1656 1657 annotationList.addAll(annotThatEndAtOffset); 1658 annotationList.addAll(annotThatStartAtOffset); 1659 annotThatEndAtOffset = null; 1660 annotThatStartAtOffset = null; 1661 1662 iter = annotThatStartAndEndAtOffset.iterator(); 1663 while(iter.hasNext()){ 1664 Annotation ann = (Annotation) iter.next(); 1665 Iterator it = annotationList.iterator(); 1666 boolean breaked = false; 1667 while (it.hasNext()){ 1668 Annotation annFromList = (Annotation) it.next(); 1669 if (annFromList.getId().intValue() > ann.getId().intValue()){ 1670 annotationList.add(annotationList.indexOf(annFromList),ann); 1671 breaked = true; 1672 break; 1673 }// End if 1674 }// End while 1675 if (!breaked) 1676 annotationList.add(ann); 1677 iter.remove(); 1678 }// End while 1679 return annotationList; 1680 }// getAnnotationsForOffset() 1681 1682 private String writeStartTag(Annotation annot, boolean includeFeatures){ 1683 return writeStartTag(annot, includeFeatures, true); 1684 } // writeStartTag 1685 1686 /** Returns a string representing a start tag based on the input annot*/ 1687 private String writeStartTag(Annotation annot, boolean includeFeatures, 1688 boolean includeNamespace){ 1689 AnnotationSet originalMarkupsAnnotSet = 1690 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 1691 1692 StringBuffer strBuff = new StringBuffer(""); 1693 if (annot == null) return strBuff.toString(); 1694// if (!addGatePreserveFormatTag && isRootTag){ 1695 if (theRootAnnotation != null && annot.getId().equals(theRootAnnotation.getId())){ 1696 //the features are included either if desired or if that's an annotation 1697 //from the original markup of the document. We don't want for example to 1698 //spoil all links in an HTML file! 1699 if (includeFeatures) { 1700 strBuff.append("<"); 1701 strBuff.append(annot.getType()); 1702 strBuff.append(" "); 1703 if(includeNamespace) { 1704 strBuff.append(" xmlns:gate=\"http://www.gate.ac.uk\""); 1705 strBuff.append(" gate:"); 1706 } 1707 strBuff.append("gateId=\""); 1708 strBuff.append(annot.getId()); 1709 strBuff.append("\""); 1710 strBuff.append(" "); 1711 if(includeNamespace) { 1712 strBuff.append("gate:"); 1713 } 1714 strBuff.append("annotMaxId=\""); 1715 strBuff.append(nextAnnotationId); 1716 strBuff.append("\""); 1717 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace)); 1718 strBuff.append(">"); 1719 } 1720 else if (originalMarkupsAnnotSet.contains(annot)) { 1721 strBuff.append("<"); 1722 strBuff.append(annot.getType()); 1723 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace)); 1724 strBuff.append(">"); 1725 } 1726 else { 1727 strBuff.append("<"); 1728 strBuff.append(annot.getType()); 1729 strBuff.append(">"); 1730 } 1731 1732 }else{ 1733 //the features are included either if desired or if that's an annotation 1734 //from the original markup of the document. We don't want for example to 1735 //spoil all links in an HTML file! 1736 if (includeFeatures) { 1737 strBuff.append("<"); 1738 strBuff.append(annot.getType()); 1739 strBuff.append(" "); 1740 if(includeNamespace) { 1741 strBuff.append("gate:"); 1742 } // if includeNamespaces 1743 strBuff.append("gateId=\""); 1744 strBuff.append(annot.getId()); 1745 strBuff.append("\""); 1746 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace)); 1747 strBuff.append(">"); 1748 } 1749 else if (originalMarkupsAnnotSet.contains(annot)) { 1750 strBuff.append("<"); 1751 strBuff.append(annot.getType()); 1752 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace)); 1753 strBuff.append(">"); 1754 } 1755 else { 1756 strBuff.append("<"); 1757 strBuff.append(annot.getType()); 1758 strBuff.append(">"); 1759 } 1760 }// End if 1761 return strBuff.toString(); 1762 }// writeStartTag() 1763 1764 /** 1765 * Identifies the root annotations inside an annotation set. 1766 * The root annotation is the one that starts at offset 0, and has the 1767 * greatest span. If there are more than one with this function, then the 1768 * annotation with the smalled ID wil be selected as root. 1769 * If none is identified it will return null. 1770 * @param anAnnotationSet The annotation set possibly containing 1771 * the root annotation. 1772 * @return The root annotation or null is it fails 1773 */ 1774 private Annotation identifyTheRootAnnotation(AnnotationSet anAnnotationSet){ 1775 if (anAnnotationSet == null) return null; 1776 // If the starting node of this annotation is not null, then the annotation 1777 // set will not have a root annotation. 1778 Node startNode = anAnnotationSet.firstNode(); 1779 Node endNode = anAnnotationSet.lastNode(); 1780 // This is placed here just to speed things up. The alghorithm bellow can 1781 // can identity the annotation that span over the entire set and with the 1782 // smallest ID. However the root annotation will have to have the start 1783 // offset equal to 0. 1784 if (startNode.getOffset().longValue() != 0) return null; 1785 // Go anf find the annotation. 1786 Annotation theRootAnnotation = null; 1787 // Check if there are annotations starting at offset 0. If there are, then 1788 // check all of them to see which one has the greatest span. Basically its 1789 // END offset should be the bigest offset from the input annotation set. 1790 long start = startNode.getOffset().longValue(); 1791 long end = endNode.getOffset().longValue(); 1792 for(Iterator it = anAnnotationSet.iterator(); it.hasNext();){ 1793 Annotation currentAnnot = (Annotation) it.next(); 1794 // If the currentAnnot has both its Start and End equals to the Start and 1795 // end of the AnnotationSet then check to see if its ID is the smallest. 1796 if ( 1797 (start == currentAnnot.getStartNode().getOffset().longValue()) && 1798 (end == currentAnnot.getEndNode().getOffset().longValue()) 1799 ){ 1800 // The currentAnnotation has is a potencial root one. 1801 if (theRootAnnotation == null) 1802 theRootAnnotation = currentAnnot; 1803 else{ 1804 // If its ID is greater that the currentAnnot then update the root 1805 if ( theRootAnnotation.getId().intValue() > currentAnnot.getId().intValue()) 1806 theRootAnnotation = currentAnnot; 1807 }// End if 1808 }// End if 1809 }// End for 1810 return theRootAnnotation; 1811 }// End identifyTheRootAnnotation() 1812 1813 private Annotation identifyTheRootAnnotation(List anAnnotationList){ 1814 if (anAnnotationList == null || anAnnotationList.isEmpty()) return null; 1815 // If the first annotation in the list (which is sorted by start offset) 1816 //does not have an offset = 0, then there's no root tag. 1817 if(((Annotation)anAnnotationList.get(0)). 1818 getStartNode().getOffset().longValue() > 0) return null; 1819 1820 //find the limits 1821 long start = 0; //we know this already 1822 long end = 0; //end = 0 will be improved by the next loop 1823 for(int i = 0; i < anAnnotationList.size(); i++){ 1824 Annotation anAnnotation = (Annotation)anAnnotationList.get(i); 1825 long localEnd = anAnnotation.getEndNode().getOffset().longValue(); 1826 if(localEnd > end) end = localEnd; 1827 } 1828 1829 // Go and find the annotation. 1830 //look at all annotations that start at 0 and end at end 1831 //if there are several, choose the one with the smallest ID 1832 Annotation theRootAnnotation = null; 1833 for(int i = 0; i < anAnnotationList.size(); i++){ 1834 Annotation currentAnnot = (Annotation) anAnnotationList.get(i); 1835 long localStart = currentAnnot.getStartNode().getOffset().longValue(); 1836 long localEnd = currentAnnot.getEndNode().getOffset().longValue(); 1837 // If the currentAnnot has both its Start and End equals to the Start and 1838 // end of the AnnotationSet then check to see if its ID is the smallest. 1839 if ( 1840 (start == localStart) && (end == localEnd)){ 1841 // The currentAnnotation has is a potential root one. 1842 if (theRootAnnotation == null) theRootAnnotation = currentAnnot; 1843 else{ 1844 // If root's ID is greater that the currentAnnot then update the root 1845 if (theRootAnnotation.getId().intValue() > currentAnnot.getId().intValue()) 1846 theRootAnnotation = currentAnnot; 1847 }// End if 1848 }// End if 1849 }// End for 1850 return theRootAnnotation; 1851 }// End identifyTheRootAnnotation() 1852 1853 1854 /** This method takes aScanString and searches for those chars from 1855 * entitiesMap that appear in the string. A tree map(offset2Char) is filled 1856 * using as key the offsets where those Chars appear and the Char. 1857 * If one of the params is null the method simply returns. 1858 */ 1859 private void buildEntityMapFromString(String aScanString, TreeMap aMapToFill){ 1860 if (aScanString == null || aMapToFill == null) return; 1861 if (entitiesMap == null || entitiesMap.isEmpty()){ 1862 Err.prln("WARNING: Entities map was not initialised !"); 1863 return; 1864 }// End if 1865 // Fill the Map with the offsets of the special chars 1866 Iterator entitiesMapIterator = entitiesMap.keySet().iterator(); 1867 Character c; 1868 int fromIndex; 1869 while(entitiesMapIterator.hasNext()){ 1870 c = (Character) entitiesMapIterator.next(); 1871 fromIndex = 0; 1872 while (-1 != fromIndex){ 1873 fromIndex = aScanString.indexOf(c.charValue(),fromIndex); 1874 if (-1 != fromIndex){ 1875 aMapToFill.put(new Long(fromIndex),c); 1876 fromIndex ++; 1877 }// End if 1878 }// End while 1879 }// End while 1880 }//buildEntityMapFromString(); 1881 1882 private String writeEmptyTag(Annotation annot){ 1883 return writeEmptyTag(annot, true); 1884 } // writeEmptyTag 1885 1886 /** Returns a string representing an empty tag based on the input annot*/ 1887 private String writeEmptyTag(Annotation annot, boolean includeNamespace){ 1888 StringBuffer strBuff = new StringBuffer(""); 1889 if (annot == null) return strBuff.toString(); 1890 1891 strBuff.append("<"); 1892 strBuff.append(annot.getType()); 1893 1894 AnnotationSet originalMarkupsAnnotSet = 1895 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 1896 if (! originalMarkupsAnnotSet.contains(annot)) { 1897 strBuff.append(" gateId=\""); 1898 strBuff.append(annot.getId()); 1899 strBuff.append("\""); 1900 } 1901 strBuff.append(writeFeatures(annot.getFeatures(),includeNamespace)); 1902 strBuff.append("/>"); 1903 1904 return strBuff.toString(); 1905 }// writeEmptyTag() 1906 1907 /** Returns a string representing an end tag based on the input annot*/ 1908 private String writeEndTag(Annotation annot){ 1909 StringBuffer strBuff = new StringBuffer(""); 1910 if (annot == null) return strBuff.toString(); 1911/* 1912 if (annot.getType().indexOf(" ") != -1) 1913 Out.prln("Warning: Truncating end tag to first word for annot type \"" 1914 +annot.getType()+ "\". "); 1915*/ 1916 strBuff.append("</"+annot.getType()+">"); 1917 1918 return strBuff.toString(); 1919 }// writeEndTag() 1920 1921 /** Returns a string representing a FeatureMap serialized as XML attributes*/ 1922 private String writeFeatures(FeatureMap feat, boolean includeNamespace){ 1923 StringBuffer strBuff = new StringBuffer(""); 1924 if (feat == null) return strBuff.toString(); 1925 Iterator it = feat.keySet().iterator(); 1926 while (it.hasNext()){ 1927 Object key = it.next(); 1928 Object value = feat.get(key); 1929 if ( (key != null) && (value != null) ){ 1930 // Eliminate a feature inserted at reading time and which help to 1931 // take some decissions at saving time 1932 if ("isEmptyAndSpan".equals(key.toString())) 1933 continue; 1934 if( !(String.class.isAssignableFrom(key.getClass()) || 1935 Number.class.isAssignableFrom(key.getClass()))){ 1936 1937 Out.prln("Warning:Found a feature NAME("+key+") that doesn't came"+ 1938 " from String or Number.(feature discarded)"); 1939 continue; 1940 }// End if 1941 if ( !(String.class.isAssignableFrom(value.getClass()) || 1942 Number.class.isAssignableFrom(value.getClass()) || 1943 java.util.Collection.class.isAssignableFrom(value.getClass()))){ 1944 1945 Out.prln("Warning:Found a feature VALUE("+value+") that doesn't came"+ 1946 " from String, Number or Collection.(feature discarded)"); 1947 continue; 1948 }// End if 1949 if ("matches".equals(key)) { 1950 strBuff.append(" "); 1951 if(includeNamespace) { 1952 strBuff.append("gate:"); 1953 } 1954// strBuff.append(key); 1955 // replace non XML chars in attribute name 1956 strBuff.append( 1957 filterNonXmlChars(replaceCharsWithEntities(key.toString()))); 1958 strBuff.append("=\""); 1959 } 1960 else { 1961 strBuff.append(" "); 1962// strBuff.append(key); 1963 // replace non XML chars in attribute name 1964 strBuff.append( 1965 filterNonXmlChars(replaceCharsWithEntities(key.toString()))); 1966 strBuff.append("=\""); 1967 } 1968 if (java.util.Collection.class.isAssignableFrom(value.getClass())){ 1969 Iterator valueIter = ((Collection)value).iterator(); 1970 while(valueIter.hasNext()){ 1971 Object item = valueIter.next(); 1972 if (!(String.class.isAssignableFrom(item.getClass()) || 1973 Number.class.isAssignableFrom(item.getClass()))) 1974 continue; 1975// strBuff.append(item); 1976 // replace non XML chars in collection item 1977 strBuff.append( 1978 filterNonXmlChars(replaceCharsWithEntities(item.toString()))); 1979 strBuff.append(";"); 1980 }// End while 1981 if (strBuff.charAt(strBuff.length()-1) == ';') 1982 strBuff.deleteCharAt(strBuff.length()-1); 1983 }else{ 1984// strBuff.append(value); 1985 // replace non XML chars in attribute value 1986 strBuff.append( 1987 filterNonXmlChars(replaceCharsWithEntities(value.toString()))); 1988 }// End if 1989 strBuff.append("\""); 1990 }// End if 1991 }// End while 1992 return strBuff.toString(); 1993 }// writeFeatures() 1994 1995 /** Returns a GateXml document that is a custom XML format for wich there is 1996 * a reader inside GATE called gate.xml.GateFormatXmlHandler. 1997 * What it does is to serialize a GATE document in an XML format. 1998 * @return a string representing a Gate Xml document. 1999 */ 2000 public String toXml(){ 2001 // Initialize the xmlContent with 3 time the size of the current document. 2002 // This is because of the tags size. This measure is made to increase the 2003 // performance of StringBuffer. 2004 StringBuffer xmlContent = new StringBuffer( 2005 DOC_SIZE_MULTIPLICATION_FACTOR*(getContent().size().intValue())); 2006 // Add xml header 2007 xmlContent.append("<?xml version=\"1.0\" encoding=\""); 2008 xmlContent.append(getEncoding()); 2009 xmlContent.append("\" ?>"); 2010 xmlContent.append(Strings.getNl()); 2011 2012 // Add the root element 2013 xmlContent.append("<GateDocument>\n"); 2014 xmlContent.append("<!-- The document's features-->\n\n"); 2015 xmlContent.append("<GateDocumentFeatures>\n"); 2016 2017 xmlContent.append(featuresToXml(this.getFeatures())); 2018 xmlContent.append("</GateDocumentFeatures>\n"); 2019 xmlContent.append("<!-- The document content area with serialized"+ 2020 " nodes -->\n\n"); 2021 // Add plain text element 2022 xmlContent.append("<TextWithNodes>"); 2023 xmlContent.append(textWithNodes(this.getContent().toString())); 2024 xmlContent.append("</TextWithNodes>\n"); 2025 // Serialize as XML all document's annotation sets 2026 // Serialize the default AnnotationSet 2027 StatusListener sListener = (StatusListener) 2028 gate.gui.MainFrame.getListeners(). 2029 get("gate.event.StatusListener"); 2030 if(sListener != null) 2031 sListener.statusChanged("Saving the default annotation set "); 2032 xmlContent.append("<!-- The default annotation set -->\n\n"); 2033 xmlContent.append(annotationSetToXml(this.getAnnotations())); 2034 // Serialize all others AnnotationSets 2035 // namedAnnotSets is a Map containing all other named Annotation Sets. 2036 if (namedAnnotSets != null){ 2037 Iterator iter = namedAnnotSets.values().iterator(); 2038 while(iter.hasNext()){ 2039 AnnotationSet annotSet = (AnnotationSet) iter.next(); 2040 xmlContent.append("<!-- Named annotation set -->\n\n"); 2041 // Serialize it as XML 2042 if(sListener != null) sListener.statusChanged("Saving " + 2043 annotSet.getName()+ 2044 " annotation set "); 2045 xmlContent.append(annotationSetToXml(annotSet)); 2046 }// End while 2047 }// End if 2048 // Add the end of GateDocument 2049 xmlContent.append("</GateDocument>"); 2050 if(sListener != null) sListener.statusChanged("Done !"); 2051 // return the XmlGateDocument 2052 return xmlContent.toString(); 2053 }// toXml 2054 2055 /** This method filters any non XML char 2056 * see: http://www.w3c.org/TR/2000/REC-xml-20001006#charsets 2057 * All non XML chars will be replaced with 0x20 (space char) This assures 2058 * that the next time the document is loaded there won't be any problems. 2059 * @param aStrBuffer represents the input String that is filtred. If the 2060 * aStrBuffer is null then an empty string will be returend 2061 * @return the "purified" StringBuffer version of the aStrBuffer 2062 */ 2063 private StringBuffer filterNonXmlChars(StringBuffer aStrBuffer){ 2064 if (aStrBuffer == null) return new StringBuffer(""); 2065// String space = new String(" "); 2066 char space = ' '; 2067 for (int i=aStrBuffer.length()-1;i>=0; i--){ 2068 if (!isXmlChar(aStrBuffer.charAt(i))) 2069 aStrBuffer.setCharAt(i, space); 2070 }// End for 2071 return aStrBuffer; 2072 }// filterNonXmlChars() 2073 2074 /** This method decide if a char is a valid XML one or not 2075 * @param ch the char to be tested 2076 * @return true if is a valid XML char and fals if is not. 2077 */ 2078 public static boolean isXmlChar(char ch){ 2079 if (ch == 0x9 || ch == 0xA || ch ==0xD) return true; 2080 if ((0x20 <= ch) && (ch <= 0xD7FF)) return true; 2081 if ((0xE000 <= ch) && (ch <= 0xFFFD)) return true; 2082 if ((0x10000 <= ch) && (ch <= 0x10FFFF)) return true; 2083 return false; 2084 }// End isXmlChar() 2085 2086 /** This method saves a FeatureMap as XML elements. 2087 * @ param aFeatureMap the feature map that has to be saved as XML. 2088 * @ return a String like this: <Feature><Name>...</Name> 2089 * <Value>...</Value></Feature><Feature>...</Feature> 2090 */ 2091 private String featuresToXml(FeatureMap aFeatureMap){ 2092 StringBuffer str = new StringBuffer(""); 2093 2094 if (aFeatureMap == null) return str.toString(); 2095 2096 Set keySet = aFeatureMap.keySet(); 2097 Iterator keyIterator = keySet.iterator(); 2098 while(keyIterator.hasNext()){ 2099 Object key = keyIterator.next(); 2100 Object value = aFeatureMap.get(key); 2101 if ((key != null) && (value != null)){ 2102 String keyClassName = null; 2103 String keyItemClassName = null; 2104 String valueClassName = null; 2105 String valueItemClassName = null; 2106 String key2String = key.toString(); 2107 String value2String = value.toString(); 2108 2109 Object item = null; 2110 // Test key if it is String, Number or Collection 2111 if (key instanceof java.lang.String || 2112 key instanceof java.lang.Number || 2113 key instanceof java.util.Collection) 2114 keyClassName = key.getClass().getName(); 2115 2116 // Test value if it is String, Number or Collection 2117 if (value instanceof java.lang.String || 2118 value instanceof java.lang.Number || 2119 value instanceof java.util.Collection) 2120 valueClassName = value.getClass().getName(); 2121 2122 // Features and values that are not Strings, Numbers or collections 2123 // will be discarded. 2124 if (keyClassName == null || valueClassName == null) continue; 2125 2126 // If key is collection serialize the colection in a specific format 2127 if (key instanceof java.util.Collection){ 2128 StringBuffer keyStrBuff = new StringBuffer(""); 2129 Iterator iter = ((Collection) key).iterator(); 2130 if (iter.hasNext()){ 2131 item = iter.next(); 2132 if (item instanceof java.lang.Number) 2133 keyItemClassName = item.getClass().getName(); 2134 else 2135 keyItemClassName = String.class.getName(); 2136 keyStrBuff.append(item.toString()); 2137 }// End if 2138 while (iter.hasNext()){ 2139 item = iter.next(); 2140 keyStrBuff.append(";" + item.toString()); 2141 }// End while 2142 key2String = keyStrBuff.toString(); 2143 }// End if 2144 // If key is collection serialize the colection in a specific format 2145 if (value instanceof java.util.Collection){ 2146 StringBuffer valueStrBuff = new StringBuffer(""); 2147 Iterator iter = ((Collection) value).iterator(); 2148 if (iter.hasNext()){ 2149 item = iter.next(); 2150 if (item instanceof java.lang.Number) 2151 valueItemClassName = item.getClass().getName(); 2152 else 2153 valueItemClassName = String.class.getName(); 2154 valueStrBuff.append(item.toString()); 2155 }// End if 2156 while (iter.hasNext()){ 2157 item = iter.next(); 2158 valueStrBuff.append(";" + item.toString()); 2159 }// End while 2160 value2String = valueStrBuff.toString(); 2161 }// End if 2162 str.append("<Feature>\n <Name"); 2163 if (keyClassName != null) 2164 str.append(" className=\""+keyClassName+"\""); 2165 if (keyItemClassName != null) 2166 str.append(" itemClassName=\""+keyItemClassName+"\""); 2167 str.append(">"); 2168 str.append(filterNonXmlChars(replaceCharsWithEntities(key2String))); 2169 str.append("</Name>\n <Value"); 2170 if (valueClassName != null) 2171 str.append(" className=\"" + valueClassName + "\""); 2172 if (valueItemClassName != null) 2173 str.append(" itemClassName=\"" + valueItemClassName + "\""); 2174 str.append(">"); 2175 str.append(filterNonXmlChars(replaceCharsWithEntities(value2String))); 2176 str.append("</Value>\n</Feature>\n"); 2177 }// End if 2178 }// end While 2179 return str.toString(); 2180 }//featuresToXml 2181 2182 /** This method replace all chars that appears in the anInputString and also 2183 * that are in the entitiesMap with their corresponding entity 2184 * @param anInputString the string analyzed. If it is null then returns the 2185 * empty string 2186 * @return a string representing the input string with chars replaced with 2187 * entities 2188 */ 2189 private StringBuffer replaceCharsWithEntities(String anInputString){ 2190 if (anInputString == null) return new StringBuffer(""); 2191 StringBuffer strBuff = new StringBuffer(anInputString); 2192 for (int i=strBuff.length()-1; i>=0; i--){ 2193 Character ch = new Character(strBuff.charAt(i)); 2194 if (entitiesMap.keySet().contains(ch)){ 2195 strBuff.replace(i,i+1,(String) entitiesMap.get(ch)); 2196 }// End if 2197 }// End for 2198 return strBuff; 2199 }//replaceCharsWithEntities() 2200 2201 /** This method creates Node XML elements and inserts them at the 2202 * corresponding offset inside the text. Nodes are created from the default 2203 * annotation set, as well as from all existing named annotation sets. 2204 * @param aText The text representing the document's plain text. 2205 * @return The text with empty <Node id="NodeId"/> elements. 2206 */ 2207 private String textWithNodes(String aText){ 2208 if (aText == null) return new String(""); 2209 StringBuffer textWithNodes = filterNonXmlChars(new StringBuffer(aText)); 2210 2211 // Construct a map from offsets to Chars 2212 TreeMap offsets2CharsMap = new TreeMap(); 2213 if (aText.length()!= 0){ 2214 // Fill the offsets2CharsMap with all the indices where special chars appear 2215 buildEntityMapFromString(aText,offsets2CharsMap); 2216 }//End if 2217 // Construct the offsetsSet for all nodes belonging to this document 2218 TreeSet offsetsSet = new TreeSet(); 2219 Iterator annotSetIter = this.getAnnotations().iterator(); 2220 while (annotSetIter.hasNext()){ 2221 Annotation annot = (Annotation) annotSetIter.next(); 2222 offsetsSet.add(annot.getStartNode().getOffset()); 2223 offsetsSet.add(annot.getEndNode().getOffset()); 2224 }// end While 2225 // Get the nodes from all other named annotation sets. 2226 if (namedAnnotSets != null){ 2227 Iterator iter = namedAnnotSets.values().iterator(); 2228 while(iter.hasNext()){ 2229 AnnotationSet annotSet = (AnnotationSet) iter.next(); 2230 Iterator iter2 = annotSet.iterator(); 2231 while(iter2.hasNext()){ 2232 Annotation annotTmp = (Annotation) iter2.next(); 2233 offsetsSet.add(annotTmp.getStartNode().getOffset()); 2234 offsetsSet.add(annotTmp.getEndNode().getOffset()); 2235 }// End while 2236 }// End while 2237 }// End if 2238 // offsetsSet is ordered in ascending order because the structure 2239 // is a TreeSet 2240 2241 if (offsetsSet.isEmpty()){ 2242 return replaceCharsWithEntities(aText).toString(); 2243 }// End if 2244 // Iterate through all nodes from anAnnotSet and transform them to 2245 // XML elements. Then insert those elements at the node's offset into the 2246 // textWithNodes . 2247 while (!offsetsSet.isEmpty()){ 2248 Long offset = (Long) offsetsSet.last(); 2249 // Eliminate the offset from the list in order to create more memory space 2250 offsetsSet.remove(offset); 2251 // Use offset 2252 int offsetValue = offset.intValue(); 2253 String strNode = "<Node id=\"" + offsetValue + "\"/>"; 2254 // Before inserting this string into the textWithNodes, check to see if 2255 // there are any chars to be replaced with their corresponding entities 2256 if (!offsets2CharsMap.isEmpty()){ 2257 Long offsChar = (Long) offsets2CharsMap.lastKey(); 2258 while( !offsets2CharsMap.isEmpty() && 2259 offsChar.intValue() >= offset.intValue()){ 2260 // Replace the char at offsChar with its corresponding entity form 2261 // the entitiesMap. 2262 textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1, 2263 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); 2264 // Discard the offsChar after it was used because this offset will 2265 // never appear again 2266 offsets2CharsMap.remove(offsChar); 2267 // Investigate next offsChar 2268 if (!offsets2CharsMap.isEmpty()) 2269 offsChar = (Long) offsets2CharsMap.lastKey(); 2270 }// End while 2271 }// End if 2272 // Now it is safe to insert the node 2273 textWithNodes.insert(offsetValue,strNode); 2274 }// end while 2275 // Need to replace the entities in the remaining text, if there is any text 2276 // So, if there are any more items in offsets2CharsMap they need to be 2277 // replaced 2278 while (!offsets2CharsMap.isEmpty()){ 2279 Long offsChar = (Long) offsets2CharsMap.lastKey(); 2280 // Replace the char with its entity 2281 textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1, 2282 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); 2283 // remove the offset from the map 2284 offsets2CharsMap.remove(offsChar); 2285 }// End while 2286 return textWithNodes.toString(); 2287 }//textWithNodes() 2288 2289 /** This method saves an AnnotationSet as XML. 2290 * @param anAnnotationSet The annotation set that has to be saved as XML. 2291 * @return a String like this: <AnnotationSet> <Annotation>.... 2292 * </AnnotationSet> 2293 */ 2294 private String annotationSetToXml(AnnotationSet anAnnotationSet){ 2295 StringBuffer str = new StringBuffer(""); 2296 2297 if (anAnnotationSet == null){ 2298 str.append("<AnnotationSet>\n"); 2299 str.append("</AnnotationSet>\n"); 2300 return str.toString(); 2301 }// End if 2302 if (anAnnotationSet.getName() == null) 2303 str.append("<AnnotationSet>\n"); 2304 else str.append("<AnnotationSet Name=\"" + anAnnotationSet.getName()+ 2305 "\" >\n"); 2306 // Iterate through AnnotationSet and save each Annotation as XML 2307 Iterator iterator = anAnnotationSet.iterator(); 2308 while (iterator.hasNext()){ 2309 Annotation annot = (Annotation) iterator.next(); 2310 str.append("<Annotation " + "Type=\"" + annot.getType() + 2311 "\" StartNode=\"" + annot.getStartNode().getOffset() + 2312 "\" EndNode=\"" + annot.getEndNode().getOffset() + "\">\n"); 2313 str.append(featuresToXml(annot.getFeatures())); 2314 str.append("</Annotation>\n"); 2315 }// End while 2316 2317 str.append("</AnnotationSet>\n"); 2318 return str.toString(); 2319 }// annotationSetToXml 2320 2321 /** Returns a map with the named annotation sets. It returns <code>null</code> 2322 * if no named annotaton set exists. */ 2323 public Map getNamedAnnotationSets() { 2324 return namedAnnotSets; 2325 } // getNamedAnnotationSets 2326 2327 /** 2328 * Removes one of the named annotation sets. 2329 * Note that the default annotation set cannot be removed. 2330 * @param name the name of the annotation set to be removed 2331 */ 2332 public void removeAnnotationSet(String name){ 2333 Object removed = namedAnnotSets.remove(name); 2334 if(removed != null){ 2335 fireAnnotationSetRemoved( 2336 new DocumentEvent(this, DocumentEvent.ANNOTATION_SET_REMOVED, name)); 2337 } 2338 } 2339 2340 /** Propagate edit changes to the document content and annotations. */ 2341 public void edit(Long start, Long end, DocumentContent replacement) 2342 throws InvalidOffsetException 2343 { 2344 if(! isValidOffsetRange(start, end)) 2345 throw new InvalidOffsetException(); 2346 2347 if(content != null) 2348 ((DocumentContentImpl) content).edit(start, end, replacement); 2349 2350 if(defaultAnnots != null) 2351 ((AnnotationSetImpl) defaultAnnots).edit(start, end, replacement); 2352 2353 if(namedAnnotSets != null) { 2354 Iterator iter = namedAnnotSets.values().iterator(); 2355 while(iter.hasNext()) 2356 ((AnnotationSetImpl) iter.next()).edit(start, end, replacement); 2357 } 2358 2359 } // edit(start,end,replacement) 2360 2361 /** Check that an offset is valid, i.e. it is non-null, greater than 2362 * or equal to 0 and less than the size of the document content. 2363 */ 2364 public boolean isValidOffset(Long offset) { 2365 if(offset == null) 2366 return false; 2367 2368 long o = offset.longValue(); 2369 if(o > getContent().size().longValue() || o < 0) 2370 return false; 2371 2372 return true; 2373 } // isValidOffset 2374 2375 /** Check that both start and end are valid offsets and that 2376 * they constitute a valid offset range, i.e. start is greater 2377 * than or equal to long. 2378 */ 2379 public boolean isValidOffsetRange(Long start, Long end) { 2380 return 2381 isValidOffset(start) && isValidOffset(end) && 2382 start.longValue() <= end.longValue(); 2383 } // isValidOffsetRange(start,end) 2384 2385 /** Sets the nextAnnotationId */ 2386 public void setNextAnnotationId(int aNextAnnotationId){ 2387 nextAnnotationId = aNextAnnotationId; 2388 }// setNextAnnotationId(); 2389 2390 /** Generate and return the next annotation ID */ 2391 public Integer getNextAnnotationId() { 2392 return new Integer(nextAnnotationId++); 2393 } // getNextAnnotationId 2394 2395 /** Generate and return the next node ID */ 2396 public Integer getNextNodeId() { return new Integer(nextNodeId++); } 2397 2398 /** Ordering based on URL.toString() and the URL offsets (if any) */ 2399 public int compareTo(Object o) throws ClassCastException { 2400 DocumentImpl other = (DocumentImpl) o; 2401 return getOrderingString().compareTo(other.getOrderingString()); 2402 } // compareTo 2403 2404 /** Utility method to produce a string for comparison in ordering. 2405 * String is based on the source URL and offsets. 2406 */ 2407 protected String getOrderingString() { 2408 if(sourceUrl == null) return toString(); 2409 2410 StringBuffer orderingString = new StringBuffer(sourceUrl.toString()); 2411 if(sourceUrlStartOffset != null && sourceUrlEndOffset != null) { 2412 orderingString.append(sourceUrlStartOffset.toString()); 2413 orderingString.append(sourceUrlEndOffset.toString()); 2414 } 2415 2416 return orderingString.toString(); 2417 } // getOrderingString() 2418 2419 /** The id of the next new annotation */ 2420 protected int nextAnnotationId = 0; 2421 2422 /** The id of the next new node */ 2423 protected int nextNodeId = 0; 2424 /** The source URL */ 2425 protected URL sourceUrl; 2426 2427 /** The document's URL name. */ 2428 2429 /** The content of the document */ 2430 protected DocumentContent content; 2431 2432 /** The encoding of the source of the document content */ 2433 protected String encoding = null; 2434 2435 // Data needed in toXml(AnnotationSet) methos 2436 2437 /** This field indicates whether or not to add the tag 2438 * called GatePreserveFormat to the document. HTML, XML, SGML docs won't 2439 * have this tag added 2440 */ 2441// private boolean addGatePreserveFormatTag = false; 2442 2443 /** 2444 * Used by the XML dump preserving format method 2445 */ 2446 private Annotation theRootAnnotation = null; 2447 2448 /** This field is used when creating StringBuffers for toXml() methods. 2449 * The size of the StringBuffer will be docDonctent.size() multiplied by this 2450 * value. It is aimed to improve the performance of StringBuffer 2451 */ 2452 private final int DOC_SIZE_MULTIPLICATION_FACTOR = 2; 2453 2454 /** Constant used in the inner class AnnotationComparator to order 2455 * annotations on their start offset 2456 */ 2457 private final int ORDER_ON_START_OFFSET = 0; 2458 /** Constant used in the inner class AnnotationComparator to order 2459 * annotations on their end offset 2460 */ 2461 private final int ORDER_ON_END_OFFSET = 1; 2462 /** Constant used in the inner class AnnotationComparator to order 2463 * annotations on their ID 2464 */ 2465 private final int ORDER_ON_ANNOT_ID = 2; 2466 /** Constant used in the inner class AnnotationComparator to order 2467 * annotations ascending 2468 */ 2469 private final int ASC = 3; 2470 /** Constant used in the inner class AnnotationComparator to order 2471 * annotations descending 2472 */ 2473 private final int DESC = -3; 2474 2475 /** A map initialized in init() containing entities that needs to be 2476 * replaced in strings 2477 */ 2478 private static Map entitiesMap = null; 2479 // Initialize the entities map use when saving as xml 2480 static{ 2481 entitiesMap = new HashMap(); 2482 entitiesMap.put(new Character('<'),"<"); 2483 entitiesMap.put(new Character('>'),">"); 2484 entitiesMap.put(new Character('&'),"&"); 2485 entitiesMap.put(new Character('\''),"'"); 2486 entitiesMap.put(new Character('"'),"""); 2487 entitiesMap.put(new Character((char)160)," "); 2488 entitiesMap.put(new Character((char)169),"©"); 2489 }//static 2490 2491 /** The range that the content comes from at the source URL 2492 * (or null if none). 2493 */ 2494 //protected Long[] sourceUrlOffsets; 2495 2496 /** The start of the range that the content comes from at the source URL 2497 * (or null if none). 2498 */ 2499 protected Long sourceUrlStartOffset; 2500 2501 /** The end of the range that the content comes from at the source URL 2502 * (or null if none). 2503 */ 2504 protected Long sourceUrlEndOffset; 2505 2506 /** The default annotation set */ 2507 protected AnnotationSet defaultAnnots; 2508 2509 /** Named sets of annotations */ 2510 protected Map namedAnnotSets; 2511 2512 /** 2513 * A property of the document that will be set when the user 2514 * wants to create the document from a string, as opposed to from 2515 * a URL. 2516 */ 2517 private String stringContent; 2518 2519 /** 2520 * The stringContent of a document is 2521 * a property of the document that will be set when the user 2522 * wants to create the document from a string, as opposed to from 2523 * a URL. 2524 * <B>Use the <TT>getContent</TT> method instead to get the actual document 2525 * content.</B> 2526 */ 2527 public String getStringContent() { return stringContent; } 2528 2529 /** 2530 * The stringContent of a document is 2531 * a property of the document that will be set when the user 2532 * wants to create the document from a string, as opposed to from 2533 * a URL. 2534 * <B>Use the <TT>setContent</TT> method instead to update the actual 2535 * document content.</B> 2536 */ 2537 public void setStringContent(String stringContent) { 2538 this.stringContent = stringContent; 2539 } // set StringContent 2540 2541 /** Is the document markup-aware? */ 2542 protected Boolean markupAware = new Boolean(false); 2543// /** Hash code */ 2544// public int hashCode() { 2545// int code = getContent().hashCode(); 2546// int memberCode = (defaultAnnots == null) ? 0 : defaultAnnots.hashCode(); 2547// code += memberCode; 2548// memberCode = (encoding == null) ? 0 : encoding.hashCode(); 2549// code += memberCode; 2550// memberCode = (features == null) ? 0 : features.hashCode(); 2551// code += memberCode; 2552// code += (markupAware.booleanValue()) ? 0 : 1; 2553// memberCode = (namedAnnotSets == null) ? 0 : namedAnnotSets.hashCode(); 2554// code += memberCode; 2555// code += nextAnnotationId; 2556// code += nextNodeId; 2557// memberCode = (sourceUrl == null) ? 0 : sourceUrl.hashCode(); 2558// code += memberCode; 2559// memberCode = 2560// (sourceUrlStartOffset == null) ? 0 : sourceUrlStartOffset.hashCode(); 2561// code += memberCode; 2562// memberCode = 2563// (sourceUrlEndOffset == null) ? 0 : sourceUrlEndOffset.hashCode(); 2564// code += memberCode; 2565// return code; 2566// } // hashcode 2567 2568 /** String respresentation */ 2569 public String toString() { 2570 String n = Strings.getNl(); 2571 StringBuffer s = new StringBuffer("DocumentImpl: " + n); 2572 s.append(" content:" + content + n); 2573 s.append(" defaultAnnots:" + defaultAnnots + n); 2574 s.append(" encoding:" + encoding + n); 2575 s.append(" features:" + features + n); 2576 s.append(" markupAware:" + markupAware + n); 2577 s.append(" namedAnnotSets:" + namedAnnotSets + n); 2578 s.append(" nextAnnotationId:" + nextAnnotationId + n); 2579 s.append(" nextNodeId:" + nextNodeId + n); 2580 s.append(" sourceUrl:" + sourceUrl + n); 2581 s.append(" sourceUrlStartOffset:" + sourceUrlStartOffset + n); 2582 s.append(" sourceUrlEndOffset:" + sourceUrlEndOffset + n); 2583 s.append(n); 2584 2585 return s.toString(); 2586 } // toString 2587 2588 /** Freeze the serialization UID. */ 2589 static final long serialVersionUID = -8456893608311510260L; 2590 2591 /** Inner class needed to compare annotations*/ 2592 class AnnotationComparator implements java.util.Comparator { 2593 int orderOn = -1; 2594 int orderType = ASC; 2595 /** Constructs a comparator according to one of three sorter types: 2596 * ORDER_ON_ANNOT_TYPE, ORDER_ON_END_OFFSET, ORDER_ON_START_OFFSET 2597 */ 2598 public AnnotationComparator(int anOrderOn, int anOrderType){ 2599 orderOn = anOrderOn; 2600 orderType = anOrderType; 2601 }// AnnotationComparator() 2602 2603 /**This method must be implemented according to Comparator interface */ 2604 public int compare(Object o1, Object o2){ 2605 Annotation a1 = (Annotation) o1; 2606 Annotation a2 = (Annotation) o2; 2607 // ORDER_ON_START_OFFSET ? 2608 if (orderOn == ORDER_ON_START_OFFSET){ 2609 int result = a1.getStartNode().getOffset().compareTo( 2610 a2.getStartNode().getOffset()); 2611 if (orderType == ASC){ 2612 // ASC 2613 // If they are equal then their ID will decide. 2614 if (result == 0) 2615 return a1.getId().compareTo(a2.getId()); 2616 return result; 2617 }else{ 2618 // DESC 2619 if (result == 0) 2620 return - (a1.getId().compareTo(a2.getId())); 2621 return -result; 2622 }// End if (orderType == ASC) 2623 }// End if (orderOn == ORDER_ON_START_OFFSET) 2624 2625 // ORDER_ON_END_OFFSET ? 2626 if (orderOn == ORDER_ON_END_OFFSET){ 2627 int result = a1.getEndNode().getOffset().compareTo( 2628 a2.getEndNode().getOffset()); 2629 if (orderType == ASC){ 2630 // ASC 2631 // If they are equal then their ID will decide. 2632 if (result == 0) 2633 return - (a1.getId().compareTo(a2.getId())); 2634 return result; 2635 }else{ 2636 // DESC 2637 // If they are equal then their ID will decide. 2638 if (result == 0) 2639 return a1.getId().compareTo(a2.getId()); 2640 return - result; 2641 }// End if (orderType == ASC) 2642 }// End if (orderOn == ORDER_ON_END_OFFSET) 2643 2644 // ORDER_ON_ANNOT_ID ? 2645 if (orderOn == ORDER_ON_ANNOT_ID){ 2646 if (orderType == ASC) 2647 return a1.getId().compareTo(a2.getId()); 2648 else 2649 return -(a1.getId().compareTo(a2.getId())); 2650 }// End if 2651 return 0; 2652 }//compare() 2653 } // End inner class AnnotationComparator 2654 2655 2656 private transient Vector documentListeners; 2657 private transient Vector gateListeners; 2658 2659 public synchronized void removeDocumentListener(DocumentListener l) { 2660 if (documentListeners != null && documentListeners.contains(l)) { 2661 Vector v = (Vector) documentListeners.clone(); 2662 v.removeElement(l); 2663 documentListeners = v; 2664 } 2665 } 2666 public synchronized void addDocumentListener(DocumentListener l) { 2667 Vector v = documentListeners == null ? new Vector(2) : (Vector) documentListeners.clone(); 2668 if (!v.contains(l)) { 2669 v.addElement(l); 2670 documentListeners = v; 2671 } 2672 } 2673 2674 protected void fireAnnotationSetAdded(DocumentEvent e) { 2675 if (documentListeners != null) { 2676 Vector listeners = documentListeners; 2677 int count = listeners.size(); 2678 for (int i = 0; i < count; i++) { 2679 ((DocumentListener) listeners.elementAt(i)).annotationSetAdded(e); 2680 } 2681 } 2682 } 2683 2684 protected void fireAnnotationSetRemoved(DocumentEvent e) { 2685 if (documentListeners != null) { 2686 Vector listeners = documentListeners; 2687 int count = listeners.size(); 2688 for (int i = 0; i < count; i++) { 2689 ((DocumentListener) listeners.elementAt(i)).annotationSetRemoved(e); 2690 } 2691 } 2692 } 2693 public void resourceLoaded(CreoleEvent e) { 2694 } 2695 public void resourceUnloaded(CreoleEvent e) { 2696 } 2697 public void datastoreOpened(CreoleEvent e) { 2698 } 2699 public void datastoreCreated(CreoleEvent e) { 2700 } 2701 public void resourceRenamed(Resource resource, String oldName, 2702 String newName){ 2703 } 2704 public void datastoreClosed(CreoleEvent e) { 2705 if (! e.getDatastore().equals(this.getDataStore())) 2706 return; 2707 //close this lr, since it cannot stay open when the DS it comes from 2708 //is closed 2709 Factory.deleteResource(this); 2710 } 2711 public void setLRPersistenceId(Object lrID) { 2712 super.setLRPersistenceId( lrID); 2713 //make persistent documents listen to the creole register 2714 //for events about their DS 2715 Gate.getCreoleRegister().addCreoleListener(this); 2716 } 2717 public void resourceAdopted(DatastoreEvent evt) { 2718 } 2719 public void resourceDeleted(DatastoreEvent evt) { 2720 if(! evt.getSource().equals(this.getDataStore())) 2721 return; 2722 //if an open document is deleted from a DS, then 2723 //it must close itself immediately, as is no longer valid 2724 if(evt.getResourceID().equals(this.getLRPersistenceId())) 2725 Factory.deleteResource(this); 2726 } 2727 public void resourceWritten(DatastoreEvent evt) { 2728 } 2729 public void setDataStore(DataStore dataStore) throws gate.persist.PersistenceException { 2730 super.setDataStore( dataStore); 2731 if (this.dataStore != null) 2732 this.dataStore.addDatastoreListener(this); 2733 } 2734 2735} // class DocumentImpl 2736
|
DocumentImpl |
|