1
15
16 package gate.corpora;
17
18 import java.io.IOException;
19 import java.net.URL;
20 import java.util.*;
21
22 import gate.*;
23 import gate.annotation.AnnotationSetImpl;
24 import gate.creole.AbstractLanguageResource;
25 import gate.creole.ResourceInstantiationException;
26 import gate.event.*;
27 import gate.util.*;
28
29
131 public class DocumentImpl
132 extends AbstractLanguageResource implements TextualDocument, CreoleListener,
133 DatastoreListener {
134
135 private static final boolean DEBUG = false;
136
137
140 private Boolean preserveOriginalContent = new Boolean(false);
141
142
146 private Boolean collectRepositioningInfo = new Boolean(false);
147
148
153 private Annotation crossedOverAnnotation = null;
154
155
/**
 * Creates a document with an empty string content and an empty
 * DocumentContentImpl; the real content is set up later (see init()).
 */
public DocumentImpl() {
  stringContent = "";
  content = new DocumentContentImpl();
}
161
162 public FeatureMap getFeatures() {
163 if (features == null) {
164 features = new SimpleFeatureMapImpl();
165 }
166 return features;
167 }
168
169
170 public Resource init() throws ResourceInstantiationException {
171 if(sourceUrl == null) {
173 if(stringContent == null) {
174 throw new ResourceInstantiationException(
175 "The sourceURL and document's content were null."
176 );
177 }
178
179 content = new DocumentContentImpl(stringContent);
180 getFeatures().put("gate.SourceURL", "created from String");
181 } else {
182 try {
183 content = new DocumentContentImpl(
184 sourceUrl, getEncoding(), sourceUrlStartOffset, sourceUrlEndOffset);
185 getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm());
186 } catch(IOException e) {
187 e.printStackTrace();
188 throw new ResourceInstantiationException("DocumentImpl.init: " + e);
189 }
190
191 if(preserveOriginalContent.booleanValue() && content != null) {
192 String originalContent = new String(
193 ((DocumentContentImpl) content).getOriginalContent());
194 getFeatures().put(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME,
195 originalContent);
196 } }
198
199 if(getMarkupAware().booleanValue()) {
201 DocumentFormat docFormat =
202 DocumentFormat.getDocumentFormat(this, sourceUrl);
203 try {
204 if(docFormat != null){
205 StatusListener sListener = (StatusListener)
206 gate.gui.MainFrame.getListeners().
207 get("gate.event.StatusListener");
208 if(sListener != null) docFormat.addStatusListener(sListener);
209
210 docFormat.setShouldCollectRepositioning(collectRepositioningInfo);
212
213 if(docFormat.getShouldCollectRepositioning().booleanValue()) {
214 RepositioningInfo info = new RepositioningInfo();
216
217 String origContent = (String) getFeatures().get(
218 GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
219
220 RepositioningInfo ampCodingInfo = new RepositioningInfo();
221 if(origContent != null) {
222 boolean shouldCorrectCR = docFormat instanceof XmlDocumentFormat;
223 collectInformationForAmpCodding(origContent, ampCodingInfo,
224 shouldCorrectCR);
225 if(docFormat instanceof HtmlDocumentFormat) {
226 collectInformationForWS(origContent, ampCodingInfo);
227 } }
230 docFormat.unpackMarkup(this, info, ampCodingInfo);
231
232 if(origContent != null
233 && docFormat instanceof XmlDocumentFormat) {
234 correctRepositioningForCRLFInXML(origContent, info);
236 }
238 getFeatures().put(
239 GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME, info);
240 }
241 else {
242 docFormat.unpackMarkup(this);
244 }
245 docFormat.removeStatusListener(sListener);
246 } } catch(DocumentFormatException e) {
248 throw new ResourceInstantiationException(
249 "Couldn't unpack markup in document " + sourceUrl.toExternalForm() +
250 " " + e
251 );
252 }
253 }
255
264 return this;
265 }
267
270 private void correctRepositioningForCRLFInXML(String content,
271 RepositioningInfo info) {
272 int index = -1;
273
274 do {
275 index = content.indexOf("\r\n", index+1);
276 if(index != -1) {
277 info.correctInformationOriginalMove(index, 1);
278 } } while(index != -1);
280 }
282
295 private void collectInformationForAmpCodding(String content,
296 RepositioningInfo info,
297 boolean shouldCorrectCR) {
298
299 if(content == null || info == null) return;
300
301 int ampIndex = -1;
302 int semiIndex;
303
304 do {
305 ampIndex = content.indexOf('&', ampIndex+1);
306 if(ampIndex != -1) {
307 semiIndex = content.indexOf(';', ampIndex+1);
308 if(semiIndex != -1 && (semiIndex-ampIndex) < 8) {
310 info.addPositionInfo(ampIndex, semiIndex-ampIndex+1, 0, 1);
311 }
312 else {
313 int maxEnd = Math.min(ampIndex+8, content.length());
316 String ampCandidate = content.substring(ampIndex, maxEnd);
317 int ampCodingSize = analyseAmpCodding(ampCandidate);
318
319 if(ampCodingSize != -1) {
320 info.addPositionInfo(ampIndex, ampCodingSize, 0, 1);
321 }
323 } } } while (ampIndex != -1);
326
327 int index = -1;
330
331 if(shouldCorrectCR) {
332 do {
333 index = content.indexOf("\r\n", index+1);
334 if(index != -1) {
335 info.correctInformationOriginalMove(index, -1);
336 } } while(index != -1);
338 } }
341
345 private int analyseAmpCodding(String content) {
346 int result = -1;
347
348 try {
349 char ch = content.charAt(1);
350
351 switch(ch) {
352 case 'l' : case 'L' : if(content.charAt(2) == 't' || content.charAt(2) == 'T') {
355 result = 3;
356 } break;
358 case 'g' : case 'G' : if(content.charAt(2) == 't' || content.charAt(2) == 'T') {
361 result = 3;
362 } break;
364 case 'a' : case 'A' : if(content.substring(2, 4).equalsIgnoreCase("mp")) {
367 result = 4;
368 } break;
370 case 'q' : case 'Q' : if(content.substring(2, 5).equalsIgnoreCase("uot")) {
373 result = 5;
374 } break;
376 case '#' : int endIndex = 2;
378 boolean hexCoded = false;
379 if(content.charAt(2) == 'x' || content.charAt(2) == 'X') {
380 ++endIndex;
382 hexCoded = true;
383 }
385 while (endIndex < 8
386 && isNumber(content.charAt(endIndex), hexCoded) ) {
387 ++endIndex;
388 } result = endIndex;
390 break;
391 } } catch (StringIndexOutOfBoundsException ex) {
393 }
396 return result;
397 }
399
400 private boolean isNumber(char ch, boolean hex) {
401 if(ch >= '0' && ch <= '9') return true;
402
403 if(hex) {
404 if(ch >= 'A' && ch <= 'F') return true;
405 if(ch >= 'a' && ch <= 'f') return true;
406 }
408 return false;
409 }
411
417 private void collectInformationForWS(String content, RepositioningInfo info) {
418
419 if(content == null || info == null) return;
420
421 char ch;
423 int startWS, endWS;
424
425 startWS = endWS = -1;
426 int contentLength = content.length();
427
428 for(int i=0; i<contentLength; ++i) {
429 ch = content.charAt(i);
430
431 if(ch <= ' ') {
433 if(startWS == -1) {
434 startWS = i;
435 } endWS = i;
437 }
438 else {
439 if(endWS - startWS > 0) {
440 info.addPositionInfo(
442 (long)startWS, (long)(endWS - startWS + 1), 0, 1);
443 } startWS = endWS = -1;
446 } } }
450
451 public void cleanup() {
452
453 defaultAnnots = null;
454 if ( (namedAnnotSets != null) && (!namedAnnotSets.isEmpty()))
455 namedAnnotSets.clear();
456 if (DEBUG) Out.prln("Document cleanup called");
457 if (this.lrPersistentId != null)
458 Gate.getCreoleRegister().removeCreoleListener(this);
459 if(this.getDataStore() != null)
460 this.getDataStore().removeDatastoreListener(this);
461 }
463
464
465 public URL getSourceUrl() { return sourceUrl; }
466
467
468 public void setSourceUrl(URL sourceUrl) {
469 this.sourceUrl = sourceUrl;
470 }
472
475 public Long[] getSourceUrlOffsets() {
476 Long[] sourceUrlOffsets = new Long[2];
477 sourceUrlOffsets[0] = sourceUrlStartOffset;
478 sourceUrlOffsets[1] = sourceUrlEndOffset;
479 return sourceUrlOffsets;
480 }
482
487 public void setPreserveOriginalContent(Boolean b) {
488 preserveOriginalContent = b;
489 }
491
495 public Boolean getPreserveOriginalContent() {
496 return preserveOriginalContent;
497 }
499
507 public void setCollectRepositioningInfo(Boolean b) {
508 collectRepositioningInfo = b;
509 }
511
519 public Boolean getCollectRepositioningInfo() {
520 return collectRepositioningInfo;
521 }
523
527 public Long getSourceUrlStartOffset() { return sourceUrlStartOffset; }
528
529
533 public void setSourceUrlStartOffset(Long sourceUrlStartOffset) {
534 this.sourceUrlStartOffset = sourceUrlStartOffset;
535 }
537
541 public Long getSourceUrlEndOffset() { return sourceUrlEndOffset; }
542
543
547 public void setSourceUrlEndOffset(Long sourceUrlEndOffset) {
548 this.sourceUrlEndOffset = sourceUrlEndOffset;
549 }
551
552 public DocumentContent getContent() { return content; }
553
554
555 public void setContent(DocumentContent content) {
556 this.content = content;
557 this.stringContent = content.toString();
558 }
559
560
561 public String getEncoding() {
562 if(encoding == null || encoding.trim().length() == 0){
564 encoding = java.nio.charset.Charset.forName(
566 System.getProperty("file.encoding")).name();
567 }
568 return encoding;
569 }
570
571
572 public void setEncoding(String encoding) { this.encoding = encoding; }
573
574
577 public AnnotationSet getAnnotations() {
578 if(defaultAnnots == null){
579 defaultAnnots = new AnnotationSetImpl(this);
580 fireAnnotationSetAdded(new DocumentEvent(
581 this, DocumentEvent.ANNOTATION_SET_ADDED, null));
582 } return defaultAnnots;
584 }
586
590 public AnnotationSet getAnnotations(String name) {
591 if(name == null) return getAnnotations();
592 if(namedAnnotSets == null)
593 namedAnnotSets = new HashMap();
594 AnnotationSet namedSet = (AnnotationSet) namedAnnotSets.get(name);
595
596 if(namedSet == null) {
597 namedSet = new AnnotationSetImpl(this, name);
598 namedAnnotSets.put(name, namedSet);
599
600 DocumentEvent evt = new DocumentEvent(
601 this, DocumentEvent.ANNOTATION_SET_ADDED, name
602 );
603 fireAnnotationSetAdded(evt);
604 }
605 return namedSet;
606 }
608
615 public void setMarkupAware(Boolean newMarkupAware) {
616 this.markupAware = newMarkupAware;
617 }
618
619
623 public Boolean getMarkupAware() { return markupAware; }
624
625
631 public String toXml(Set aSourceAnnotationSet){
632 return toXml(aSourceAnnotationSet, true);
633 }
634
635
650 public String toXml(Set aSourceAnnotationSet, boolean includeFeatures){
651
652 if(hasOriginalContentFeatures()) {
653 return saveAnnotationSetAsXmlInOrig(aSourceAnnotationSet,includeFeatures);
654 }
656 AnnotationSet originalMarkupsAnnotSet =
657 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
658
659 List dumpingList = new ArrayList(originalMarkupsAnnotSet.size());
663
664
672 StatusListener sListener = (StatusListener)
673 gate.gui.MainFrame.getListeners().
674 get("gate.event.StatusListener");
675 if(sListener != null)
679 sListener.statusChanged("Constructing the dumping annotation set.");
680 dumpingList.addAll(originalMarkupsAnnotSet);
682 if (aSourceAnnotationSet != null){
686 Iterator iter = aSourceAnnotationSet.iterator();
687 while (iter.hasNext()){
688 Annotation currentAnnot = (Annotation) iter.next();
689 if(insertsSafety(dumpingList,currentAnnot)){
690 dumpingList.add(currentAnnot);
692 }else if (crossedOverAnnotation != null && DEBUG){
693 try {
694 Out.prln("Warning: Annotations were found to violate the " +
695 "crossed over condition: \n" +
696 "1. [" +
697 getContent().getContent(
698 crossedOverAnnotation.getStartNode().getOffset(),
699 crossedOverAnnotation.getEndNode().getOffset()) +
700 " (" + crossedOverAnnotation.getType() + ": " +
701 crossedOverAnnotation.getStartNode().getOffset() +
702 ";" + crossedOverAnnotation.getEndNode().getOffset() +
703 ")]\n" +
704 "2. [" +
705 getContent().getContent(
706 currentAnnot.getStartNode().getOffset(),
707 currentAnnot.getEndNode().getOffset()) +
708 " (" + currentAnnot.getType() + ": " +
709 currentAnnot.getStartNode().getOffset() +
710 ";" + currentAnnot.getEndNode().getOffset() +
711 ")]\nThe second one will be discarded.\n" );
712 } catch (gate.util.InvalidOffsetException ex) {
713 throw new GateRuntimeException(ex.getMessage());
714 }
715 } } }
719 Collections.sort(dumpingList, new gate.util.OffsetComparator());
721
722 if(sListener != null) sListener.statusChanged("Dumping annotations as XML");
725 StringBuffer xmlDoc = new StringBuffer(
726 DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
727
728 String mimeType = getFeatures() == null ?
730 null :
731 (String)getFeatures().get("MimeType");
732 boolean wasXML = mimeType != null && mimeType.equalsIgnoreCase("text/xml");
733
734 if(wasXML){
735 xmlDoc.append("<?xml version=\"1.0\" encoding=\"");
736 xmlDoc.append(getEncoding());
737 xmlDoc.append("\" ?>");
738 xmlDoc.append(Strings.getNl());
739 } theRootAnnotation = identifyTheRootAnnotation(dumpingList);
742 if (theRootAnnotation != null){
745 dumpingList.remove(theRootAnnotation);
746 xmlDoc.append(writeStartTag(theRootAnnotation,includeFeatures));
747 } xmlDoc.append(saveAnnotationSetAsXml(dumpingList, includeFeatures));
750 if (theRootAnnotation != null){
753 xmlDoc.append(writeEndTag(theRootAnnotation));
754 }
756 if(sListener != null) sListener.statusChanged("Done.");
757 return xmlDoc.toString();
758 }
760
768 private boolean insertsSafety(AnnotationSet aTargetAnnotSet,
769 Annotation aSourceAnnotation){
770
771 if (aTargetAnnotSet == null || aSourceAnnotation == null) {
772 this.crossedOverAnnotation = null;
773 return false;
774 }
775 if (aSourceAnnotation.getStartNode() == null ||
776 aSourceAnnotation.getStartNode().getOffset()== null) {
777 this.crossedOverAnnotation = null;
778 return false;
779 }
780 if (aSourceAnnotation.getEndNode() == null ||
781 aSourceAnnotation.getEndNode().getOffset()== null) {
782 this.crossedOverAnnotation = null;
783 return false;
784 }
785
786 Long start = aSourceAnnotation.getStartNode().getOffset();
788 Long end = aSourceAnnotation.getEndNode().getOffset();
789 long s2 = start.longValue();
791 long e2 = end.longValue();
792
793 AnnotationSet as = aTargetAnnotSet.get(start,end);
796
797 Iterator it = as.iterator();
800 while(it.hasNext()){
801 Annotation ann = (Annotation) it.next();
802 long s1 = ann.getStartNode().getOffset().longValue();
804 long e1 = ann.getEndNode().getOffset().longValue();
805
806 if (s1<s2 && s2<e1 && e1<e2) {
807 this.crossedOverAnnotation = ann;
808 return false;
809 }
810 if (s2<s1 && s1<e2 && e2<e1) {
811 this.crossedOverAnnotation = ann;
812 return false;
813 }
814 } return true;
816 }
818 private boolean insertsSafety(List aTargetAnnotList,
819 Annotation aSourceAnnotation){
820
821 if (aTargetAnnotList == null || aSourceAnnotation == null) {
822 this.crossedOverAnnotation = null;
823 return false;
824 }
825 if (aSourceAnnotation.getStartNode() == null ||
826 aSourceAnnotation.getStartNode().getOffset()== null) {
827 this.crossedOverAnnotation = null;
828 return false;
829 }
830 if (aSourceAnnotation.getEndNode() == null ||
831 aSourceAnnotation.getEndNode().getOffset()== null) {
832 this.crossedOverAnnotation = null;
833 return false;
834 }
835
836 Long start = aSourceAnnotation.getStartNode().getOffset();
838 Long end = aSourceAnnotation.getEndNode().getOffset();
839 long s2 = start.longValue();
841 long e2 = end.longValue();
842
843 List as = new ArrayList();
846 for (int i=0; i < aTargetAnnotList.size(); i++) {
847 Annotation annot = (Annotation) aTargetAnnotList.get(i);
848 if (annot.getStartNode().getOffset().longValue() >= s2
849 &&
850 annot.getStartNode().getOffset().longValue() <= e2)
851 as.add(annot);
852 else if (annot.getEndNode().getOffset().longValue() >= s2
853 &&
854 annot.getEndNode().getOffset().longValue() <= e2)
855 as.add(annot);
856 }
857
858 Iterator it = as.iterator();
861 while(it.hasNext()){
862 Annotation ann = (Annotation) it.next();
863 long s1 = ann.getStartNode().getOffset().longValue();
865 long e1 = ann.getEndNode().getOffset().longValue();
866
867 if (s1<s2 && s2<e1 && e1<e2) {
868 this.crossedOverAnnotation = ann;
869 return false;
870 }
871 if (s2<s1 && s1<e2 && e2<e1) {
872 this.crossedOverAnnotation = ann;
873 return false;
874 }
875 } return true;
877 }
879
889 private String saveAnnotationSetAsXml(AnnotationSet aDumpAnnotSet,
890 boolean includeFeatures){
891 String content = null;
892 if (this.getContent()== null)
893 content = new String("");
894 else
895 content = this.getContent().toString();
896 StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
897 if (aDumpAnnotSet == null) return docContStrBuff.toString();
898
899 TreeMap offsets2CharsMap = new TreeMap();
900 if (this.getContent().size().longValue() != 0){
901 buildEntityMapFromString(content,offsets2CharsMap);
904 }
911 TreeSet offsets = new TreeSet();
913 Iterator iter = aDumpAnnotSet.iterator();
914 while (iter.hasNext()){
915 Annotation annot = (Annotation) iter.next();
916 offsets.add(annot.getStartNode().getOffset());
917 offsets.add(annot.getEndNode().getOffset());
918 }
920 while (!offsets.isEmpty()){
924 Long offset = (Long)offsets.last();
925 offsets.remove(offset);
927 List annotations = getAnnotationsForOffset(aDumpAnnotSet,offset);
931 StringBuffer tmpBuff = new StringBuffer(
934 DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
935 Stack stack = new Stack();
936 Iterator it = annotations.iterator();
938 while(it.hasNext()){
939 Annotation a = (Annotation) it.next();
940 it.remove();
941 if ( offset.equals(a.getEndNode().getOffset()) ){
943 if ( offset.equals(a.getStartNode().getOffset()) ){
945 if ( null != a.getFeatures().get("isEmptyAndSpan") &&
947 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
948
949 tmpBuff.append(writeStartTag(a, includeFeatures));
951 stack.push(a);
952 }else{
953 tmpBuff.append(writeEmptyTag(a));
955 aDumpAnnotSet.remove(a);
957 } }else{
959 if (!stack.isEmpty()){
962 while(!stack.isEmpty()){
963 Annotation a1 = (Annotation)stack.pop();
964 tmpBuff.append(writeEndTag(a1));
965 } } tmpBuff.append(writeEndTag(a));
968 } }else{
970 if ( offset.equals(a.getStartNode().getOffset()) ){
973 if (!stack.isEmpty()){
976 while(!stack.isEmpty()){
977 Annotation a1 = (Annotation)stack.pop();
978 tmpBuff.append(writeEndTag(a1));
979 } } tmpBuff.append(writeStartTag(a, includeFeatures));
982 aDumpAnnotSet.remove(a);
984 } } }
988 if (!stack.isEmpty()){
990 while(!stack.isEmpty()){
991 Annotation a1 = (Annotation)stack.pop();
992 tmpBuff.append(writeEndTag(a1));
993 } }
996 if (!offsets2CharsMap.isEmpty()){
1000 Long offsChar = (Long) offsets2CharsMap.lastKey();
1001 while( !offsets2CharsMap.isEmpty() &&
1002 offsChar.intValue() >= offset.intValue()){
1003 docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
1006 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1007 offsets2CharsMap.remove(offsChar);
1009 if (!offsets2CharsMap.isEmpty())
1011 offsChar = (Long) offsets2CharsMap.lastKey();
1012 } } docContStrBuff.insert(offset.intValue(),tmpBuff.toString());
1016 } while (!offsets2CharsMap.isEmpty()){
1021 Long offsChar = (Long) offsets2CharsMap.lastKey();
1022 docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
1024 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1025 offsets2CharsMap.remove(offsChar);
1027 } return docContStrBuff.toString();
1029 }
1031 private String saveAnnotationSetAsXml(List aDumpAnnotList,
1032 boolean includeFeatures){
1033 String content = null;
1034 if (this.getContent()== null)
1035 content = new String("");
1036 else
1037 content = this.getContent().toString();
1038 StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
1039 if (aDumpAnnotList == null) return docContStrBuff.toString();
1040
1041 StringBuffer resultStrBuff = new StringBuffer(
1042 DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
1043 Long lastOffset = new Long(0);
1045
1046 TreeMap offsets2CharsMap = new TreeMap();
1047 HashMap annotsForOffset = new HashMap(100);
1048 if (this.getContent().size().longValue() != 0){
1049 buildEntityMapFromString(content,offsets2CharsMap);
1052 }
1059 TreeSet offsets = new TreeSet();
1061 Iterator iter = aDumpAnnotList.iterator();
1062 Annotation annot;
1063 Long start;
1064 Long end;
1065 while (iter.hasNext()){
1066 annot = (Annotation) iter.next();
1067 start = annot.getStartNode().getOffset();
1068 end = annot.getEndNode().getOffset();
1069 offsets.add(start);
1070 offsets.add(end);
1071 if (annotsForOffset.containsKey(start)) {
1072 ((List) annotsForOffset.get(start)).add(annot);
1073 } else {
1074 List newList = new ArrayList(10);
1075 newList.add(annot);
1076 annotsForOffset.put(start, newList);
1077 }
1078 if (annotsForOffset.containsKey(end)) {
1079 ((List) annotsForOffset.get(end)).add(annot);
1080 } else {
1081 List newList = new ArrayList(10);
1082 newList.add(annot);
1083 annotsForOffset.put(end, newList);
1084 }
1085 }
1087 Iterator offsetIt = offsets.iterator();
1091 Long offset;
1092 List annotations;
1093 StringBuffer tmpBuff = new StringBuffer(255);
1095 Stack stack = new Stack();
1096 while (offsetIt.hasNext()){
1097 offset = (Long)offsetIt.next();
1098 annotations = (List) annotsForOffset.get(offset);
1102 annotations = getAnnotationsForOffset(annotations, offset);
1104 tmpBuff.setLength(0);
1106 stack.clear();
1107
1108 Iterator it = annotations.iterator();
1110 Annotation a;
1111 Annotation annStack;
1112 while(it.hasNext()){
1113 a = (Annotation) it.next();
1114 if ( offset.equals(a.getEndNode().getOffset()) ){
1116 if ( offset.equals(a.getStartNode().getOffset()) ){
1118 if ( null != a.getFeatures().get("isEmptyAndSpan") &&
1120 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
1121
1122 tmpBuff.append(writeStartTag(a, includeFeatures));
1124 stack.push(a);
1125 }else{
1126 tmpBuff.append(writeEmptyTag(a));
1128 aDumpAnnotList.remove(a);
1130 } }else{
1132 if (!stack.isEmpty()){
1135 while(!stack.isEmpty()){
1136 annStack = (Annotation)stack.pop();
1137 tmpBuff.append(writeEndTag(annStack));
1138 } } tmpBuff.append(writeEndTag(a));
1141 } }else{
1143 if ( offset.equals(a.getStartNode().getOffset()) ){
1146 if (!stack.isEmpty()){
1149 while(!stack.isEmpty()){
1150 annStack = (Annotation)stack.pop();
1151 tmpBuff.append(writeEndTag(annStack));
1152 } } tmpBuff.append(writeStartTag(a, includeFeatures));
1155 } } }
1160 if (!stack.isEmpty()){
1162 while(!stack.isEmpty()){
1163 annStack = (Annotation)stack.pop();
1164 tmpBuff.append(writeEndTag(annStack));
1165 } }
1168 StringBuffer partText = new StringBuffer();
1170 SortedMap offsetsInRange =
1171 offsets2CharsMap.subMap(lastOffset, offset);
1172 Long tmpOffset;
1173 Long tmpLastOffset = lastOffset;
1174 String replacement;
1175
1176 if(!offsetsInRange.isEmpty()) {
1179 tmpOffset = (Long) offsetsInRange.firstKey();
1180 replacement =
1181 (String)entitiesMap.get((Character)offsets2CharsMap.get(tmpOffset));
1182 partText.append(docContStrBuff.substring(tmpLastOffset.intValue(),
1183 tmpOffset.intValue()));
1184 partText.append(replacement);
1185 tmpLastOffset = new Long(tmpOffset.longValue()+1);
1186 }
1187 partText.append(docContStrBuff.substring(tmpLastOffset.intValue(),
1188 offset.intValue()));
1189 resultStrBuff.append(partText);
1190 resultStrBuff.append(tmpBuff.toString());
1192 lastOffset = offset;
1193 }
1195 StringBuffer partText = new StringBuffer();
1198 SortedMap offsetsInRange =
1199 offsets2CharsMap.subMap(lastOffset, new Long(docContStrBuff.length()));
1200 Long tmpOffset;
1201 Long tmpLastOffset = lastOffset;
1202 String replacement;
1203
1204 if(!offsetsInRange.isEmpty()) {
1208 tmpOffset = (Long) offsetsInRange.firstKey();
1209 replacement =
1210 (String)entitiesMap.get((Character)offsets2CharsMap.get(tmpOffset));
1211 partText.append(docContStrBuff.substring(tmpLastOffset.intValue(),
1212 tmpOffset.intValue()));
1213 partText.append(replacement);
1214 tmpLastOffset = new Long(tmpOffset.longValue()+1);
1215 }
1216 partText.append(docContStrBuff.substring(tmpLastOffset.intValue(),
1217 docContStrBuff.length()));
1218 resultStrBuff.append(partText);
1219
1220 return resultStrBuff.toString();
1221 }
1223
1384
1385
1389 private boolean hasOriginalContentFeatures() {
1390 FeatureMap features = getFeatures();
1391 boolean result = false;
1392
1393 result =
1394 (features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME) != null)
1395 &&
1396 (features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME)
1397 != null);
1398
1399 return result;
1400 }
1402
1412 private String saveAnnotationSetAsXmlInOrig(Set aSourceAnnotationSet,
1413 boolean includeFeatures){
1414 StringBuffer docContStrBuff;
1415
1416 String origContent;
1417
1418 origContent =
1419 (String)features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
1420 if(origContent == null) {
1421 origContent = "";
1422 }
1424 long originalContentSize = origContent.length();
1425
1426 RepositioningInfo repositioning = (RepositioningInfo)
1427 getFeatures().get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);
1428
1429 docContStrBuff = new StringBuffer(origContent);
1430 if (aSourceAnnotationSet == null) return docContStrBuff.toString();
1431
1432 StatusListener sListener = (StatusListener)
1433 gate.gui.MainFrame.getListeners().
1434 get("gate.event.StatusListener");
1435
1436 AnnotationSet originalMarkupsAnnotSet =
1437 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1438 AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this);
1441 if(sListener != null)
1442 sListener.statusChanged("Constructing the dumping annotation set.");
1443 if (aSourceAnnotationSet != null){
1447 Iterator iter = aSourceAnnotationSet.iterator();
1448 Annotation currentAnnot;
1449 while (iter.hasNext()){
1450 currentAnnot = (Annotation) iter.next();
1451 if(insertsSafety(originalMarkupsAnnotSet, currentAnnot)
1452 && insertsSafety(dumpingSet, currentAnnot)){
1453 dumpingSet.add(currentAnnot);
1454 }else{
1455 Out.prln("Warning: Annotation with ID=" + currentAnnot.getId() +
1456 ", startOffset=" + currentAnnot.getStartNode().getOffset() +
1457 ", endOffset=" + currentAnnot.getEndNode().getOffset() +
1458 ", type=" + currentAnnot.getType()+ " was found to violate the" +
1459 " crossed over condition. It will be discarded");
1460 } } }
1464 if(sListener != null) sListener.statusChanged("Dumping annotations as XML");
1467
1468
1473 TreeSet offsets = new TreeSet();
1475 Iterator iter = aSourceAnnotationSet.iterator();
1476 while (iter.hasNext()){
1477 Annotation annot = (Annotation) iter.next();
1478 offsets.add(annot.getStartNode().getOffset());
1479 offsets.add(annot.getEndNode().getOffset());
1480 }
1482 while (!offsets.isEmpty()){
1486 Long offset = (Long)offsets.last();
1487 offsets.remove(offset);
1489 List annotations = getAnnotationsForOffset(aSourceAnnotationSet,offset);
1493 StringBuffer tmpBuff = new StringBuffer("");
1495 Stack stack = new Stack();
1496 Iterator it = annotations.iterator();
1498 Annotation a = null;
1499 while(it.hasNext()) {
1500 a = (Annotation) it.next();
1501 it.remove();
1502 if ( offset.equals(a.getEndNode().getOffset()) ){
1504 if ( offset.equals(a.getStartNode().getOffset()) ){
1506 if ( null != a.getFeatures().get("isEmptyAndSpan") &&
1508 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
1509
1510 tmpBuff.append(writeStartTag(a, includeFeatures, false));
1512 stack.push(a);
1513 }else{
1514 tmpBuff.append(writeEmptyTag(a, false));
1516 aSourceAnnotationSet.remove(a);
1518 } }else{
1520 while(!stack.isEmpty()){
1523 Annotation a1 = (Annotation)stack.pop();
1524 tmpBuff.append(writeEndTag(a1));
1525 } tmpBuff.append(writeEndTag(a));
1527 } }else{
1529 if ( offset.equals(a.getStartNode().getOffset()) ){
1532 while(!stack.isEmpty()){
1535 Annotation a1 = (Annotation)stack.pop();
1536 tmpBuff.append(writeEndTag(a1));
1537 }
1539 tmpBuff.append(writeStartTag(a, includeFeatures, false));
1540 aSourceAnnotationSet.remove(a);
1542 } } }
1546 while(!stack.isEmpty()){
1548 Annotation a1 = (Annotation)stack.pop();
1549 tmpBuff.append(writeEndTag(a1));
1550 }
1552 long originalPosition = -1;
1553 boolean backPositioning =
1554 a != null && offset.equals(a.getEndNode().getOffset());
1555 if ( backPositioning ) {
1556 originalPosition =
1558 repositioning.getOriginalPos(offset.intValue(), true);
1559 }
1561 if(originalPosition == -1) {
1562 originalPosition = repositioning.getOriginalPos(offset.intValue());
1563 }
1565 if(originalPosition != -1 && originalPosition <= originalContentSize ) {
1567 docContStrBuff.insert((int) originalPosition, tmpBuff.toString());
1568 }
1569 else {
1570 Out.prln("Error in the repositioning. The offset ("+offset.intValue()
1571 +") could not be positioned in the original document. \n"
1572 +"Calculated position is: "+originalPosition
1573 +" placed back: "+backPositioning);
1574 }
1576 } if (theRootAnnotation != null)
1578 docContStrBuff.append(writeEndTag(theRootAnnotation));
1579 return docContStrBuff.toString();
1580 }
1582
1591 private List getAnnotationsForOffset(Set aDumpAnnotSet, Long offset){
1592 List annotationList = new LinkedList();
1593 if (aDumpAnnotSet == null || offset == null) return annotationList;
1594 Set annotThatStartAtOffset = new TreeSet(
1595 new AnnotationComparator(ORDER_ON_END_OFFSET,DESC));
1596 Set annotThatEndAtOffset = new TreeSet(
1597 new AnnotationComparator(ORDER_ON_START_OFFSET,DESC));
1598 Set annotThatStartAndEndAtOffset = new TreeSet(
1599 new AnnotationComparator(ORDER_ON_ANNOT_ID,ASC));
1600
1601 Iterator iter = aDumpAnnotSet.iterator();
1604 while(iter.hasNext()){
1605 Annotation ann = (Annotation) iter.next();
1606 if (offset.equals(ann.getStartNode().getOffset())){
1607 if (offset.equals(ann.getEndNode().getOffset()))
1608 annotThatStartAndEndAtOffset.add(ann);
1609 else
1610 annotThatStartAtOffset.add(ann);
1611 }else{
1612 if (offset.equals(ann.getEndNode().getOffset()))
1613 annotThatEndAtOffset.add(ann);
1614 } } annotationList.addAll(annotThatEndAtOffset);
1617 annotThatEndAtOffset = null;
1618 annotationList.addAll(annotThatStartAtOffset);
1619 annotThatStartAtOffset = null;
1620 iter = annotThatStartAndEndAtOffset.iterator();
1621 while(iter.hasNext()){
1622 Annotation ann = (Annotation) iter.next();
1623 Iterator it = annotationList.iterator();
1624 boolean breaked = false;
1625 while (it.hasNext()){
1626 Annotation annFromList = (Annotation) it.next();
1627 if (annFromList.getId().intValue() > ann.getId().intValue()){
1628 annotationList.add(annotationList.indexOf(annFromList),ann);
1629 breaked = true;
1630 break;
1631 } } if (!breaked)
1634 annotationList.add(ann);
1635 iter.remove();
1636 } return annotationList;
1638 }
1640 private List getAnnotationsForOffset(List aDumpAnnotList, Long offset){
1641 List annotationList = new ArrayList();
1642 if (aDumpAnnotList == null || offset == null) return annotationList;
1643 Set annotThatStartAtOffset;
1644 Set annotThatEndAtOffset;
1645 Set annotThatStartAndEndAtOffset;
1646 annotThatStartAtOffset = new TreeSet(
1647 new AnnotationComparator(ORDER_ON_END_OFFSET, DESC));
1648 annotThatEndAtOffset = new TreeSet(
1649 new AnnotationComparator(ORDER_ON_START_OFFSET, DESC));
1650 annotThatStartAndEndAtOffset = new TreeSet(
1651 new AnnotationComparator(ORDER_ON_ANNOT_ID, ASC));
1652
1653 Iterator iter = aDumpAnnotList.iterator();
1656 while(iter.hasNext()){
1657 Annotation ann = (Annotation) iter.next();
1658 if (offset.equals(ann.getStartNode().getOffset())){
1659 if (offset.equals(ann.getEndNode().getOffset()))
1660 annotThatStartAndEndAtOffset.add(ann);
1661 else
1662 annotThatStartAtOffset.add(ann);
1663 }else{
1664 if (offset.equals(ann.getEndNode().getOffset()))
1665 annotThatEndAtOffset.add(ann);
1666 } }
1669 annotationList.addAll(annotThatEndAtOffset);
1670 annotationList.addAll(annotThatStartAtOffset);
1671 annotThatEndAtOffset = null;
1672 annotThatStartAtOffset = null;
1673
1674 iter = annotThatStartAndEndAtOffset.iterator();
1675 while(iter.hasNext()){
1676 Annotation ann = (Annotation) iter.next();
1677 Iterator it = annotationList.iterator();
1678 boolean breaked = false;
1679 while (it.hasNext()){
1680 Annotation annFromList = (Annotation) it.next();
1681 if (annFromList.getId().intValue() > ann.getId().intValue()){
1682 annotationList.add(annotationList.indexOf(annFromList),ann);
1683 breaked = true;
1684 break;
1685 } } if (!breaked)
1688 annotationList.add(ann);
1689 iter.remove();
1690 } return annotationList;
1692 }
1694 private String writeStartTag(Annotation annot, boolean includeFeatures){
1695 return writeStartTag(annot, includeFeatures, true);
1696 }
1698
1699 private String writeStartTag(Annotation annot, boolean includeFeatures,
1700 boolean includeNamespace){
1701 AnnotationSet originalMarkupsAnnotSet =
1702 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1703
1704 StringBuffer strBuff = new StringBuffer("");
1705 if (annot == null) return strBuff.toString();
1706 if (theRootAnnotation != null && annot.getId().equals(theRootAnnotation.getId())){
1708 if (includeFeatures) {
1712 strBuff.append("<");
1713 strBuff.append(annot.getType());
1714 strBuff.append(" ");
1715 if(includeNamespace) {
1716 strBuff.append(" xmlns:gate=\"http://www.gate.ac.uk\"");
1717 strBuff.append(" gate:");
1718 }
1719 strBuff.append("gateId=\"");
1720 strBuff.append(annot.getId());
1721 strBuff.append("\"");
1722 strBuff.append(" ");
1723 if(includeNamespace) {
1724 strBuff.append("gate:");
1725 }
1726 strBuff.append("annotMaxId=\"");
1727 strBuff.append(nextAnnotationId);
1728 strBuff.append("\"");
1729 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1730 strBuff.append(">");
1731 }
1732 else if (originalMarkupsAnnotSet.contains(annot)) {
1733 strBuff.append("<");
1734 strBuff.append(annot.getType());
1735 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1736 strBuff.append(">");
1737 }
1738 else {
1739 strBuff.append("<");
1740 strBuff.append(annot.getType());
1741 strBuff.append(">");
1742 }
1743
1744 }else{
1745 if (includeFeatures) {
1749 strBuff.append("<");
1750 strBuff.append(annot.getType());
1751 strBuff.append(" ");
1752 if(includeNamespace) {
1753 strBuff.append("gate:");
1754 } strBuff.append("gateId=\"");
1756 strBuff.append(annot.getId());
1757 strBuff.append("\"");
1758 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1759 strBuff.append(">");
1760 }
1761 else if (originalMarkupsAnnotSet.contains(annot)) {
1762 strBuff.append("<");
1763 strBuff.append(annot.getType());
1764 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1765 strBuff.append(">");
1766 }
1767 else {
1768 strBuff.append("<");
1769 strBuff.append(annot.getType());
1770 strBuff.append(">");
1771 }
1772 } return strBuff.toString();
1774 }
1776
1786 private Annotation identifyTheRootAnnotation(AnnotationSet anAnnotationSet){
1787 if (anAnnotationSet == null) return null;
1788 Node startNode = anAnnotationSet.firstNode();
1791 Node endNode = anAnnotationSet.lastNode();
1792 if (startNode.getOffset().longValue() != 0) return null;
1797 Annotation theRootAnnotation = null;
1799 long start = startNode.getOffset().longValue();
1803 long end = endNode.getOffset().longValue();
1804 for(Iterator it = anAnnotationSet.iterator(); it.hasNext();){
1805 Annotation currentAnnot = (Annotation) it.next();
1806 if (
1809 (start == currentAnnot.getStartNode().getOffset().longValue()) &&
1810 (end == currentAnnot.getEndNode().getOffset().longValue())
1811 ){
1812 if (theRootAnnotation == null)
1814 theRootAnnotation = currentAnnot;
1815 else{
1816 if ( theRootAnnotation.getId().intValue() > currentAnnot.getId().intValue())
1818 theRootAnnotation = currentAnnot;
1819 } } } return theRootAnnotation;
1823 }
1825 private Annotation identifyTheRootAnnotation(List anAnnotationList){
1826 if (anAnnotationList == null || anAnnotationList.isEmpty()) return null;
1827 if(((Annotation)anAnnotationList.get(0)).
1830 getStartNode().getOffset().longValue() > 0) return null;
1831
1832 long start = 0; long end = 0; for(int i = 0; i < anAnnotationList.size(); i++){
1836 Annotation anAnnotation = (Annotation)anAnnotationList.get(i);
1837 long localEnd = anAnnotation.getEndNode().getOffset().longValue();
1838 if(localEnd > end) end = localEnd;
1839 }
1840
1841 Annotation theRootAnnotation = null;
1845 for(int i = 0; i < anAnnotationList.size(); i++){
1846 Annotation currentAnnot = (Annotation) anAnnotationList.get(i);
1847 long localStart = currentAnnot.getStartNode().getOffset().longValue();
1848 long localEnd = currentAnnot.getEndNode().getOffset().longValue();
1849 if (
1852 (start == localStart) && (end == localEnd)){
1853 if (theRootAnnotation == null) theRootAnnotation = currentAnnot;
1855 else{
1856 if (theRootAnnotation.getId().intValue() > currentAnnot.getId().intValue())
1858 theRootAnnotation = currentAnnot;
1859 } } } return theRootAnnotation;
1863 }
1865
1866
1871 private void buildEntityMapFromString(String aScanString, TreeMap aMapToFill){
1872 if (aScanString == null || aMapToFill == null) return;
1873 if (entitiesMap == null || entitiesMap.isEmpty()){
1874 Err.prln("WARNING: Entities map was not initialised !");
1875 return;
1876 } Iterator entitiesMapIterator = entitiesMap.keySet().iterator();
1879 Character c;
1880 int fromIndex;
1881 while(entitiesMapIterator.hasNext()){
1882 c = (Character) entitiesMapIterator.next();
1883 fromIndex = 0;
1884 while (-1 != fromIndex){
1885 fromIndex = aScanString.indexOf(c.charValue(),fromIndex);
1886 if (-1 != fromIndex){
1887 aMapToFill.put(new Long(fromIndex),c);
1888 fromIndex ++;
1889 } } } }
1894 private String writeEmptyTag(Annotation annot){
1895 return writeEmptyTag(annot, true);
1896 }
1898
1899 private String writeEmptyTag(Annotation annot, boolean includeNamespace){
1900 StringBuffer strBuff = new StringBuffer("");
1901 if (annot == null) return strBuff.toString();
1902
1903 strBuff.append("<");
1904 strBuff.append(annot.getType());
1905
1906 AnnotationSet originalMarkupsAnnotSet =
1907 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1908 if (! originalMarkupsAnnotSet.contains(annot)) {
1909 strBuff.append(" gateId=\"");
1910 strBuff.append(annot.getId());
1911 strBuff.append("\"");
1912 }
1913 strBuff.append(writeFeatures(annot.getFeatures(),includeNamespace));
1914 strBuff.append("/>");
1915
1916 return strBuff.toString();
1917 }
1919
1920 private String writeEndTag(Annotation annot){
1921 StringBuffer strBuff = new StringBuffer("");
1922 if (annot == null) return strBuff.toString();
1923
1928 strBuff.append("</"+annot.getType()+">");
1929
1930 return strBuff.toString();
1931 }
1933
1934 private String writeFeatures(FeatureMap feat, boolean includeNamespace){
1935 StringBuffer strBuff = new StringBuffer("");
1936 if (feat == null) return strBuff.toString();
1937 Iterator it = feat.keySet().iterator();
1938 while (it.hasNext()){
1939 Object key = it.next();
1940 Object value = feat.get(key);
1941 if ( (key != null) && (value != null) ){
1942 if ("isEmptyAndSpan".equals(key.toString()))
1945 continue;
1946 if( !(String.class.isAssignableFrom(key.getClass()) ||
1947 Number.class.isAssignableFrom(key.getClass()))){
1948
1949 Out.prln("Warning:Found a feature NAME("+key+") that doesn't came"+
1950 " from String or Number.(feature discarded)");
1951 continue;
1952 } if ( !(String.class.isAssignableFrom(value.getClass()) ||
1954 Number.class.isAssignableFrom(value.getClass()) ||
1955 java.util.Collection.class.isAssignableFrom(value.getClass()))){
1956
1957 Out.prln("Warning:Found a feature VALUE("+value+") that doesn't came"+
1958 " from String, Number or Collection.(feature discarded)");
1959 continue;
1960 } if ("matches".equals(key)) {
1962 strBuff.append(" ");
1963 if(includeNamespace) {
1964 strBuff.append("gate:");
1965 }
1966 strBuff.append(
1969 filterNonXmlChars(replaceCharsWithEntities(key.toString())));
1970 strBuff.append("=\"");
1971 }
1972 else {
1973 strBuff.append(" ");
1974 strBuff.append(
1977 filterNonXmlChars(replaceCharsWithEntities(key.toString())));
1978 strBuff.append("=\"");
1979 }
1980 if (java.util.Collection.class.isAssignableFrom(value.getClass())){
1981 Iterator valueIter = ((Collection)value).iterator();
1982 while(valueIter.hasNext()){
1983 Object item = valueIter.next();
1984 if (!(String.class.isAssignableFrom(item.getClass()) ||
1985 Number.class.isAssignableFrom(item.getClass())))
1986 continue;
1987 strBuff.append(
1990 filterNonXmlChars(replaceCharsWithEntities(item.toString())));
1991 strBuff.append(";");
1992 } if (strBuff.charAt(strBuff.length()-1) == ';')
1994 strBuff.deleteCharAt(strBuff.length()-1);
1995 }else{
1996 strBuff.append(
1999 filterNonXmlChars(replaceCharsWithEntities(value.toString())));
2000 } strBuff.append("\"");
2002 } } return strBuff.toString();
2005 }
2007
2012 public String toXml(){
2013 StringBuffer xmlContent = new StringBuffer(
2017 DOC_SIZE_MULTIPLICATION_FACTOR*(getContent().size().intValue()));
2018 xmlContent.append("<?xml version=\"1.0\" encoding=\"");
2020 xmlContent.append(getEncoding());
2021 xmlContent.append("\" ?>");
2022 xmlContent.append(Strings.getNl());
2023
2024 xmlContent.append("<GateDocument>\n");
2026 xmlContent.append("<!-- The document's features-->\n\n");
2027 xmlContent.append("<GateDocumentFeatures>\n");
2028
2029 xmlContent.append(featuresToXml(this.getFeatures()));
2030 xmlContent.append("</GateDocumentFeatures>\n");
2031 xmlContent.append("<!-- The document content area with serialized"+
2032 " nodes -->\n\n");
2033 xmlContent.append("<TextWithNodes>");
2035 xmlContent.append(textWithNodes(this.getContent().toString()));
2036 xmlContent.append("</TextWithNodes>\n");
2037 StatusListener sListener = (StatusListener)
2040 gate.gui.MainFrame.getListeners().
2041 get("gate.event.StatusListener");
2042 if(sListener != null)
2043 sListener.statusChanged("Saving the default annotation set ");
2044 xmlContent.append("<!-- The default annotation set -->\n\n");
2045 xmlContent.append(annotationSetToXml(this.getAnnotations()));
2046 if (namedAnnotSets != null){
2049 Iterator iter = namedAnnotSets.values().iterator();
2050 while(iter.hasNext()){
2051 AnnotationSet annotSet = (AnnotationSet) iter.next();
2052 xmlContent.append("<!-- Named annotation set -->\n\n");
2053 if(sListener != null) sListener.statusChanged("Saving " +
2055 annotSet.getName()+
2056 " annotation set ");
2057 xmlContent.append(annotationSetToXml(annotSet));
2058 } } xmlContent.append("</GateDocument>");
2062 if(sListener != null) sListener.statusChanged("Done !");
2063 return xmlContent.toString();
2065 }
2067
2075 private StringBuffer filterNonXmlChars(StringBuffer aStrBuffer){
2076 if (aStrBuffer == null) return new StringBuffer("");
2077 char space = ' ';
2079 for (int i=aStrBuffer.length()-1;i>=0; i--){
2080 if (!isXmlChar(aStrBuffer.charAt(i)))
2081 aStrBuffer.setCharAt(i, space);
2082 } return aStrBuffer;
2084 }
2086
2090 public static boolean isXmlChar(char ch){
2091 if (ch == 0x9 || ch == 0xA || ch ==0xD) return true;
2092 if ((0x20 <= ch) && (ch <= 0xD7FF)) return true;
2093 if ((0xE000 <= ch) && (ch <= 0xFFFD)) return true;
2094 if ((0x10000 <= ch) && (ch <= 0x10FFFF)) return true;
2095 return false;
2096 }
2098
2103 private String featuresToXml(FeatureMap aFeatureMap){
2104 StringBuffer str = new StringBuffer("");
2105
2106 if (aFeatureMap == null) return str.toString();
2107
2108 Set keySet = aFeatureMap.keySet();
2109 Iterator keyIterator = keySet.iterator();
2110 while(keyIterator.hasNext()){
2111 Object key = keyIterator.next();
2112 Object value = aFeatureMap.get(key);
2113 if ((key != null) && (value != null)){
2114 String keyClassName = null;
2115 String keyItemClassName = null;
2116 String valueClassName = null;
2117 String valueItemClassName = null;
2118 String key2String = key.toString();
2119 String value2String = value.toString();
2120
2121 Object item = null;
2122 if (key instanceof java.lang.String ||
2124 key instanceof java.lang.Number ||
2125 key instanceof java.util.Collection)
2126 keyClassName = key.getClass().getName();
2127
2128 if (value instanceof java.lang.String ||
2130 value instanceof java.lang.Number ||
2131 value instanceof java.util.Collection)
2132 valueClassName = value.getClass().getName();
2133
2134 if (keyClassName == null || valueClassName == null) continue;
2137
2138 if (key instanceof java.util.Collection){
2140 StringBuffer keyStrBuff = new StringBuffer("");
2141 Iterator iter = ((Collection) key).iterator();
2142 if (iter.hasNext()){
2143 item = iter.next();
2144 if (item instanceof java.lang.Number)
2145 keyItemClassName = item.getClass().getName();
2146 else
2147 keyItemClassName = String.class.getName();
2148 keyStrBuff.append(item.toString());
2149 } while (iter.hasNext()){
2151 item = iter.next();
2152 keyStrBuff.append(";" + item.toString());
2153 } key2String = keyStrBuff.toString();
2155 } if (value instanceof java.util.Collection){
2158 StringBuffer valueStrBuff = new StringBuffer("");
2159 Iterator iter = ((Collection) value).iterator();
2160 if (iter.hasNext()){
2161 item = iter.next();
2162 if (item instanceof java.lang.Number)
2163 valueItemClassName = item.getClass().getName();
2164 else
2165 valueItemClassName = String.class.getName();
2166 valueStrBuff.append(item.toString());
2167 } while (iter.hasNext()){
2169 item = iter.next();
2170 valueStrBuff.append(";" + item.toString());
2171 } value2String = valueStrBuff.toString();
2173 } str.append("<Feature>\n <Name");
2175 if (keyClassName != null)
2176 str.append(" className=\""+keyClassName+"\"");
2177 if (keyItemClassName != null)
2178 str.append(" itemClassName=\""+keyItemClassName+"\"");
2179 str.append(">");
2180 str.append(filterNonXmlChars(replaceCharsWithEntities(key2String)));
2181 str.append("</Name>\n <Value");
2182 if (valueClassName != null)
2183 str.append(" className=\"" + valueClassName + "\"");
2184 if (valueItemClassName != null)
2185 str.append(" itemClassName=\"" + valueItemClassName + "\"");
2186 str.append(">");
2187 str.append(filterNonXmlChars(replaceCharsWithEntities(value2String)));
2188 str.append("</Value>\n</Feature>\n");
2189 } } return str.toString();
2192 }
2194
2201 private StringBuffer replaceCharsWithEntities(String anInputString){
2202 if (anInputString == null) return new StringBuffer("");
2203 StringBuffer strBuff = new StringBuffer(anInputString);
2204 for (int i=strBuff.length()-1; i>=0; i--){
2205 Character ch = new Character(strBuff.charAt(i));
2206 if (entitiesMap.keySet().contains(ch)){
2207 strBuff.replace(i,i+1,(String) entitiesMap.get(ch));
2208 } } return strBuff;
2211 }
2213
2219 private String textWithNodes(String aText){
2220 if (aText == null) return new String("");
2221 StringBuffer textWithNodes = filterNonXmlChars(new StringBuffer(aText));
2222
2223 TreeMap offsets2CharsMap = new TreeMap();
2225 if (aText.length()!= 0){
2226 buildEntityMapFromString(aText,offsets2CharsMap);
2228 } TreeSet offsetsSet = new TreeSet();
2231 Iterator annotSetIter = this.getAnnotations().iterator();
2232 while (annotSetIter.hasNext()){
2233 Annotation annot = (Annotation) annotSetIter.next();
2234 offsetsSet.add(annot.getStartNode().getOffset());
2235 offsetsSet.add(annot.getEndNode().getOffset());
2236 } if (namedAnnotSets != null){
2239 Iterator iter = namedAnnotSets.values().iterator();
2240 while(iter.hasNext()){
2241 AnnotationSet annotSet = (AnnotationSet) iter.next();
2242 Iterator iter2 = annotSet.iterator();
2243 while(iter2.hasNext()){
2244 Annotation annotTmp = (Annotation) iter2.next();
2245 offsetsSet.add(annotTmp.getStartNode().getOffset());
2246 offsetsSet.add(annotTmp.getEndNode().getOffset());
2247 } } }
2253 if (offsetsSet.isEmpty()){
2254 return replaceCharsWithEntities(aText).toString();
2255 } while (!offsetsSet.isEmpty()){
2260 Long offset = (Long) offsetsSet.last();
2261 offsetsSet.remove(offset);
2263 int offsetValue = offset.intValue();
2265 String strNode = "<Node id=\"" + offsetValue + "\"/>";
2266 if (!offsets2CharsMap.isEmpty()){
2269 Long offsChar = (Long) offsets2CharsMap.lastKey();
2270 while( !offsets2CharsMap.isEmpty() &&
2271 offsChar.intValue() >= offset.intValue()){
2272 textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1,
2275 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
2276 offsets2CharsMap.remove(offsChar);
2279 if (!offsets2CharsMap.isEmpty())
2281 offsChar = (Long) offsets2CharsMap.lastKey();
2282 } } textWithNodes.insert(offsetValue,strNode);
2286 } while (!offsets2CharsMap.isEmpty()){
2291 Long offsChar = (Long) offsets2CharsMap.lastKey();
2292 textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1,
2294 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
2295 offsets2CharsMap.remove(offsChar);
2297 } return textWithNodes.toString();
2299 }
2301
2306 private String annotationSetToXml(AnnotationSet anAnnotationSet){
2307 StringBuffer str = new StringBuffer("");
2308
2309 if (anAnnotationSet == null){
2310 str.append("<AnnotationSet>\n");
2311 str.append("</AnnotationSet>\n");
2312 return str.toString();
2313 } if (anAnnotationSet.getName() == null)
2315 str.append("<AnnotationSet>\n");
2316 else str.append("<AnnotationSet Name=\"" + anAnnotationSet.getName()+
2317 "\" >\n");
2318 Iterator iterator = anAnnotationSet.iterator();
2320 while (iterator.hasNext()){
2321 Annotation annot = (Annotation) iterator.next();
2322 str.append("<Annotation " + "Type=\"" + annot.getType() +
2323 "\" StartNode=\"" + annot.getStartNode().getOffset() +
2324 "\" EndNode=\"" + annot.getEndNode().getOffset() + "\">\n");
2325 str.append(featuresToXml(annot.getFeatures()));
2326 str.append("</Annotation>\n");
2327 }
2329 str.append("</AnnotationSet>\n");
2330 return str.toString();
2331 }
2333
2335 public Map getNamedAnnotationSets() {
2336 return namedAnnotSets;
2337 }
2339
2341 public Set getAnnotationSetNames(){
2342 return namedAnnotSets.keySet();
2343 }
2344
2345
2346
2351 public void removeAnnotationSet(String name){
2352 Object removed = namedAnnotSets.remove(name);
2353 if(removed != null){
2354 fireAnnotationSetRemoved(
2355 new DocumentEvent(this, DocumentEvent.ANNOTATION_SET_REMOVED, name));
2356 }
2357 }
2358
2359
2360 public void edit(Long start, Long end, DocumentContent replacement)
2361 throws InvalidOffsetException
2362 {
2363 if(! isValidOffsetRange(start, end))
2364 throw new InvalidOffsetException();
2365
2366 if(content != null)
2367 ((DocumentContentImpl) content).edit(start, end, replacement);
2368
2369 if(defaultAnnots != null)
2370 ((AnnotationSetImpl) defaultAnnots).edit(start, end, replacement);
2371
2372 if(namedAnnotSets != null) {
2373 Iterator iter = namedAnnotSets.values().iterator();
2374 while(iter.hasNext())
2375 ((AnnotationSetImpl) iter.next()).edit(start, end, replacement);
2376 }
2377 fireContentEdited(new DocumentEvent(this, DocumentEvent.CONTENT_EDITED,
2379 start, end));
2380 }
2382
2385 public boolean isValidOffset(Long offset) {
2386 if(offset == null)
2387 return false;
2388
2389 long o = offset.longValue();
2390 if(o > getContent().size().longValue() || o < 0)
2391 return false;
2392
2393 return true;
2394 }
2396
2400 public boolean isValidOffsetRange(Long start, Long end) {
2401 return
2402 isValidOffset(start) && isValidOffset(end) &&
2403 start.longValue() <= end.longValue();
2404 }
2406
2407 public void setNextAnnotationId(int aNextAnnotationId){
2408 nextAnnotationId = aNextAnnotationId;
2409 }
2411
2412 public Integer getNextAnnotationId() {
2413 return new Integer(nextAnnotationId++);
2414 }
2416
2417 public Integer getNextNodeId() { return new Integer(nextNodeId++); }
2418
2419
2420 public int compareTo(Object o) throws ClassCastException {
2421 DocumentImpl other = (DocumentImpl) o;
2422 return getOrderingString().compareTo(other.getOrderingString());
2423 }
2425
2428 protected String getOrderingString() {
2429 if(sourceUrl == null) return toString();
2430
2431 StringBuffer orderingString = new StringBuffer(sourceUrl.toString());
2432 if(sourceUrlStartOffset != null && sourceUrlEndOffset != null) {
2433 orderingString.append(sourceUrlStartOffset.toString());
2434 orderingString.append(sourceUrlEndOffset.toString());
2435 }
2436
2437 return orderingString.toString();
2438 }
2440
/** Annotation id counter: returned and then incremented by getNextAnnotationId(), overwritten by setNextAnnotationId(). */
2441 protected int nextAnnotationId = 0;
2442
2443
/** Node id counter: returned and then incremented by getNextNodeId(). */
2444 protected int nextNodeId = 0;
2445
/** URL the content is loaded from in init(); null when the document is created from a String. */
2446 protected URL sourceUrl;
2447
2448
2449
2450
/** The document content; edited through edit(). */
2451 protected DocumentContent content;
2452
2453
/** Character encoding of the document; not set by default. */
2454 protected String encoding = null;
2455
2456
2458
2462
2464
/** Annotation treated as the XML root by writeStartTag(); its start tag additionally carries annotMaxId. */
2467 private Annotation theRootAnnotation = null;
2468
2469
/** Factor applied to the content size to pre-size the output buffer in toXml(). */
2473 private final int DOC_SIZE_MULTIPLICATION_FACTOR = 2;
2474
2475
/** AnnotationComparator criterion: compare on start offset. */
2478 private final int ORDER_ON_START_OFFSET = 0;
2479
/** AnnotationComparator criterion: compare on end offset. */
2482 private final int ORDER_ON_END_OFFSET = 1;
2483
/** AnnotationComparator criterion: compare on annotation id. */
2486 private final int ORDER_ON_ANNOT_ID = 2;
2487
/** AnnotationComparator direction: ascending. */
2490 private final int ASC = 3;
2491
/** AnnotationComparator direction: descending. */
2494 private final int DESC = -3;
2495
2496
2499 private static Map entitiesMap = null;
2500 static{
2502 entitiesMap = new HashMap();
2503 entitiesMap.put(new Character('<'),"<");
2504 entitiesMap.put(new Character('>'),">");
2505 entitiesMap.put(new Character('&'),"&");
2506 entitiesMap.put(new Character('\''),"'");
2507 entitiesMap.put(new Character('"'),""");
2508 entitiesMap.put(new Character((char)160)," ");
2509 entitiesMap.put(new Character((char)169),"©");
2510 }
2512
2515
2517
/** Start offset passed to DocumentContentImpl in init() when loading from sourceUrl; also part of the ordering string. */
2520 protected Long sourceUrlStartOffset;
2521
2522
/** End offset passed to DocumentContentImpl in init() when loading from sourceUrl; also part of the ordering string. */
2525 protected Long sourceUrlEndOffset;
2526
2527
/** The default (unnamed) annotation set; may be null. */
2528 protected AnnotationSet defaultAnnots;
2529
2530
/** Named annotation sets, keyed by name; may be null (callers in this class check before use). */
2531 protected Map namedAnnotSets;
2532
2533
/** Text used by init() to build the content when no source URL is given. */
2538 private String stringContent;
2539
2540
2548 public String getStringContent() { return stringContent; }
2549
2550
2558 public void setStringContent(String stringContent) {
2559 this.stringContent = stringContent;
2560 }
2562
/** Markup-awareness flag, false by default; NOTE(review): presumably the value read via getMarkupAware() in init() - confirm. */
2563 protected Boolean markupAware = new Boolean(false);
2564
2589
2590 public String toString() {
2591 String n = Strings.getNl();
2592 StringBuffer s = new StringBuffer("DocumentImpl: " + n);
2593 s.append(" content:" + content + n);
2594 s.append(" defaultAnnots:" + defaultAnnots + n);
2595 s.append(" encoding:" + encoding + n);
2596 s.append(" features:" + features + n);
2597 s.append(" markupAware:" + markupAware + n);
2598 s.append(" namedAnnotSets:" + namedAnnotSets + n);
2599 s.append(" nextAnnotationId:" + nextAnnotationId + n);
2600 s.append(" nextNodeId:" + nextNodeId + n);
2601 s.append(" sourceUrl:" + sourceUrl + n);
2602 s.append(" sourceUrlStartOffset:" + sourceUrlStartOffset + n);
2603 s.append(" sourceUrlEndOffset:" + sourceUrlEndOffset + n);
2604 s.append(n);
2605
2606 return s.toString();
2607 }
2609
/** Fixed serialization version id of this class. */
2610 static final long serialVersionUID = -8456893608311510260L;
2611
2612
2613 class AnnotationComparator implements java.util.Comparator {
2614 int orderOn = -1;
2615 int orderType = ASC;
2616
2619 public AnnotationComparator(int anOrderOn, int anOrderType){
2620 orderOn = anOrderOn;
2621 orderType = anOrderType;
2622 }
2624
2625 public int compare(Object o1, Object o2){
2626 Annotation a1 = (Annotation) o1;
2627 Annotation a2 = (Annotation) o2;
2628 if (orderOn == ORDER_ON_START_OFFSET){
2630 int result = a1.getStartNode().getOffset().compareTo(
2631 a2.getStartNode().getOffset());
2632 if (orderType == ASC){
2633 if (result == 0)
2636 return a1.getId().compareTo(a2.getId());
2637 return result;
2638 }else{
2639 if (result == 0)
2641 return - (a1.getId().compareTo(a2.getId()));
2642 return -result;
2643 } }
2646 if (orderOn == ORDER_ON_END_OFFSET){
2648 int result = a1.getEndNode().getOffset().compareTo(
2649 a2.getEndNode().getOffset());
2650 if (orderType == ASC){
2651 if (result == 0)
2654 return - (a1.getId().compareTo(a2.getId()));
2655 return result;
2656 }else{
2657 if (result == 0)
2660 return a1.getId().compareTo(a2.getId());
2661 return - result;
2662 } }
2665 if (orderOn == ORDER_ON_ANNOT_ID){
2667 if (orderType == ASC)
2668 return a1.getId().compareTo(a2.getId());
2669 else
2670 return -(a1.getId().compareTo(a2.getId()));
2671 } return 0;
2673 } }
2676
/** Registered DocumentListeners; replaced by a modified clone on every add/remove. Not serialized. */
2677 private transient Vector documentListeners;
/** Not referenced anywhere in the visible part of this class. Not serialized. */
2678 private transient Vector gateListeners;
2679
2680 public synchronized void removeDocumentListener(DocumentListener l) {
2681 if (documentListeners != null && documentListeners.contains(l)) {
2682 Vector v = (Vector) documentListeners.clone();
2683 v.removeElement(l);
2684 documentListeners = v;
2685 }
2686 }
2687 public synchronized void addDocumentListener(DocumentListener l) {
2688 Vector v = documentListeners == null ? new Vector(2) : (Vector) documentListeners.clone();
2689 if (!v.contains(l)) {
2690 v.addElement(l);
2691 documentListeners = v;
2692 }
2693 }
2694
2695 protected void fireAnnotationSetAdded(DocumentEvent e) {
2696 if (documentListeners != null) {
2697 Vector listeners = documentListeners;
2698 int count = listeners.size();
2699 for (int i = 0; i < count; i++) {
2700 ((DocumentListener) listeners.elementAt(i)).annotationSetAdded(e);
2701 }
2702 }
2703 }
2704
2705 protected void fireAnnotationSetRemoved(DocumentEvent e) {
2706 if (documentListeners != null) {
2707 Vector listeners = documentListeners;
2708 int count = listeners.size();
2709 for (int i = 0; i < count; i++) {
2710 ((DocumentListener) listeners.elementAt(i)).annotationSetRemoved(e);
2711 }
2712 }
2713 }
2714
2715 protected void fireContentEdited(DocumentEvent e) {
2716 if (documentListeners != null) {
2717 Vector listeners = documentListeners;
2718 int count = listeners.size();
2719 for (int i = 0; i < count; i++) {
2720 ((DocumentListener) listeners.elementAt(i)).contentEdited(e);
2721 }
2722 }
2723 }
2724
2725 public void resourceLoaded(CreoleEvent e) {
2726 }
2727 public void resourceUnloaded(CreoleEvent e) {
2728 }
2729 public void datastoreOpened(CreoleEvent e) {
2730 }
2731 public void datastoreCreated(CreoleEvent e) {
2732 }
2733 public void resourceRenamed(Resource resource, String oldName,
2734 String newName){
2735 }
2736 public void datastoreClosed(CreoleEvent e) {
2737 if (! e.getDatastore().equals(this.getDataStore()))
2738 return;
2739 Factory.deleteResource(this);
2742 }
2743 public void setLRPersistenceId(Object lrID) {
2744 super.setLRPersistenceId( lrID);
2745 Gate.getCreoleRegister().addCreoleListener(this);
2748 }
2749 public void resourceAdopted(DatastoreEvent evt) {
2750 }
2751 public void resourceDeleted(DatastoreEvent evt) {
2752 if(! evt.getSource().equals(this.getDataStore()))
2753 return;
2754 if(evt.getResourceID().equals(this.getLRPersistenceId()))
2757 Factory.deleteResource(this);
2758 }
2759 public void resourceWritten(DatastoreEvent evt) {
2760 }
2761 public void setDataStore(DataStore dataStore) throws gate.persist.PersistenceException {
2762 super.setDataStore( dataStore);
2763 if (this.dataStore != null)
2764 this.dataStore.addDatastoreListener(this);
2765 }
2766
2767
2772 public void setDefaultAnnotations(AnnotationSet defaultAnnotations) {
2773 defaultAnnots = defaultAnnotations;
2774 }
2775
2776}