package gate.compound.impl; import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.StringReader; import java.io.StringWriter; import java.net.URL; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.Vector; import gate.AnnotationSet; import gate.DataStore; import gate.Document; import gate.DocumentContent; import gate.Factory; import gate.FeatureMap; import gate.Gate; import gate.Resource; import gate.alignment.Alignment; import gate.annotation.AnnotationSetImpl; import gate.compound.CompoundDocument; import gate.corpora.DocumentContentImpl; import gate.corpora.DocumentImpl; import gate.creole.ResourceInstantiationException; import gate.event.CreoleEvent; import gate.event.DatastoreEvent; import gate.event.DocumentListener; import gate.util.GateRuntimeException; import gate.util.InvalidOffsetException; import gate.util.SimpleFeatureMapImpl; import gate.util.Strings; /** * This is an abstract implementation of the AbstractAlignedDocument * This class overrides the methods of DocumentImpl and provide generic * implementation of some of the methods of AlignedDocument * * @author niraj */ public abstract class AbstractCompoundDocument extends DocumentImpl implements CompoundDocument { /** The encoding of the source of the document content */ protected String encoding = null; /** * If you set this flag to true the repositioning information for the * document will be kept in the document feature. <br> * Default value is false to avoid the unnecessary waste of time and * memory */ protected Boolean collectRepositioningInfo = new Boolean(false); /** * If you set this flag to true the original content of the document * will be kept in the document feature. <br> * Default value is false to avoid the unnecessary waste of memory */ protected Boolean preserveOriginalContent = new Boolean(false); /** * Languages */ protected List<String> documentIDs; /** The source URL */ protected URL sourceUrl; /** * Curent Document */ protected Document currentDocument; /** * Available documents */ protected Map<String, Document> documents; /** Freeze the serialization UID. */ static final long serialVersionUID = -8456893608311510260L; private transient Vector<DocumentListener> documentListeners; /** * The start of the range that the content comes from at the source * URL (or null if none). */ protected Long sourceUrlStartOffset; /** * The end of the range that the content comes from at the source URL * (or null if none). */ protected Long sourceUrlEndOffset; /** Is the document markup-aware? */ protected Boolean markupAware = new Boolean(false); /** Clear all the data members of the object. */ public void cleanup() { Iterator<Document> iter = documents.values().iterator(); while(iter.hasNext()) { Document doc = iter.next(); doc.cleanup(); } } // cleanup() /** Cover unpredictable Features creation */ public FeatureMap getFeatures() { if(currentDocument == null) { if(features == null) { features = new SimpleFeatureMapImpl(); } return features; } else { return currentDocument.getFeatures(); } } /** Documents are identified by URLs */ public URL getSourceUrl() { if(currentDocument == null) { return sourceUrl; } return currentDocument.getSourceUrl(); } /** Set method for the document's URL */ public void setSourceUrl(URL sourceUrl) { if(currentDocument == null) { this.sourceUrl = sourceUrl; } else { currentDocument.setSourceUrl(sourceUrl); } } // setSourceUrl /** * Documents may be packed within files; in this case an optional pair * of offsets refer to the location of the document. */ public Long[] getSourceUrlOffsets() { if(currentDocument == null) { return new Long[] {sourceUrlStartOffset, sourceUrlEndOffset}; } return currentDocument.getSourceUrlOffsets(); } // getSourceUrlOffsets /** * Allow/disallow preserving of the original document content. If is * <B>true</B> the original content will be retrieved from the * DocumentContent object and preserved as document feature. */ public void setPreserveOriginalContent(Boolean b) { if(currentDocument == null) { this.preserveOriginalContent = b; } else { currentDocument.setPreserveOriginalContent(b); } } // setPreserveOriginalContent /** * Get the preserving of content status of the Document. * * @return whether the Document should preserve it's original content. */ public Boolean getPreserveOriginalContent() { if(currentDocument == null) { return preserveOriginalContent; } else { return currentDocument.getPreserveOriginalContent(); } } // getPreserveOriginalContent /** * Allow/disallow collecting of repositioning information. If is * <B>true</B> information will be retrieved and preserved as * document feature.<BR> * Preserving of repositioning information give the possibilities for * converting of coordinates between the original document content and * extracted from the document text. */ public void setCollectRepositioningInfo(Boolean b) { if(currentDocument == null) { collectRepositioningInfo = b; } else { currentDocument.setCollectRepositioningInfo(b); } } // setCollectRepositioningInfo /** * Get the collectiong and preserving of repositioning information for * the Document. <BR> * Preserving of repositioning information give the possibilities for * converting of coordinates between the original document content and * extracted from the document text. * * @return whether the Document should collect and preserve * information. */ public Boolean getCollectRepositioningInfo() { if(currentDocument == null) { return collectRepositioningInfo; } else { return currentDocument.getCollectRepositioningInfo(); } } // getCollectRepositioningInfo /** * Documents may be packed within files; in this case an optional pair * of offsets refer to the location of the document. This method gets * the start offset. */ public Long getSourceUrlStartOffset() { if(currentDocument == null) { return sourceUrlStartOffset; } else { return currentDocument.getSourceUrlStartOffset(); } } /** * Documents may be packed within files; in this case an optional pair * of offsets refer to the location of the document. This method sets * the start offset. */ public void setSourceUrlStartOffset(Long sourceUrlStartOffset) { if(currentDocument == null) { this.sourceUrlStartOffset = sourceUrlStartOffset; } else { currentDocument.setSourceUrlStartOffset(sourceUrlStartOffset); } } // setSourceUrlStartOffset /** * Documents may be packed within files; in this case an optional pair * of offsets refer to the location of the document. This method gets * the end offset. */ public Long getSourceUrlEndOffset() { if(currentDocument == null) { return sourceUrlEndOffset; } else { return currentDocument.getSourceUrlEndOffset(); } } /** * Documents may be packed within files; in this case an optional pair * of offsets refer to the location of the document. This method sets * the end offset. */ public void setSourceUrlEndOffset(Long sourceUrlEndOffset) { if(currentDocument == null) { this.sourceUrlEndOffset = sourceUrlEndOffset; } else { currentDocument.setSourceUrlEndOffset(sourceUrlEndOffset); } } // setSourceUrlStartOffset /** * The content of the document: a String for text; MPEG for video; * etc. */ public DocumentContent getContent() { if(currentDocument == null) { // throw new RuntimeException( // "CompoundDocumentImpl does not contain any text but its member // does!" // + " Please use the setDocument(String documentID) to set a // specific document!"); return new DocumentContentImpl(""); } else { return currentDocument.getContent(); } } /** Set method for the document content */ public void setContent(DocumentContent content) { if(currentDocument == null) { System.err .println("CompoundDocumentImpl does not have any content but its member does!" + " Please use the setDocument(String documentID) to set a specific document!"); } else { currentDocument.setContent(content); } } /** Get the encoding of the document content source */ public String getEncoding() { if(currentDocument == null) { return this.encoding; } else { return ((DocumentImpl)currentDocument).getEncoding(); } } /** Set the encoding of the document content source */ public void setEncoding(String encoding) { if(currentDocument == null) { this.encoding = encoding; } else { ((DocumentImpl)currentDocument).setEncoding(encoding); } } /** * Get the default set of annotations. The set is created if it * doesn't exist yet. */ public AnnotationSet getAnnotations() { if(currentDocument == null) { System.err .println("CompoundDocumentImpl does not have any annotationSet but its member does!" + " Please use the setDocument(String documentID) to set a specific document!"); return new AnnotationSetImpl(this); } else { return currentDocument.getAnnotations(); } } // getAnnotations() /** * Get a named set of annotations. Creates a new set if one with this * name doesn't exist yet. If the provided name is null then it * returns the default annotation set. */ public AnnotationSet getAnnotations(String name) { if(currentDocument == null) { System.err .println("CompoundDocumentImpl does not have any annotationSet but its member does!" + " Please use the setDocument(String documentID) to set a specific document!"); return new AnnotationSetImpl(this); } else { return currentDocument.getAnnotations(name); } } // getAnnotations(name) /** * Make the document markup-aware. This will trigger the creation of a * DocumentFormat object at Document initialisation time; the * DocumentFormat object will unpack the markup in the Document and * add it as annotations. Documents are <B>not</B> markup-aware by * default. * * @param newMarkupAware markup awareness status. */ public void setMarkupAware(Boolean newMarkupAware) { if(currentDocument == null) { this.markupAware = newMarkupAware; } else { currentDocument.setMarkupAware(newMarkupAware); } } /** * Get the markup awareness status of the Document. <B>Documents are * markup-aware by default.</B> * * @return whether the Document is markup aware. */ public Boolean getMarkupAware() { if(currentDocument == null) { return this.markupAware; } else { return currentDocument.getMarkupAware(); } } /** * Returns an XML document aming to preserve the original markups( the * original markup will be in the same place and format as it was * before processing the document) and include (if possible) the * annotations specified in the aSourceAnnotationSet. It is equivalent * to toXml(aSourceAnnotationSet, true). */ public String toXml(Set aSourceAnnotationSet) { if(currentDocument == null) { System.err .println("CompoundDocumentImpl does not implement toXml(Set) but its member does!" + " Please use the setDocument(String documentID) to set a specific document!"); return currentDocument.toXml(new AnnotationSetImpl(this)); } else { return currentDocument.toXml(aSourceAnnotationSet); } } /** * Returns an XML document aming to preserve the original markups( the * original markup will be in the same place and format as it was * before processing the document) and include (if possible) the * annotations specified in the aSourceAnnotationSet. <b>Warning:</b> * Annotations from the aSourceAnnotationSet will be lost if they will * cause a crosed over situation. * * @param aSourceAnnotationSet is an annotation set containing all the * annotations that will be combined with the original marup * set. If the param is <code>null</code> it will only dump * the original markups. * @param includeFeatures is a boolean that controls whether the * annotation features should be included or not. If false, * only the annotation type is included in the tag. * @return a string representing an XML document containing the * original markup + dumped annotations form the * aSourceAnnotationSet */ public String toXml(Set aSourceAnnotationSet, boolean includeFeatures) { if(currentDocument == null) { System.err .println("CompoundDocumentImpl does not implement toXml(Set, boolean) but its member does!" + " Please use the setDocument(String documentID) to set a specific document!"); return currentDocument.toXml(new AnnotationSetImpl(this)); } else { return currentDocument.toXml(aSourceAnnotationSet, includeFeatures); } }// End toXml() /** * Returns a GateXml document that is a custom XML format for wich * there is a reader inside GATE called gate.xml.GateFormatXmlHandler. * What it does is to serialize a GATE document in an XML format. * * @return a string representing a Gate Xml document. */ public String toXml() { if(currentDocument == null) { System.err .println("CompoundDocumentImpl does not implement toXml() but its member does!" + " Please use the setDocument(String documentID) to set a specific document!"); return currentDocument.toXml(new AnnotationSetImpl(this)); } else { return currentDocument.toXml(); } }// toXml /** * Gives a single XML representation for the entire document. * * @param aCompoundDoc * @return */ public static String toXmlAsASingleDocument(CompoundDocument aCompoundDoc) { Map<String, String> docXmls = new HashMap<String, String>(); Map<String, Object> globalMap = new HashMap<String, Object>(); for(String id : aCompoundDoc.getDocumentIDs()) { docXmls.put(id, aCompoundDoc.getDocument(id).toXml()); } // add document xmls globalMap.put("docXmls", docXmls); // we would use XStream library to store annic patterns com.thoughtworks.xstream.XStream xstream = new com.thoughtworks.xstream.XStream(); // Saving is accomplished just using XML serialization of the map. StringWriter stringToReturn = new StringWriter(); // other features Map<String, Object> features = new HashMap<String, Object>(); features.put("encoding", aCompoundDoc.getEncoding()); features.put("collectRepositioningInfo", aCompoundDoc .getCollectRepositioningInfo()); features.put("preserveOriginalContent", aCompoundDoc .getPreserveOriginalContent()); features.put("documentIDs", aCompoundDoc.getDocumentIDs()); features.put("markupAware", new Boolean(true)); features.put("name", aCompoundDoc.getName()); globalMap.put("feats", features); Document aDoc = aCompoundDoc.getCurrentDocument(); aCompoundDoc.setCurrentDocument(null); globalMap.put("docFeats", aCompoundDoc.getFeatures()); if(aDoc != null) aCompoundDoc.setCurrentDocument(aDoc.getName()); xstream.toXML(globalMap, stringToReturn); return stringToReturn.toString(); } /** * Loads the compound document with given xmlString. Please note that * the string should have been generated with the * toXmlAsASingleDocument method. * * @param xmlString * @return */ public static CompoundDocument fromXml(String xmlString) { StringReader reader = new StringReader(xmlString); com.thoughtworks.xstream.XStream xstream = new com.thoughtworks.xstream.XStream( new com.thoughtworks.xstream.io.xml.StaxDriver()); // asking the xstream library to use gate class loader xstream.setClassLoader(Gate.getClassLoader()); // reading the xml object Map<String, Object> globalMap = (HashMap<String, Object>)xstream .fromXML(reader); // now we read individual information Map<String, String> docXmls = (HashMap<String, String>)globalMap .get("docXmls"); Map<String, Object> features = (Map<String, Object>)globalMap.get("feats"); String encoding = (String)features.get("encoding"); try { File tempFile = File.createTempFile("example", ".xml"); File tempFolder = new File(tempFile.getParentFile(), "temp" + Gate.genSym()); if(!tempFolder.exists() && !tempFolder.mkdirs()) { throw new GateRuntimeException("Temporary folder " + tempFolder.getAbsolutePath() + " could not be created"); } tempFile.deleteOnExit(); tempFolder.deleteOnExit(); URL sourceUrl = null; List<String> docIDs = new ArrayList<String>(); for(String id : docXmls.keySet()) { docIDs.add(id); File newFile = new File("X." + id + ".xml"); newFile.deleteOnExit(); BufferedWriter bw = new BufferedWriter(new OutputStreamWriter( new FileOutputStream(newFile), encoding)); bw.write(docXmls.get(id)); bw.flush(); bw.close(); sourceUrl = newFile.toURI().toURL(); } features.put("sourceUrl", sourceUrl); String name = (String)features.get("name"); features.remove("name"); FeatureMap fets = Factory.newFeatureMap(); for(String s : features.keySet()) { fets.put(s, features.get(s)); } FeatureMap hideFeats = Factory.newFeatureMap(); //Gate.setHiddenAttribute(hideFeats, true); // CompoundDocument cd = new gate.compound.impl.CompoundDocumentImpl(); // cd.setName(name); // cd.setSourceUrl(sourceUrl); // ((CompoundDocumentImpl) cd).setEncoding(encoding); // ((CompoundDocumentImpl) cd).setDocumentIDs(docIDs); // cd.init(); CompoundDocument cd = (CompoundDocument)Factory.createResource( "gate.compound.impl.CompoundDocumentImpl", fets, hideFeats); cd.setName(name); Document aDoc = cd.getCurrentDocument(); cd.setCurrentDocument(null); FeatureMap docFeatures = (FeatureMap)globalMap.get("docFeats"); for(Object key : docFeatures.keySet()) { Object value = docFeatures.get(key); if(value instanceof Alignment) { ((Alignment)value).setSourceDocument(cd); } } cd.setFeatures(docFeatures); if(aDoc != null) cd.setCurrentDocument(aDoc.getName()); return cd; } catch(IOException ioe) { throw new GateRuntimeException(ioe); } catch(ResourceInstantiationException rie) { throw new GateRuntimeException(rie); } finally { if(reader != null) reader.close(); } } /** * Returns a map with the named annotation sets. It returns * <code>null</code> if no named annotaton set exists. */ public Map<String, AnnotationSet> getNamedAnnotationSets() { if(currentDocument == null) { System.err .println("CompoundDocumentImpl does not implement getNamedAnnotationSets() but its member does!" + " Please use the setDocument(String documentID) to set a specific document!"); return new HashMap<String, AnnotationSet>(); } else { return currentDocument.getNamedAnnotationSets(); } } // getNamedAnnotationSets public Set<String> getAnnotationSetNames() { if(currentDocument == null) { System.err .println("CompoundDocumentImpl does not implement getAnnotationSetNames() but its member does!" + " Please use the setDocument(String documentID) to set a specific document!"); return new HashSet<String>(); } else { return currentDocument.getAnnotationSetNames(); } } /** * Removes one of the named annotation sets. Note that the default * annotation set cannot be removed. * * @param name the name of the annotation set to be removed */ public void removeAnnotationSet(String name) { if(currentDocument == null) { System.err .println("CompoundDocumentImpl does not have any annotationSets!" + " Please use the setDocument(String documentID) to set a specific document!"); } else { currentDocument.removeAnnotationSet(name); } } /** Propagate edit changes to the document content and annotations. */ public void edit(Long start, Long end, DocumentContent replacement) throws InvalidOffsetException { if(currentDocument == null) { System.err .println("CompoundDocumentImpl does not have any content!" + " Please use the setDocument(String documentID) to set a specific document!"); } else { currentDocument.edit(start, end, replacement); } } // edit(start,end,replacement) /** Ordering based on URL.toString() and the URL offsets (if any) */ public int compareTo(Object o) throws ClassCastException { CompoundDocument other = (CompoundDocument)o; return toString().compareTo(other.toString()); } // compareTo /** String respresentation */ public String toString() { if(currentDocument == null) { String n = Strings.getNl(); StringBuffer s = new StringBuffer("CompoundDocumentImpl: " + n); s.append(" encoding:" + encoding + n); s.append(" features:" + features + n); s.append(" markupAware:" + markupAware + n); s.append(" sourceUrl:" + sourceUrl + n); s.append(n); return s.toString(); } else { return currentDocument.toString(); } } // toString public void removeDocument(String documentID) { Document doc = (Document)documents.get(documentID); if(doc == null) return; Factory.deleteResource(doc); } public synchronized void removeDocumentListener(DocumentListener l) { if(currentDocument != null) { currentDocument.removeDocumentListener(l); } else { if(documentListeners != null && documentListeners.contains(l)) { Vector v = (Vector)documentListeners.clone(); v.removeElement(l); documentListeners = v; } } } public synchronized void addDocumentListener(DocumentListener l) { if(currentDocument != null) { currentDocument.addDocumentListener(l); } else { Vector v = documentListeners == null ? new Vector<DocumentListener>(2) : (Vector)documentListeners.clone(); if(!v.contains(l)) { v.addElement(l); documentListeners = v; } } } public void resourceLoaded(CreoleEvent e) { } public void resourceUnloaded(CreoleEvent e) { } public void datastoreOpened(CreoleEvent e) { } public void datastoreCreated(CreoleEvent e) { } public void resourceRenamed(Resource resource, String oldName, String newName) { } private void deleteAllDocs() { Set keys = documents.keySet(); Iterator iter = keys.iterator(); while(iter.hasNext()) { Object key = iter.next(); Document doc = (Document)documents.get(key); Factory.deleteResource(doc); } } public void datastoreClosed(CreoleEvent e) { if(!e.getDatastore().equals(this.getDataStore())) return; // we also remove other documents deleteAllDocs(); // close this lr, since it cannot stay open when the DS it comes // from // is closed Factory.deleteResource(this); } public void setLRPersistenceId(Object lrID) { super.setLRPersistenceId(lrID); // make persistent documents listen to the creole register // for events about their DS Gate.getCreoleRegister().addCreoleListener(this); } public void resourceAdopted(DatastoreEvent evt) { } public void resourceDeleted(DatastoreEvent evt) { if(!evt.getSource().equals(this.getDataStore())) return; // if an open document is deleted from a DS, then // it must close itself immediately, as is no longer valid if(evt.getResourceID().equals(this.getLRPersistenceId())) { deleteAllDocs(); Factory.deleteResource(this); } } public void resourceWritten(DatastoreEvent evt) { } public void setDataStore(DataStore dataStore) throws gate.persist.PersistenceException { super.setDataStore(dataStore); if(this.dataStore != null) this.dataStore.addDatastoreListener(this); } public Document getCurrentDocument() { return currentDocument; } public Document getDocument(String documentID) { Object obj = documents.get(documentID); if(obj == null) { return this; } else { return (Document)obj; } } public void setCurrentDocument(String documentID) { if(documentID == null) { currentDocument = null; return; } Object obj = documents.get(documentID); if(obj == null) { currentDocument = null; } else { currentDocument = (Document)obj; } } public Map getDocuments() { return documents; } public List<String> getDocumentIDs() { return documentIDs; } public void setDocumentIDs(List<String> docIDs) { if(docIDs != null) { this.documentIDs = new ArrayList<String>(); this.documentIDs.addAll(docIDs); } else { this.documentIDs = null; } } }