/*
 *  TestXml.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Cristian URSU,  8/May/2000
 *
 *  $Id: TestXml.java 17656 2014-03-14 08:55:23Z markagreenwood $
 */

package gate.xml;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Corpus;
import gate.Document;
import gate.DocumentFormat;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.GateConstants;
import gate.corpora.DocumentImpl;
import gate.corpora.TestDocument;
import gate.creole.ANNIEConstants;
import gate.creole.ConditionalSerialAnalyserController;
import gate.util.Files;
import gate.util.persistence.PersistenceManager;

import java.io.File;
import java.net.URL;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;

/**
 * Tests for the XML facilities: unpacking markup from source documents and
 * round-tripping GATE documents through the GATE XML serialisation
 * ({@code toXml()} followed by a reload).
 */
public class TestXml extends TestCase {

  /** The encoding used when writing and re-reading temporary XML files. */
  private static final String workingEncoding = "UTF-8";

  /**
   * Construction.
   *
   * @param name the name of the test case, passed on to {@link TestCase}
   */
  public TestXml(String name) {
    super(name);
  }

  /** Fixture set up; nothing to prepare. */
  @Override
  public void setUp() {
  } // setUp

  /**
   * Runs the export/reload round trip on several documents of different
   * source formats (XML, HTML, e-mail) taken from the test server.
   *
   * @throws Exception if a URL is malformed or any GATE operation fails
   */
  public void testGateDocumentToAndFromXmlWithDifferentKindOfFormats()
      throws Exception {
    // Two parallel lists: the i-th description belongs to the i-th URL.
    // new URL(...) either returns an object or throws, so no null checks
    // are needed after construction.
    List<URL> urlList = new LinkedList<URL>();
    List<String> urlDescription = new LinkedList<String>();

    String server = TestDocument.getTestServerName();

    urlList.add(new URL(server + "tests/xml/xces.xml"));
    urlDescription.add(" an XML document ");

    urlList.add(new URL(server + "tests/xml/Sentence.xml"));
    urlDescription.add(" an XML document ");

    urlList.add(new URL(server + "tests/html/test1.htm"));
    urlDescription.add(" an HTML document ");

    urlList.add(new URL(server + "tests/email/test2.eml"));
    urlDescription.add(" an EMAIL document ");

    Iterator<URL> iter = urlList.iterator();
    Iterator<String> descrIter = urlDescription.iterator();
    while (iter.hasNext()) {
      runCompleteTestWithAFormat(iter.next(), descrIter.next());
    }
  } // testGateDocumentToAndFromXmlWithDifferentKindOfFormats

  /**
   * Loads the document at {@code url}, unpacks its markup, exports it as
   * GATE XML into a temporary file, reloads that file and checks that the
   * content size and the size of the default annotation set were preserved.
   *
   * @param url the location of the key document
   * @param urlDescription a human readable description of the expected
   *          format, used only in failure messages
   * @throws Exception if any GATE operation fails
   */
  private void runCompleteTestWithAFormat(URL url, String urlDescription)
      throws Exception {
    // Load the key document without letting GATE unpack the markup; the
    // unpacking is done explicitly below.
    FeatureMap params = Factory.newFeatureMap();
    params.put(Document.DOCUMENT_URL_PARAMETER_NAME, url);
    params.put(Document.DOCUMENT_MARKUP_AWARE_PARAMETER_NAME, "false");
    Document keyDocument =
        (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
    assertNotNull("Coudn't create a GATE document instance for "
        + url.toString() + " Can't continue.", keyDocument);

    DocumentFormat keyDocFormat = DocumentFormat.getDocumentFormat(
        keyDocument, keyDocument.getSourceUrl());
    assertNotNull("Fail to recognize " + url.toString() + " as being "
        + urlDescription + " !", keyDocFormat);

    // Unpack the markup
    keyDocFormat.unpackMarkup(keyDocument);
    // Verify that the annotations of the key document are consistent
    TestDocument.verifyNodeIdConsistency(keyDocument);
    // Verify that the annotation ID generator is above the highest ID in use
    verifyAnnotationIDGenerator(keyDocument);

    // Remember the size of the document and the number of annotations
    long keyDocumentSize = keyDocument.getContent().size().longValue();
    int keyDocumentAnnotationSetSize = keyDocument.getAnnotations().size();

    // Export the key document as GATE XML into a temp file, using the
    // working encoding
    File xmlFile = Files.writeTempFile(keyDocument.toXml(), workingEncoding);
    assertNotNull("The temp GATE XML file is null. Can't continue.", xmlFile);

    try {
      URL xmlFileUrl = xmlFile.toURI().toURL();

      // Load the GATE XML document from the temp file into memory
      Document gateDoc = Factory.newDocument(xmlFileUrl, workingEncoding);
      assertNotNull("Coudn't create a GATE document instance for "
          + xmlFileUrl.toString() + " Can't continue.", gateDoc);

      DocumentFormat gateDocFormat =
          DocumentFormat.getDocumentFormat(gateDoc, gateDoc.getSourceUrl());
      assertNotNull("Fail to recognize " + xmlFileUrl.toString()
          + " as being a GATE XML document !", gateDocFormat);
      gateDocFormat.unpackMarkup(gateDoc);
      // Verify that the annotations of the reloaded document are consistent
      TestDocument.verifyNodeIdConsistency(gateDoc);

      // Measure the RELOADED document. The sizes used to be read from
      // keyDocument again, which compared the key document with itself and
      // made the two assertions below impossible to fail.
      long gateDocSize = gateDoc.getContent().size().longValue();
      int gateDocAnnotationSetSize = gateDoc.getAnnotations().size();

      assertTrue("Exporting as GATE XML resulted in document content size lost."
          + " Something went wrong.", keyDocumentSize == gateDocSize);

      assertTrue("Exporting as GATE XML resulted in annotation lost."
          + " No. of annotations missing =  "
          + Math.abs(keyDocumentAnnotationSetSize - gateDocAnnotationSetSize),
          keyDocumentAnnotationSetSize == gateDocAnnotationSetSize);

      // Verify that the annotation ID generator is above the highest ID in use
      verifyAnnotationIDGenerator(gateDoc);
    } finally {
      // The temp GATE XML file is no longer needed, even if a check failed.
      xmlFile.delete();
    }
  } // runCompleteTestWithAFormat

  /**
   * Loads {@code tests/xml/xces.xml} from the test server and checks the
   * number of annotations found in the original markups annotation set.
   *
   * @throws Exception if the URL is malformed or the document can't be loaded
   */
  public void testUnpackMarkup() throws Exception {
    Document doc = Factory.newDocument(
        new URL(TestDocument.getTestServerName() + "tests/xml/xces.xml"),
        workingEncoding);

    AnnotationSet annotSet =
        doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
    assertEquals("For " + doc.getSourceUrl() + " the number of annotations"
        + " should be:758", 758, annotSet.size());

    TestDocument.verifyNodeIdConsistency(doc);

    // Verify that the annotation ID generator is above the highest ID in use
    verifyAnnotationIDGenerator(doc);
  } // testUnpackMarkup()

  /**
   * Runs ANNIE with defaults on a document, then saves it as a GATE XML
   * document and loads it back. All the annotations on the loaded document
   * should be the same as the original ones.
   *
   * It also verifies that the annotations referenced through the
   * {@code matches} feature still exist, unchanged, after the
   * export/import to XML.
   *
   * @throws Exception if loading ANNIE or any GATE operation fails
   */
  public void testAnnotationConsistencyForSaveAsXml() throws Exception {
    // Load a document from the test repository
    String testDoc = Files.getGateResourceAsString(
        "gate.ac.uk/tests/xml/gateTestSaveAsXML.xml");
    Document origDoc = Factory.newDocument(testDoc);

    // Verify that the annotation ID generator is above the highest ID in use
    verifyAnnotationIDGenerator(origDoc);

    // Load ANNIE with defaults and run it on the document
    File annieFile = new File(
        new File(Gate.getPluginsHome(), ANNIEConstants.PLUGIN_DIR),
        ANNIEConstants.DEFAULT_FILE);
    ConditionalSerialAnalyserController annie =
        (ConditionalSerialAnalyserController)
            PersistenceManager.loadObjectFromFile(annieFile);
    assertNotNull("ANNIE not loaded!", annie);

    Corpus c = Factory.newCorpus("test");
    c.add(origDoc);
    annie.setCorpus(c);
    annie.execute();

    // Save as XML and reload the document into another GATE doc:
    // export origDoc as XML into a temp file, using the working encoding
    File xmlFile = Files.writeTempFile(origDoc.toXml(), workingEncoding);
    try {
      System.out.println("Saved to temp file :" + xmlFile.toURI().toURL());

      Document reloadedDoc =
          Factory.newDocument(xmlFile.toURI().toURL(), workingEncoding);
      // Verify the ID generator of the reloaded document as well
      verifyAnnotationIDGenerator(reloadedDoc);

      // Verify that the annotations are identical in the two docs
      Map<Integer, Annotation> origAnnotMap = buildID2AnnotMap(origDoc);
      Map<Integer, Annotation> reloadedAnnMap = buildID2AnnotMap(reloadedDoc);
      verifyIDConsistency(origAnnotMap, reloadedAnnMap);

      // Build the original matches map: ID -> list of IDs
      Map<Integer, List<Integer>> origMatchesMap = buildMatchesMap(origDoc);

      // Compare every original annotation referenced by the matches map
      // (as a key or inside a matches list) with its reloaded counterpart
      for (Map.Entry<Integer, List<Integer>> entry
          : origMatchesMap.entrySet()) {
        assertSameAnnotInBothMaps(entry.getKey(), origAnnotMap,
            reloadedAnnMap);
        for (Integer matchId : entry.getValue()) {
          assertSameAnnotInBothMaps(matchId, origAnnotMap, reloadedAnnMap);
        }
      }
    } finally {
      // Clean up the XML file, even if a check failed.
      xmlFile.delete();
    }
  } // End testAnnotationIDConsistencyForSaveAsXml

  /**
   * Asserts that an annotation with the given ID exists in both maps and
   * that the two annotations are the same according to
   * {@link #compareAnnot(Annotation, Annotation)}.
   *
   * @param id the annotation ID to look up
   * @param origAnnotMap a map by ID, containing the original annotations
   * @param reloadedAnnMap a map by ID, containing the reloaded annotations
   */
  private void assertSameAnnotInBothMaps(Integer id,
      Map<Integer, Annotation> origAnnotMap,
      Map<Integer, Annotation> reloadedAnnMap) {
    Annotation origAnnot = origAnnotMap.get(id);
    assertNotNull("Couldn't find an original annot with ID=" + id, origAnnot);
    Annotation reloadedAnnot = reloadedAnnMap.get(id);
    assertNotNull("Couldn't find a reloaded annot with ID=" + id,
        reloadedAnnot);
    compareAnnot(origAnnot, reloadedAnnot);
  }

  /**
   * Builds a map based on the {@code matches} feature of some annotations.
   * The goal is to use this map to validate the annotations from the
   * reloaded document. If no annotation has the {@code matches} feature,
   * an empty map is returned.
   *
   * @param doc the document whose annotations are used to build the map
   * @return a map from annotation ID to the list of annotation IDs stored
   *         in that annotation's {@code matches} feature
   */
  private Map<Integer, List<Integer>> buildMatchesMap(Document doc) {
    Map<Integer, List<Integer>> matchesMap =
        new HashMap<Integer, List<Integer>>();
    // Scan the default annotation set
    helperBuildMatchesMap(doc.getAnnotations(), matchesMap);
    // Scan all named annotation sets
    if (doc.getNamedAnnotationSets() != null) {
      for (AnnotationSet namedSet : doc.getNamedAnnotationSets().values()) {
        helperBuildMatchesMap(namedSet, matchesMap);
      }
    }
    return matchesMap;
  } // End of buildMatchesMap()

  /**
   * Helper method. Scans an annotation set and, for every annotation that
   * has a {@code matches} feature, stores the feature value in the map
   * under the annotation's ID.
   *
   * @param sourceAnnotSet the annotation set investigated
   * @param aMap the map being filled in
   */
  private void helperBuildMatchesMap(AnnotationSet sourceAnnotSet,
      Map<Integer, List<Integer>> aMap) {
    for (Annotation a : sourceAnnotSet) {
      FeatureMap aFeatMap = a.getFeatures();
      // Skip annotations which don't have features
      if (aFeatMap == null) {
        continue;
      }
      // Extract the matches feature; the cast is unchecked because feature
      // values are untyped.
      @SuppressWarnings("unchecked")
      List<Integer> matchesVal = (List<Integer>) aFeatMap.get("matches");
      if (matchesVal == null) {
        continue;
      }
      aMap.put(a.getId(), matchesVal);
    }
  } // End of helperBuildMatchesMap()

  /**
   * Tests that the generator for new annotation IDs is greater than the
   * maximum annotation ID present in the GATE document. In other words, it
   * ensures that new annotations will receive a unique ID. Does nothing if
   * the document has no annotations.
   *
   * @param aDoc the GATE document being tested; it is cast to
   *          {@link DocumentImpl} to read the next annotation ID
   */
  protected void verifyAnnotationIDGenerator(gate.Document aDoc) {
    // Collect all the annotations of the document by ID. Building the map
    // also fails the test if two annotations share the same ID.
    Map<Integer, Annotation> id2AnnotationMap = buildID2AnnotMap(aDoc);

    if (id2AnnotationMap == null || id2AnnotationMap.isEmpty()) {
      // No annotations found on the document: nothing to test.
      return;
    }

    // Sort the IDs and take the highest one
    Set<Integer> keysSet = id2AnnotationMap.keySet();
    TreeSet<Integer> sortedSet = new TreeSet<Integer>(keysSet);
    Integer maxAnnotId = sortedSet.last();

    // Compare its value to the one held by the document's ID generator
    Integer generatorId = ((DocumentImpl) aDoc).getNextAnnotationId();
    assertTrue("Annotation ID generator[" + generatorId + "] on document ["
        + aDoc.getSourceUrl()
        + "] was equal or less than the MAX Annotation ID[" + maxAnnotId
        + "] on the document."
        + " This may lead to Annotation ID conflicts.",
        generatorId.intValue() > maxAnnotId.intValue());
  } // End of verifyAnnotationIDGenerator()

  /**
   * Verifies that the two maps hold annotations with the same IDs. The only
   * thing not checked are the features, as some of them could be lost in
   * the serialization/deserialization process.
   *
   * @param origAnnotMap a map by ID, containing the original annotations
   * @param reloadedAnnMap a map by ID, containing the recreated annotations
   */
  private void verifyIDConsistency(Map<Integer, Annotation> origAnnotMap,
      Map<Integer, Annotation> reloadedAnnMap) {
    assertEquals("Found a different number of annot in both documents.",
        origAnnotMap.keySet().size(), reloadedAnnMap.keySet().size());

    for (Map.Entry<Integer, Annotation> entry : origAnnotMap.entrySet()) {
      Integer id = entry.getKey();
      Annotation reloadedAnnot = reloadedAnnMap.get(id);
      assertNotNull("Annotation with ID=" + id
          + " was not found in the reloaded document.", reloadedAnnot);
      compareAnnot(entry.getValue(), reloadedAnnot);
    }
  } // End of verifyIDConsistency()

  /**
   * Tests that two annotations are the same, except for their features:
   * same ID, same type, same start offset and same end offset.
   *
   * @param origAnn the annotation from the original document
   * @param reloadedAnnot the annotation from the reloaded document
   */
  private void compareAnnot(Annotation origAnn, Annotation reloadedAnnot) {
    assertEquals("Found original and reloaded annot without the same ID!",
        origAnn.getId(), reloadedAnnot.getId());
    assertEquals("Found original and reloaded annot without the same TYPE!\n"
        + "Original was [" + origAnn.getType() + "] and reloaded was ["
        + reloadedAnnot.getType() + "].",
        origAnn.getType(), reloadedAnnot.getType());
    assertEquals(
        "Found original and reloaded annot without the same START offset!",
        origAnn.getStartNode().getOffset(),
        reloadedAnnot.getStartNode().getOffset());
    assertEquals(
        "Found original and reloaded annot without the same END offset!",
        origAnn.getEndNode().getOffset(),
        reloadedAnnot.getEndNode().getOffset());
  } // End of compareAnnot()

  /**
   * Adds every annotation of a set to an ID-to-annotation map, failing the
   * test if an ID is already present in the map.
   *
   * @param annotSet the annotation set to scan
   * @param id2AnnMap the map being filled in
   * @return the same map instance that was passed in
   */
  private Map<Integer, Annotation> addAnnotSet2Map(AnnotationSet annotSet,
      Map<Integer, Annotation> id2AnnMap) {
    for (Annotation a : annotSet) {
      Integer id = a.getId();
      assertTrue("Found two annotations(one with type = " + a.getType()
          + ")with the same ID=" + id, !id2AnnMap.containsKey(id));
      id2AnnMap.put(id, a);
    }
    return id2AnnMap;
  }

  /**
   * Scans a target document for all annotations (default set and every
   * named set) and builds a map from annotation ID to annotation in the
   * process. It also checks that no two annotations share the same ID.
   *
   * @param aDoc the GATE doc to be scanned
   * @return a map from annotation ID to annotation
   */
  private Map<Integer, Annotation> buildID2AnnotMap(Document aDoc) {
    Map<Integer, Annotation> id2AnnMap = new HashMap<Integer, Annotation>();
    // Scan the default annotation set
    addAnnotSet2Map(aDoc.getAnnotations(), id2AnnMap);
    // Scan all named annotation sets
    if (aDoc.getNamedAnnotationSets() != null) {
      for (AnnotationSet namedSet : aDoc.getNamedAnnotationSets().values()) {
        addAnnotSet2Map(namedSet, id2AnnMap);
      }
    }
    return id2AnnMap;
  } // End of buildID2AnnotMap()

  /**
   * Test suite routine for the test runner.
   *
   * @return a suite built from this class
   */
  public static Test suite() {
    return new TestSuite(TestXml.class);
  } // suite

} // class TestXml