/* * TestDocument.java * * Copyright (c) 1995-2012, The University of Sheffield. See the file * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt * * This file is part of GATE (see http://gate.ac.uk/), and is free * software, licenced under the GNU Library General Public License, * Version 2, June 1991 (in the distribution as file licence.html, * and also available at http://gate.ac.uk/gate/licence.html). * * Hamish Cunningham, 21/Jan/00 * * $Id: TestDocument.java 17656 2014-03-14 08:55:23Z markagreenwood $ */ package gate.corpora; import java.io.*; import java.net.URL; import java.net.UnknownHostException; import java.util.*; import junit.framework.*; import gate.*; import gate.util.BomStrippingInputStreamReader; import gate.util.Err; import gate.util.SimpleFeatureMapImpl; /** Tests for the Document classes */ public class TestDocument extends TestCase { /** Construction */ public TestDocument(String name) { super(name); setUp();} /** Base of the test server URL */ protected static String testServer = null; /** Name of test document 1 */ protected String testDocument1; /** Fixture set up */ @Override public void setUp() { //try{ // Gate.init(); //testServer = Gate.getUrl().toExternalForm(); testServer = getTestServerName(); //} catch (GateException e){ // e.printStackTrace(Err.getPrintWriter()); //} testDocument1 = "tests/html/test2.htm"; } // setUp /** Get the name of the test server */ public static String getTestServerName() { if(testServer != null) return testServer; else { try { URL url = Gate.getClassLoader().getResource("gate/resources/gate.ac.uk/"); testServer = url.toExternalForm(); } catch(Exception e) { } return testServer; } } /** Test ordering */ public void testCompareTo() throws Exception{ Document doc1 = null; Document doc2 = null; Document doc3 = null; doc1 = Factory.newDocument(new URL(testServer + "tests/def")); doc2 = Factory.newDocument(new URL(testServer + "tests/defg")); doc3 = Factory.newDocument(new URL(testServer + "tests/abc")); assertTrue(doc1.compareTo(doc2) < 0); assertTrue(doc1.compareTo(doc1) == 0); assertTrue(doc1.compareTo(doc3) > 0); } // testCompareTo() /** Test loading of the original document content */ public void testOriginalContentPreserving() throws Exception { Document doc = null; FeatureMap params; String encoding = "UTF-8"; String origContent; // test the default value of preserve content flag params = Factory.newFeatureMap(); params.put(Document.DOCUMENT_URL_PARAMETER_NAME, new URL(testServer + testDocument1)); params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding); doc = (Document) Factory.createResource("gate.corpora.DocumentImpl", params); origContent = (String) doc.getFeatures().get( GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME); assertNull( "The original content should not be preserved without demand.", origContent); params = Factory.newFeatureMap(); params.put(Document.DOCUMENT_URL_PARAMETER_NAME, new URL(testServer + testDocument1)); params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding); params.put(Document.DOCUMENT_PRESERVE_CONTENT_PARAMETER_NAME, new Boolean(true)); doc = (Document) Factory.createResource("gate.corpora.DocumentImpl", params); origContent = (String) doc.getFeatures().get( GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME); assertNotNull("The original content is not preserved on demand.", origContent); assertTrue("The original content size is zerro.", origContent.length()>0); } // testOriginalContentPreserving() /** A comprehensive test */ public void testLotsOfThings() { // check that the test URL is available URL u = null; try{ u = new URL(testServer + testDocument1); } catch (Exception e){ e.printStackTrace(Err.getPrintWriter()); } // get some text out of the test URL BufferedReader uReader = null; try { uReader = new BomStrippingInputStreamReader(u.openStream()); assertEquals(uReader.readLine(), "<HTML>"); } catch(UnknownHostException e) { // no network connection return; } catch(IOException e) { fail(e.toString()); } /* Document doc = new TextualDocument(testServer + testDocument1); AnnotationGraph ag = new AnnotationGraphImpl(); Tokeniser t = ... doc.getContent() tokenise doc using java stream tokeniser add several thousand token annotation select a subset */ } // testLotsOfThings public void testDocRender() throws Exception { Document doc = Factory.newDocument("Hi Mom"); doc.getAnnotations().add(new Long(0), new Long(2), "Foo", new SimpleFeatureMapImpl()); String content = doc.toXml(doc.getAnnotations(), false); // Will fail, content is "<Foo>Hi Mom</Foo>" assertEquals("<Foo>Hi</Foo> Mom", content); } /** The reason this is method begins with verify and not with test is that it * gets called by various other test methods. It is somehow a utility test * method. It should be called on all gate documents having annotation sets. */ public static void verifyNodeIdConsistency(gate.Document doc)throws Exception{ if (doc == null) return; Map<Long,Integer> offests2NodeId = new HashMap<Long,Integer>(); // Test the default annotation set AnnotationSet annotSet = doc.getAnnotations(); verifyNodeIdConsistency(annotSet,offests2NodeId, doc); // Test all named annotation sets if (doc.getNamedAnnotationSets() != null){ Iterator<AnnotationSet> namedAnnotSetsIter = doc.getNamedAnnotationSets().values().iterator(); while(namedAnnotSetsIter.hasNext()){ verifyNodeIdConsistency(namedAnnotSetsIter.next(),offests2NodeId,doc); }// End while }// End if // Test suceeded. The map is not needed anymore. offests2NodeId = null; }// verifyNodeIdConsistency(); /** This metod runs the test over an annotation Set. It is called from her * older sister. Se above. * @param annotSet is the annotation set being tested. * @param offests2NodeId is the Map used to test the consistency. * @param doc is used in composing the assert error messsage. */ public static void verifyNodeIdConsistency(gate.AnnotationSet annotSet, Map<Long,Integer> offests2NodeId, gate.Document doc) throws Exception{ if (annotSet == null || offests2NodeId == null) return; Iterator<Annotation> iter = annotSet.iterator(); while(iter.hasNext()){ Annotation annot = iter.next(); String annotSetName = (annotSet.getName() == null)? "Default": annotSet.getName(); // check the Start node if (offests2NodeId.containsKey(annot.getStartNode().getOffset())){ assertEquals("Found two different node IDs for the same offset( "+ annot.getStartNode().getOffset()+ " ).\n" + "START NODE is buggy for annotation(" + annot + ") from annotation set " + annotSetName + " of GATE document :" + doc.getSourceUrl(), annot.getStartNode().getId(), offests2NodeId.get(annot.getStartNode().getOffset())); }// End if // Check the End node if (offests2NodeId.containsKey(annot.getEndNode().getOffset())){ assertEquals("Found two different node IDs for the same offset("+ annot.getEndNode().getOffset()+ ").\n" + "END NODE is buggy for annotation(" + annot+ ") from annotation"+ " set " + annotSetName +" of GATE document :" + doc.getSourceUrl(), annot.getEndNode().getId(), offests2NodeId.get(annot.getEndNode().getOffset())); }// End if offests2NodeId.put(annot.getStartNode().getOffset(), annot.getStartNode().getId()); offests2NodeId.put(annot.getEndNode().getOffset(), annot.getEndNode().getId()); }// End while }//verifyNodeIdConsistency(); /** * Test to verify behaviour of the mimeType init parameter. */ public void testExplicitMimeType() throws Exception { // override the user config to make sure we DON'T add extra space on // unpackMarkup when parsing XML, whatever is set in the user config file. Object savedAddSpaceValue = Gate.getUserConfig().get( GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME); Gate.getUserConfig().put( GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME, "false"); try { String testXmlString = "<p>This is a <strong>TEST</strong>.</p>"; String xmlParsedContent = "This is a TEST."; String htmlParsedContent = "This is a TEST.\n"; // if we create a Document from this string WITHOUT setting a mime type, // it should be treated as plain text and not parsed. FeatureMap docParams = Factory.newFeatureMap(); docParams.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, testXmlString); docParams.put(Document.DOCUMENT_MARKUP_AWARE_PARAMETER_NAME, Boolean.TRUE); Document noMimeTypeDoc = (Document)Factory.createResource( DocumentImpl.class.getName(), docParams); assertEquals("Document created with no explicit mime type should have " + "unparsed XML as content.", testXmlString, noMimeTypeDoc.getContent().toString()); assertEquals("Document created with no explicit mime type should not " + "have any Original markups annotations.", 0, noMimeTypeDoc.getAnnotations( GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).size()); Factory.deleteResource(noMimeTypeDoc); noMimeTypeDoc = null; // if we create the same document with an explicit mime type of text/xml, // it should be parsed properly, and have two original markups // annotations. docParams.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, "text/xml"); Document xmlDoc = (Document)Factory.createResource( DocumentImpl.class.getName(), docParams); assertEquals("Document created with explicit mime type should have been " + "parsed as XML.", xmlParsedContent, xmlDoc.getContent().toString()); assertEquals("Document created with explicit mime type has wrong number " + "of Original markups annotations.", 2, xmlDoc.getAnnotations( GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).size()); Factory.deleteResource(xmlDoc); xmlDoc = null; // if we create the same document with an explicit mime type of text/html, // it should be parsed properly and have *4* original markups // annotations, as the HTML parser creates enclosing <html> and <body> // elements and a zero-length <head> annotation. docParams.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, "text/html"); Document htmlDoc = (Document)Factory.createResource( DocumentImpl.class.getName(), docParams); assertEquals("Document created with explicit mime type should have been " + "parsed as HTML.", htmlParsedContent, htmlDoc.getContent().toString()); assertEquals("Document created with explicit mime type has wrong number " + "of Original markups annotations.", 5, htmlDoc.getAnnotations( GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).size()); Factory.deleteResource(htmlDoc); htmlDoc = null; } finally { // restore the saved value for ADD_SPACE_ON_MARKUP_UNPACK if(savedAddSpaceValue == null) { Gate.getUserConfig().remove( GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME); } else { Gate.getUserConfig().put( GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME, savedAddSpaceValue); } } } /** Test suite routine for the test runner */ public static Test suite() { return new TestSuite(TestDocument.class); } // suite } // class TestDocument