|
TextualDocumentFormat |
|
1 /* 2 * TextualDocumentFormat.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Cristian URSU, 26/May/2000 12 * 13 * $Id: TextualDocumentFormat.java,v 1.21 2002/07/05 08:54:08 nasso Exp $ 14 */ 15 16 package gate.corpora; 17 18 import java.util.*; 19 import java.net.*; 20 21 import gate.util.*; 22 import gate.*; 23 import gate.creole.*; 24 25 import org.w3c.www.mime.*; 26 27 /** The format of Documents. Subclasses of DocumentFormat know about 28 * particular MIME types and how to unpack the information in any 29 * markup or formatting they contain into GATE annotations. Each MIME 30 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat, 31 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves 32 * with a static index residing here when they are constructed. Static 33 * getDocumentFormat methods can then be used to get the appropriate 34 * format class for a particular document. 35 */ 36 public class TextualDocumentFormat extends DocumentFormat 37 { 38 39 /** Debug flag */ 40 private static final boolean DEBUG = false; 41 42 /** Default construction */ 43 public TextualDocumentFormat() { super(); } 44 45 /** Initialise this resource, and return it. */ 46 public Resource init() throws ResourceInstantiationException{ 47 // Register plain text mime type 48 MimeType mime = new MimeType("text","plain"); 49 // Register the class handler for this mime type 50 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(), 51 this); 52 // Register the mime type with mine string 53 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime); 54 // Register file sufixes for this mime type 55 suffixes2mimeTypeMap.put("txt",mime); 56 suffixes2mimeTypeMap.put("text",mime); 57 // Set the mimeType for this language resource 58 setMimeType(mime); 59 return this; 60 } // init() 61 62 /** Unpack the markup in the document. This converts markup from the 63 * native format (e.g. XML, RTF) into annotations in GATE format. 64 * Uses the markupElementsMap to determine which elements to convert, and 65 * what annotation type names to use. 66 */ 67 public void unpackMarkup(Document doc) throws DocumentFormatException{ 68 if (doc == null || doc.getContent() == null) return; 69 setNewLineProperty(doc); 70 // Create paragraph annotations in the specified annotation set 71 int endOffset = doc.getContent().toString().length(); 72 int startOffset = 0; 73 annotateParagraphs(doc,startOffset,endOffset, 74 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 75 }//unpackMarkup 76 77 public void unpackMarkup(Document doc, RepositioningInfo repInfo, 78 RepositioningInfo ampCodingInfo) 79 throws DocumentFormatException { 80 unpackMarkup(doc); 81 } // unpackMarkup 82 83 84 /** 85 * Check the new line sequence and set document property. 86 * <BR> 87 * Possible values are CRLF, LFCR, CR, LF 88 */ 89 protected void setNewLineProperty(Document doc) { 90 String content = doc.getContent().toString(); 91 String newLineType = ""; 92 93 char ch = ' '; 94 char lastch = ' '; 95 for(int i=0; i < content.length(); ++i) { 96 ch = content.charAt(i); 97 if(lastch == '\r') { 98 if(ch == '\n') { 99 newLineType = "CRLF"; 100 break; 101 } 102 else { 103 newLineType = "CR"; 104 break; 105 } 106 } 107 if(lastch == '\n') { 108 if(ch == '\r') { 109 newLineType = "LFCR"; 110 break; 111 } 112 else { 113 newLineType = "LF"; 114 break; 115 } 116 } 117 lastch = ch; 118 } // for 119 120 doc.getFeatures().put(GateConstants.DOCUMENT_NEW_LINE_TYPE, newLineType); 121 } // setNewLineProperty() 122 123 /** Delete '\r' in combination CRLF or LFCR in document content */ 124 private void removeExtraNewLine(Document doc) { 125 String content = doc.getContent().toString(); 126 StringBuffer buff = new StringBuffer(content); 127 128 char ch = ' '; 129 char lastch = ' '; 130 for(int i=content.length()-1; i > -1; --i) { 131 ch = content.charAt(i); 132 if(ch == '\n' && lastch == '\r') { 133 buff.deleteCharAt(i+1); 134 } 135 if(ch == '\r' && lastch == '\n') { 136 buff.deleteCharAt(i); 137 ch = lastch; 138 } 139 lastch = ch; 140 } // for 141 142 doc.setContent(new DocumentContentImpl(buff.toString())); 143 } // removeExtraNewLine(Document doc) 144 145 /** This method annotates paragraphs in a GATE document. The investigated text 146 * spans beetween start and end offsets and the paragraph annotations are 147 * created in the annotSetName. If annotSetName is null then they are creted 148 * in the default annotation set. 149 * @param aDoc is the gate document on which the paragraph detection would 150 * be performed.If it is null or its content it's null then the method woul 151 * simply return doing nothing. 152 * @param startOffset is the index form the document content from which the 153 * paragraph detection will start 154 * @param endOffset is the offset where the detection will end. 155 * @param annotSetName is the name of the set in which paragraph annotation 156 * would be created.The annotation type created will be "paragraph" 157 */ 158 public void annotateParagraphs(Document aDoc,int startOffset,int endOffset, 159 String annotSetName)throws DocumentFormatException{ 160 // Simply return if the document is null or its content 161 if (aDoc == null || aDoc.getContent() == null) return; 162 // Simply return if the start is > than the end 163 if (startOffset > endOffset) return; 164 // Decide where to put the newly detected annotations 165 AnnotationSet annotSet = null; 166 if (annotSetName == null) 167 annotSet = aDoc.getAnnotations(); 168 else 169 annotSet = aDoc.getAnnotations(annotSetName); 170 // Extract the document content 171 String content = aDoc.getContent().toString(); 172 // This is the offset marking the start of a para 173 int startOffsetPara = startOffset; 174 // This marks the ned of a para 175 int endOffsetPara = endOffset; 176 // The initial sate of the FSA 177 int state = 1; 178 // This field marks that a BR entity was read 179 // A BR entity can be NL or NL CR, depending on the operating system (UNIX 180 // or DOS) 181 boolean readBR = false; 182 int index = startOffset; 183 while (index < endOffset){ 184 // Read the current char 185 char ch = content.charAt(index); 186 // Test if a BR entity was read 187 if (ch =='\n'){ 188 readBR = true; 189 // If \n is followed by a \r then advance the index in order to read a 190 // BR entity 191 while ((index+1 < endOffset) && (content.charAt(index+1) == '\r')) 192 index ++; 193 }// End if 194 switch(state){ 195 // It is the initial and also a final state 196 // Stay in state 1 while it reads whitespaces 197 case 1:{ 198 // If reads a non whitespace char then move to state 2 and record 199 // the beggining of a paragraph 200 if (!Character.isWhitespace(ch)){ 201 state = 2; 202 startOffsetPara = index; 203 }// End if 204 }break; 205 // It can be also a final state. 206 case 2:{ 207 // Stay in state 2 while reading chars != BR entities 208 if (readBR){ 209 // If you find a BR char go to state 3. The possible end of the para 210 // can be index. This will be confirmed by state 3. So, this is why 211 // the end of a para is recorded here. 212 readBR = false; 213 endOffsetPara = index; 214 state = 3; 215 }// End if 216 }break; 217 // It can be also a final state 218 // From state 3 there are only 2 possible ways: (state 2 or state1) 219 // In state 1 it needs to read a BR 220 // For state 2 it nead to read something different then a BR 221 case 3:{ 222 if (readBR){ 223 // A BR was read. Go to state 1 224 readBR = false; 225 state = 1; 226 // Create an annotation type paragraph 227 try{ 228 annotSet.add( new Long(startOffsetPara), 229 new Long(endOffsetPara), 230 "paragraph", 231 Factory.newFeatureMap()); 232 } catch (gate.util.InvalidOffsetException ioe){ 233 throw new DocumentFormatException("Coudn't create a paragraph"+ 234 " annotation",ioe); 235 }// End try 236 }else{ 237 // Go to state 2 an keep reading chars 238 state = 2; 239 }// End if 240 }break; 241 }// End switch 242 // Prepare to read the next char. 243 index ++; 244 }// End while 245 endOffsetPara = index; 246 // Investigate where the finite automata has stoped 247 if ( state==2 || state==3 ){ 248 // Create an annotation type paragraph 249 try{ 250 annotSet.add( new Long(startOffsetPara), 251 // Create the final annotation using the endOffset 252 new Long(endOffsetPara), 253 "paragraph", 254 Factory.newFeatureMap()); 255 } catch (gate.util.InvalidOffsetException ioe){ 256 throw new DocumentFormatException("Coudn't create a paragraph"+ 257 " annotation",ioe); 258 }// End try 259 }// End if 260 }// End annotateParagraphs(); 261 262 public DataStore getDataStore(){ return null;} 263 264 } // class TextualDocumentFormat 265
|
TextualDocumentFormat |
|