|
CorpusImpl |
|
1 /* 2 * CorpusImpl.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Hamish Cunningham, 11/Feb/2000 12 * 13 * $Id: CorpusImpl.java,v 1.52 2003/03/31 05:05:07 valyt Exp $ 14 */ 15 16 package gate.corpora; 17 18 import java.util.*; 19 20 import gate.*; 21 import gate.util.*; 22 import gate.annotation.*; 23 import gate.persist.*; 24 import java.io.*; 25 import java.net.*; 26 import gate.event.*; 27 import gate.creole.*; 28 29 /** Corpora are sets of Document. They are ordered by lexicographic collation 30 * on Url. 31 */ 32 public class CorpusImpl extends AbstractLanguageResource 33 implements Corpus, CreoleListener { 34 35 /** Debug flag */ 36 private static final boolean DEBUG = false; 37 38 public CorpusImpl(){ 39 supportList = Collections.synchronizedList(new VerboseList()); 40 Gate.getCreoleRegister().addCreoleListener(this); 41 } 42 43 44 /** 45 * Gets the names of the documents in this corpus. 46 * @return a {@link List} of Strings representing the names of the documents 47 * in this corpus. 48 */ 49 public List getDocumentNames(){ 50 ArrayList res = new ArrayList(supportList.size()); 51 Iterator docIter = supportList.iterator(); 52 while(docIter.hasNext()){ 53 res.add(((Document)docIter.next()).getName()); 54 } 55 return res; 56 } 57 58 /** 59 * Gets the name of a document in this corpus. 60 * @param index the index of the document 61 * @return a String value representing the name of the document at 62 * <tt>index</tt> in this corpus. 63 */ 64 public String getDocumentName(int index){ 65 return ((Document)supportList.get(index)).getName(); 66 } 67 68 /** 69 * This method does not make sense for transient corpora, so it does 70 * nothing. 71 */ 72 public void unloadDocument(Document doc) { 73 return; 74 } 75 76 77 /** 78 * The underlying list that holds the documents in this corpus. 79 */ 80 protected List supportList = null; 81 82 /** 83 * A proxy list that stores the actual data in an internal list and forwards 84 * all operations to that one but it also fires the appropiate corpus events 85 * when necessary. 86 * It also does some type checking so only Documents are accepted as corpus 87 * members. 88 */ 89 protected class VerboseList extends AbstractList implements Serializable{ 90 91 92 93 VerboseList(){ 94 data = new ArrayList(); 95 } 96 97 public Object get(int index){ 98 return data.get(index); 99 } 100 101 public int size(){ 102 return data.size(); 103 } 104 105 public Object set(int index, Object element){ 106 if(element instanceof Document){ 107 Document oldDoc = (Document)data.set(index, element); 108 Document newDoc = (Document)element; 109 110 //fire the 2 events 111 fireDocumentRemoved(new CorpusEvent(CorpusImpl.this, 112 oldDoc, 113 index, 114 CorpusEvent.DOCUMENT_REMOVED)); 115 fireDocumentAdded(new CorpusEvent(CorpusImpl.this, 116 newDoc, 117 index, 118 CorpusEvent.DOCUMENT_ADDED)); 119 return oldDoc; 120 }else{ 121 throw new UnsupportedOperationException( 122 getClass().getName() + 123 " only accepts gate.Document values as members!\n" + 124 element.getClass().getName() + " is not a gate.Document"); 125 } 126 } 127 128 public void add(int index, Object element){ 129 if(element instanceof Document){ 130 data.add(index, element); 131 132 //fire the event 133 fireDocumentAdded(new CorpusEvent(CorpusImpl.this, 134 (Document)element, 135 index, 136 CorpusEvent.DOCUMENT_ADDED)); 137 }else{ 138 throw new UnsupportedOperationException( 139 getClass().getName() + 140 " only accepts gate.Document values as members!\n" + 141 element.getClass().getName() + " is not a gate.Document"); 142 } 143 } 144 145 public Object remove(int index){ 146 Document oldDoc = (Document)data.remove(index); 147 148 fireDocumentRemoved(new CorpusEvent(CorpusImpl.this, 149 oldDoc, 150 index, 151 CorpusEvent.DOCUMENT_REMOVED)); 152 return oldDoc; 153 } 154 155 /** 156 * The List containing the actual data. 157 */ 158 ArrayList data; 159 } 160 161 /** 162 * This method returns true when the document is already loaded in memory 163 */ 164 public boolean isDocumentLoaded(int index) { 165 return true; 166 } 167 168 169 protected void clearDocList() { 170 if (supportList == null) 171 return; 172 supportList.clear(); 173 } 174 175 176 //List methods 177 //java docs will be automatically copied from the List interface. 178 179 public int size() { 180 return supportList.size(); 181 } 182 183 public boolean isEmpty() { 184 return supportList.isEmpty(); 185 } 186 187 public boolean contains(Object o){ 188 return supportList.contains(o); 189 } 190 191 public Iterator iterator(){ 192 return supportList.iterator(); 193 } 194 195 public Object[] toArray(){ 196 return supportList.toArray(); 197 } 198 199 public Object[] toArray(Object[] a){ 200 return supportList.toArray(a); 201 } 202 203 public boolean add(Object o){ 204 return supportList.add(o); 205 } 206 207 public boolean remove(Object o){ 208 return supportList.remove(o); 209 } 210 211 public boolean containsAll(Collection c){ 212 return supportList.containsAll(c); 213 } 214 215 public boolean addAll(Collection c){ 216 return supportList.addAll(c); 217 } 218 219 public boolean addAll(int index, Collection c){ 220 return supportList.addAll(index, c); 221 } 222 223 public boolean removeAll(Collection c){ 224 return supportList.removeAll(c); 225 } 226 227 public boolean retainAll(Collection c){ 228 return supportList.retainAll(c); 229 } 230 231 public void clear(){ 232 supportList.clear(); 233 } 234 235 public boolean equals(Object o){ 236 if (! (o instanceof CorpusImpl)) 237 return false; 238 239 return supportList.equals(o); 240 } 241 242 public int hashCode(){ 243 return supportList.hashCode(); 244 } 245 246 public Object get(int index){ 247 return supportList.get(index); 248 } 249 250 public Object set(int index, Object element){ 251 return supportList.set(index, element); 252 } 253 254 public void add(int index, Object element){ 255 supportList.add(index, element); 256 } 257 258 public Object remove(int index){ 259 return supportList.remove(index); 260 } 261 262 public int indexOf(Object o){ 263 return supportList.indexOf(o); 264 } 265 266 public int lastIndexOf(Object o){ 267 return lastIndexOf(o); 268 } 269 270 public ListIterator listIterator(){ 271 return supportList.listIterator(); 272 } 273 274 public ListIterator listIterator(int index){ 275 return supportList.listIterator(index); 276 } 277 278 public List subList(int fromIndex, int toIndex){ 279 return supportList.subList(fromIndex, toIndex); 280 } 281 282 283 /** Construction */ 284 285 public void cleanup(){ 286 Gate.getCreoleRegister().removeCreoleListener(this); 287 } 288 289 /** Initialise this resource, and return it. */ 290 public Resource init() { 291 if(documentsList != null && !documentsList.isEmpty()){ 292 addAll(documentsList); 293 } 294 return this; 295 } // init() 296 297 298 /** 299 * Fills the provided corpus with documents created on the fly from selected 300 * files in a directory. Uses a link {@FileFilter} to select which files will 301 * be used and which will be ignored. 302 * A simple file filter based on extensions is provided in the Gate 303 * distribution ({@link gate.util.ExtensionFileFilter}). 304 * @param corpus the corpus to be populated 305 * @param directory the directory from which the files will be picked. This 306 * parameter is an URL for uniformity. It needs to be a URL of type file 307 * otherwise an InvalidArgumentException will be thrown. 308 * @param filter the file filter used to select files from the target 309 * directory. If the filter is <tt>null</tt> all the files will be accepted. 310 * @param encoding the encoding to be used for reading the documents 311 * @param recurseDirectories should the directory be parsed recursively?. If 312 * <tt>true</tt> all the files from the provided directory and all its 313 * children directories (on as many levels as necessary) will be picked if 314 * accepted by the filter otherwise the children directories will be ignored. 315 */ 316 public static void populate(Corpus corpus, URL directory, FileFilter filter, 317 String encoding, boolean recurseDirectories) 318 throws IOException { 319 //check input 320 if(!directory.getProtocol().equalsIgnoreCase("file")) 321 throw new IllegalArgumentException( 322 "The URL provided is not of type \"file:\"!"); 323 324 File dir = new File(directory.getPath()); 325 if(!dir.exists()) 326 throw new FileNotFoundException(dir.toString()); 327 328 if(!dir.isDirectory()) 329 throw new IllegalArgumentException( 330 dir.getAbsolutePath() + " is not a directory!"); 331 332 //populate the corpus 333 File[] files = dir.listFiles(filter); 334 if(files != null){ 335 for(int i = 0; i < files.length; i++){ 336 File aFile = files[i]; 337 if(aFile.isDirectory()){ 338 //recurse dir if required 339 if(recurseDirectories){ 340 populate(corpus, aFile.toURL(), filter, 341 encoding, recurseDirectories); 342 } 343 }else{ 344 //create the doc 345 StatusListener sListener = (StatusListener) 346 gate.gui.MainFrame.getListeners(). 347 get("gate.event.StatusListener"); 348 if(sListener != null) sListener.statusChanged( 349 "Reading: " + aFile.getName()); 350 String docName = aFile.getName() + "_" + Gate.genSym(); 351 FeatureMap params = Factory.newFeatureMap(); 352 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, aFile.toURL()); 353 if(encoding != null) 354 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding); 355 356 try { 357 Document doc = (Document)Factory.createResource( 358 DocumentImpl.class.getName(), params, null, docName 359 ); 360 corpus.add(doc); 361 if(corpus.getLRPersistenceId() != null){ 362 //persistent corpus -> unload the document 363 corpus.unloadDocument(doc); 364 Factory.deleteResource(doc); 365 } 366 } catch(ResourceInstantiationException e) { 367 String nl = Strings.getNl(); 368 Err.prln( 369 "WARNING: Corpus.populate could not intantiate document" + nl + 370 " Document name was: " + docName + nl + 371 " Exception was: " + e + nl + nl 372 ); 373 } 374 if(sListener != null) sListener.statusChanged( 375 aFile.getName() + " read"); 376 } 377 } 378 } 379 }//public static void populate 380 381 /** 382 * Fills this corpus with documents created from files in a directory. 383 * @param filter the file filter used to select files from the target 384 * directory. If the filter is <tt>null</tt> all the files will be accepted. 385 * @param directory the directory from which the files will be picked. This 386 * parameter is an URL for uniformity. It needs to be a URL of type file 387 * otherwise an InvalidArgumentException will be thrown. 388 * An implementation for this method is provided as a static method at 389 * {@link gate.corpora.CorpusImpl#populate(Corpus,URL,FileFilter,boolean)}. 390 * @param encoding the encoding to be used for reading the documents 391 * @param recurseDirectories should the directory be parsed recursively?. If 392 * <tt>true</tt> all the files from the provided directory and all its 393 * children directories (on as many levels as necessary) will be picked if 394 * accepted by the filter otherwise the children directories will be ignored. 395 */ 396 public void populate(URL directory, FileFilter filter, String encoding, 397 boolean recurseDirectories) 398 throws IOException, ResourceInstantiationException{ 399 populate(this, directory, filter, encoding, recurseDirectories); 400 } 401 402 public synchronized void removeCorpusListener(CorpusListener l) { 403 if (corpusListeners != null && corpusListeners.contains(l)) { 404 Vector v = (Vector) corpusListeners.clone(); 405 v.removeElement(l); 406 corpusListeners = v; 407 } 408 } 409 public synchronized void addCorpusListener(CorpusListener l) { 410 Vector v = corpusListeners == null ? new Vector(2) : (Vector) corpusListeners.clone(); 411 if (!v.contains(l)) { 412 v.addElement(l); 413 corpusListeners = v; 414 } 415 } 416 417 /** Freeze the serialization UID. */ 418 static final long serialVersionUID = -1113142759053898456L; 419 private transient Vector corpusListeners; 420 protected transient java.util.List documentsList; 421 422 423 protected void fireDocumentAdded(CorpusEvent e) { 424 if (corpusListeners != null) { 425 Vector listeners = corpusListeners; 426 int count = listeners.size(); 427 for (int i = 0; i < count; i++) { 428 ((CorpusListener) listeners.elementAt(i)).documentAdded(e); 429 } 430 } 431 } 432 protected void fireDocumentRemoved(CorpusEvent e) { 433 if (corpusListeners != null) { 434 Vector listeners = corpusListeners; 435 int count = listeners.size(); 436 for (int i = 0; i < count; i++) { 437 ((CorpusListener) listeners.elementAt(i)).documentRemoved(e); 438 } 439 } 440 } 441 public void setDocumentsList(java.util.List documentsList) { 442 this.documentsList = documentsList; 443 } 444 public java.util.List getDocumentsList() { 445 return documentsList; 446 } 447 public void resourceLoaded(CreoleEvent e) { 448 } 449 public void resourceUnloaded(CreoleEvent e) { 450 Resource res = e.getResource(); 451 //remove all occurences 452 if(res instanceof Document) while(contains(res)) remove(res); 453 } 454 455 public void resourceRenamed(Resource resource, String oldName, 456 String newName){ 457 } 458 459 public void datastoreOpened(CreoleEvent e) { 460 } 461 public void datastoreCreated(CreoleEvent e) { 462 } 463 public void datastoreClosed(CreoleEvent e) { 464 } 465 } // class CorpusImpl 466
|
CorpusImpl |
|