|
Corpus |
|
/*
 *  Corpus.java
 *
 *  Copyright (c) 1998-2001, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Hamish Cunningham, 19/Jan/2000
 *
 *  $Id: Corpus.java,v 1.19 2002/03/06 17:15:37 kalina Exp $
 */

package gate;
import java.util.*;
import java.net.URL;
import java.io.FileFilter;
import java.io.IOException;

import gate.util.*;
import gate.event.*;
import gate.creole.ResourceInstantiationException;

/** Corpora are lists of Document. TIPSTER equivalent: Collection.
  */
public interface Corpus extends LanguageResource, List, NameBearer {

  /** Name of the parameter that carries the corpus name. */
  public static final String CORPUS_NAME_PARAMETER_NAME = "name";

  /** Name of the parameter that carries the list of documents. */
  public static final String CORPUS_DOCLIST_PARAMETER_NAME = "documentsList";

  /**
   * Gets the names of the documents in this corpus.
   * @return a {@link List} of Strings representing the names of the documents
   * in this corpus.
   */
  public List getDocumentNames();

  /**
   * Gets the name of a document in this corpus.
   * @param index the index of the document
   * @return a String value representing the name of the document at
   * <tt>index</tt> in this corpus.
   */
  public String getDocumentName(int index);

  /**
   * Unloads the document from memory. Only needed if memory
   * preservation is an issue. Only supported for a Corpus which is
   * stored in a Datastore. To get this document back in memory,
   * use get() on the Corpus or, if you have its persistent ID, request it
   * from the Factory.
   * <P>
   * Transient Corpus objects do nothing,
   * because there would be no way to get the document back
   * again afterwards.
   * @param doc the document to be unloaded from memory.
   */
  public void unloadDocument(Document doc);

  /**
   * Fills this corpus with documents created on the fly from selected files in
   * a directory. Uses a {@link FileFilter} to select which files will be used
   * and which will be ignored.
   * A simple file filter based on extensions is provided in the Gate
   * distribution ({@link gate.util.ExtensionFileFilter}).
   * <P>
   * An implementation for this method is provided as a static method at
   * {@link gate.corpora.CorpusImpl#populate(Corpus,URL,FileFilter,boolean)}.
   * @param directory the directory from which the files will be picked. This
   * parameter is a URL for uniformity. It needs to be a URL of type file
   * otherwise an InvalidArgumentException will be thrown.
   * @param filter the file filter used to select files from the target
   * directory. If the filter is <tt>null</tt> all the files will be accepted.
   * @param encoding the encoding to be used for reading the documents
   * @param recurseDirectories should the directory be parsed recursively? If
   * <tt>true</tt> all the files from the provided directory and all its
   * children directories (on as many levels as necessary) will be picked if
   * accepted by the filter; otherwise the children directories will be
   * ignored.
   */
  public void populate(URL directory, FileFilter filter,
                       String encoding, boolean recurseDirectories)
                       throws IOException, ResourceInstantiationException;


  /**
   * This method returns true when the document is already loaded in memory.
   * The transient corpora will always return true as they can only contain
   * documents that are present in the memory.
   * @param index the index of the document in this corpus.
   * @return <tt>true</tt> if the document at <tt>index</tt> is already loaded
   * in memory.
   */
  public boolean isDocumentLoaded(int index);


  /**
   * Removes one of the listeners registered with this corpus.
   * @param l the listener to be removed.
   */
  public void removeCorpusListener(CorpusListener l);

  /**
   * Registers a new {@link CorpusListener} with this corpus.
   * @param l the listener to be added.
   */
  public void addCorpusListener(CorpusListener l);

} // interface Corpus
|
Corpus |
|