// Log in | Help
// Print
// Home > gate > src > gate > SimpleCorpus.java
 
/*
 *  SimpleCorpus.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Kalina Bontcheva, 23/Jul/2004
 *
 *  $Id: SimpleCorpus.java 15333 2012-02-07 13:18:33Z ian_roberts $
 */

package gate;

import gate.corpora.DocType;
import gate.creole.ResourceInstantiationException;
import gate.util.NameBearer;

import java.io.FileFilter;
import java.io.IOException;
import java.net.URL;
import java.util.List;

/**
 * A corpus is a list of {@link Document}s that is also a language
 * resource and carries a name. TIPSTER equivalent: Collection.
 */
public interface SimpleCorpus
        extends LanguageResource, List<Document>, NameBearer {

  /**
   * The string {@code "name"}. Judging by the identifier this is the
   * key of the corpus-name parameter; confirm against the code that
   * uses it.
   */
  String CORPUS_NAME_PARAMETER_NAME = "name";

  /**
   * The string {@code "documentsList"}. Judging by the identifier this
   * is the key of the parameter holding the corpus' documents; confirm
   * against the code that uses it.
   */
  String CORPUS_DOCLIST_PARAMETER_NAME = "documentsList";

  /**
   * Returns the names of the documents contained in this corpus.
   *
   * @return a {@link List} of Strings, each being the name of a
   *         document in this corpus.
   */
  List<String> getDocumentNames();

  /**
   * Returns the name of a single document of this corpus.
   *
   * @param index the position of the document within this corpus
   * @return the name of the document found at {@code index}.
   */
  String getDocumentName(int index);

  /**
   * Populates this corpus with documents that are created on the fly
   * from files selected in a directory. The selection is delegated to
   * a {@link FileFilter}; a simple extension-based filter ships with
   * the Gate distribution ({@link gate.util.ExtensionFileFilter}).
   * <p>
   * An implementation of this operation is available as the static
   * method
   * {@link gate.corpora.CorpusImpl#populate(Corpus, URL, FileFilter, String, boolean)}.
   *
   * @param directory the directory the files are taken from. It is
   *          passed as a URL for uniformity and has to be a URL of
   *          type file; any other kind of URL is rejected with an
   *          exception (earlier documentation called it
   *          "InvalidArgumentException", which is not a JDK class -
   *          presumably {@code IllegalArgumentException}; verify
   *          against the implementation).
   * @param filter decides which files of the target directory are
   *          used. A {@code null} filter accepts every file.
   * @param encoding the character encoding used when reading the
   *          documents
   * @param recurseDirectories whether the directory is traversed
   *          recursively. When {@code true}, files from the given
   *          directory and from all of its descendant directories (on
   *          as many levels as necessary) are picked up if the filter
   *          accepts them; otherwise child directories are ignored.
   * @throws IOException declared by this contract; the exact
   *           conditions depend on the implementation
   * @throws ResourceInstantiationException declared by this contract;
   *           the exact conditions depend on the implementation
   */
  void populate(URL directory,
          FileFilter filter,
          String encoding,
          boolean recurseDirectories)
          throws IOException, ResourceInstantiationException;

  /**
   * Populates this corpus with documents that are created on the fly
   * from files selected in a directory, optionally forcing the mime
   * type used to load them. The selection is delegated to a
   * {@link FileFilter}; a simple extension-based filter ships with the
   * Gate distribution ({@link gate.util.ExtensionFileFilter}).
   * <p>
   * An implementation of the directory population is available as the
   * static method
   * {@link gate.corpora.CorpusImpl#populate(Corpus, URL, FileFilter, String, boolean)}.
   *
   * @param directory the directory the files are taken from. It is
   *          passed as a URL for uniformity and has to be a URL of
   *          type file; any other kind of URL is rejected with an
   *          exception (earlier documentation called it
   *          "InvalidArgumentException", which is not a JDK class -
   *          presumably {@code IllegalArgumentException}; verify
   *          against the implementation).
   * @param filter decides which files of the target directory are
   *          used. A {@code null} filter accepts every file.
   * @param encoding the character encoding used when reading the
   *          documents
   * @param mimeType the mime type applied when loading the documents.
   *          When {@code null} the mime type is determined
   *          automatically.
   * @param recurseDirectories whether the directory is traversed
   *          recursively. When {@code true}, files from the given
   *          directory and from all of its descendant directories (on
   *          as many levels as necessary) are picked up if the filter
   *          accepts them; otherwise child directories are ignored.
   * @throws IOException declared by this contract; the exact
   *           conditions depend on the implementation
   * @throws ResourceInstantiationException declared by this contract;
   *           the exact conditions depend on the implementation
   */
  void populate(URL directory,
          FileFilter filter,
          String encoding,
          String mimeType,
          boolean recurseDirectories)
          throws IOException, ResourceInstantiationException;

  /**
   * Populates this corpus with the documents found inside one
   * concatenated file (a TREC file).
   *
   * @param singleConcatenatedFile the file that holds multiple
   *          documents.
   * @param documentRootElement the element delimiting a document:
   *          content between the start and the end of this element is
   *          considered for documents.
   * @param encoding the encoding of the concatenated file.
   * @param numberOfDocumentsToExtract how many documents to extract
   *          from the concatenated file; {@code -1} means all of them.
   * @param documentNamePrefix the prefix to use for the names of the
   *          documents that get created.
   * @param documentType the type of the documents (i.e. xml, html
   *          etc.)
   * @return the total length, in number of bytes, of the documents
   *         populated into the corpus
   * @throws IOException declared by this contract; the exact
   *           conditions depend on the implementation
   * @throws ResourceInstantiationException declared by this contract;
   *           the exact conditions depend on the implementation
   */
  long populate(URL singleConcatenatedFile,
          String documentRootElement,
          String encoding,
          int numberOfDocumentsToExtract,
          String documentNamePrefix,
          DocType documentType)
          throws IOException, ResourceInstantiationException;

} // interface SimpleCorpus