Log in Help
Print
Home / releases / gate-5.1-beta2-build3402-ALL / plugins / Machine_Learning / src / gate / creole / ml / svmlight 〉 MyStringReader.java
 
/*
 *  Copyright (c) 2004, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Mike Dowman 08-04-2004
 *
 *  $Id: MyStringReader.java 7452 2006-06-15 14:45:17 +0000 (Thu, 15 Jun 2006) ian_roberts $
 *
 */

package gate.creole.ml.svmlight;

/**
 * Holds the complete text of a file in memory together with a cursor
 * (an index into that text), and provides methods for moving the cursor over
 * white space, comment lines (lines starting with <code>#</code>), single
 * white-space-delimited items and <code>featureNumber:featureValue</code>
 * pairs.
 *
 * Only the line feed character (<code>\n</code>) is treated as a line
 * terminator, and only spaces and tabs are treated as in-line white space.
 */
class MyStringReader {
  static final boolean DEBUG=false;

  /** The complete contents of the file that was read. */
  String inputFile;

  /** The index in {@link #inputFile} of the next character to be examined. */
  int indexInInputFile;

  /**
   * Initialises the string reader by reading a whole file and putting it in a
   * string for easy access. Initialises the index to point at the beginning of
   * the string.
   *
   * The reader is read until it reports end of stream; it is not closed here,
   * so closing it remains the caller's responsibility.
   *
   * @param reader The file reader to read the file from.
   * @throws java.io.IOException if there's an error reading the file.
   */
  public MyStringReader(java.io.FileReader reader) throws java.io.IOException {
    StringBuffer wholeFile=new StringBuffer();
    // Read until read() signals end of stream (-1). Reader.ready() must not be
    // used as an end-of-file test: it only says whether the next read would
    // block, so it may return false before all of the input has been consumed.
    char[] buffer=new char[8192];
    int numberOfCharsRead;
    while ((numberOfCharsRead=reader.read(buffer))!=-1) {
      wholeFile.append(buffer, 0, numberOfCharsRead);
    }
    inputFile=wholeFile.toString();
    indexInInputFile=0;

    if (DEBUG) {
      System.out.println("A MyStringReader has been initialised with the "+
                         "following contents:\n"+inputFile);
    }
  }

  /**
   * Skip over any lines which are just comments.
   */
  public void skipLeadingComments() {
    // Keep skipping lines till we find one on which the first non-white-space
    // character is not the comment symbol (#), or we run out of input.
    while (true) {
      skipWhiteSpace();
      if (indexInInputFile<inputFile.length()
          && inputFile.charAt(indexInInputFile)=='#') {
        skipToStartOfNextLine();
      } else {
        return;
      }
    }
  }

  /**
   * Skip over any white space (spaces or tabs).
   */
  public void skipWhiteSpace() {
    while (indexInInputFile<inputFile.length()
           && (inputFile.charAt(indexInInputFile)==' '
           || inputFile.charAt(indexInInputFile)=='\t')) {
      ++indexInInputFile;
    }
  }

  /**
   * Skip over any white space, including new lines.
   */
  public void skipBlankLinesAndWhiteSpace() {
    while (indexInInputFile<inputFile.length()
           && isSpaceTabOrNewLine(inputFile.charAt(indexInInputFile))) {
      ++indexInInputFile;
    }
  }

  /**
   * Move on to the first character after the end of the line (or possibly after
   * the end of the file).
   */
  public void skipToStartOfNextLine() {
    int indexOfNewLine=inputFile.indexOf('\n', indexInInputFile);
    // The -1 ("not found") result has to be tested before adding one to it,
    // otherwise the index would be reset to 0 and the file re-read from the
    // start. If there are no more lines in the file, just point to the first
    // character after the end of the file.
    if (indexOfNewLine==-1) {
      indexInInputFile=inputFile.length();
    } else {
      indexInInputFile=indexOfNewLine+1;
    }
  }

  /**
   * See if we've reached the end of the file.
   * @return true if we've got to the end of the file, false otherwise.
   */
  public boolean endOfFileReached() {
    // >= rather than == so that an index that has somehow moved beyond the end
    // of the text is still reported as end of file.
    return indexInInputFile>=inputFile.length();
  }

  /**
   * Read a string from the file, starting at the first non-white space
   * character at or after the current position, and ending just before the
   * next space, tab, new line or the end of the file.
   *
   * @return The item that was read (never empty).
   * @throws gate.util.GateRuntimeException if the end of the line or the end
   * of the file is reached before any item is found.
   */
  public String readItem() {
    skipWhiteSpace();
    // First make sure that there is an item to read.
    if (endOfFileReached()
        || inputFile.charAt(indexInInputFile)=='\n') {
      throw new gate.util.GateRuntimeException("Error when reading from file"+
        " at character index "+indexInInputFile+". \nEnd of line or end of file "+
        "reached unexpectedly because the file has an invalid format.");
    }

    int indexOfStartOfItem=indexInInputFile;
    skipToWhiteSpaceOrEndOfLineOrEndOfFile();
    return inputFile.substring(indexOfStartOfItem, indexInInputFile);
  }

  /**
   * Move on the current position to the end of the line or the file, or the
   * first white space character encountered.
   */
  private void skipToWhiteSpaceOrEndOfLineOrEndOfFile() {
    while (indexInInputFile<inputFile.length()
           && !isSpaceTabOrNewLine(inputFile.charAt(indexInInputFile))) {
      ++indexInInputFile;
    }
  }

  /**
   * Try to read a feature value pair.
   *
   * @return The feature value pair or null if no feature value pair was read
   * because only white space remained on the current line.
   * @throws gate.util.GateRuntimeException if there's something other than a
   * valid feature-value pair or just white space at the current position in
   * the line.
   * @throws NumberFormatException if the text before the colon is not a valid
   * integer, or the text after it is not a valid double.
   */
  public FeatureValuePair readFeatureValuePair()
      throws gate.util.GateRuntimeException {
    skipWhiteSpace();
    // If there's nothing more to be read on the current line just return
    // signalling that.
    if (endOfFileReached() || inputFile.charAt(indexInInputFile)=='\n') {
      return null;
    }

    // Now we're committed to reading a feature-value pair, so if we can't it's
    // an error.
    FeatureValuePair featureValuePair=new FeatureValuePair();
    // Read the feature number, which starts at the current location, and
    // finishes just before the colon.
    int indexOfStartOfFeatureNumber=indexInInputFile;
    skipToColon();
    featureValuePair.featureNumber=Integer.parseInt(
        inputFile.substring(indexOfStartOfFeatureNumber, indexInInputFile));
    // Move on to the first character after the colon.
    ++indexInInputFile;
    // The colon may have been the very last character of the file, so the end
    // of file has to be checked before the character after it is inspected.
    if (endOfFileReached()
        || isSpaceTabOrNewLine(inputFile.charAt(indexInInputFile))) {
      throw new gate.util.GateRuntimeException(
          "Error when reading from file at"
          + " character index " + indexInInputFile +
          " due to the file having an \n" +
          "invalid format. There must always be a \nfeature"
          + " value immediately after a colon in a feature value pair.");
    }

    // Now read the feature value.
    int indexOfStartOfFeatureValue=indexInInputFile;
    skipToWhiteSpaceOrEndOfLineOrEndOfFile();
    featureValuePair.featureValue=Double.parseDouble(
        inputFile.substring(indexOfStartOfFeatureValue, indexInInputFile));

    return featureValuePair;
  }

  /**
   * Move on the current position from the start of a feature value pair to the
   * colon in a feature value pair.
   *
   * @throws gate.util.GateRuntimeException if white space, the end of the line
   * or the end of the file is found before a colon.
   */
  private void skipToColon() {
    while (endOfFileReached()
           || inputFile.charAt(indexInInputFile)!=':') {
      // If we find white space, or the end of the file or line before we find
      // a colon, there must be an error in the format of the input file. The
      // end-of-file tests come first so that charAt is never called with an
      // index outside the text.
      if (endOfFileReached()
          || indexInInputFile==inputFile.length()-1
          || isSpaceTabOrNewLine(inputFile.charAt(indexInInputFile))) {
        throw new gate.util.GateRuntimeException(
            "Error when reading from file at character index " +
            indexInInputFile +
            " due \nto file having an invalid format. A colon (:) was not found " +
            "in a feature-value pair.");
      }

      // So long as we've not found white space, just move on the index.
      ++indexInInputFile;
    }
  }

  /**
   * Test whether a character ends an item, i.e. whether it is a space, a tab
   * or a new line.
   *
   * @param character The character to test.
   * @return true if the character is a space, tab or new line.
   */
  private static boolean isSpaceTabOrNewLine(char character) {
    return character==' ' || character=='\t' || character=='\n';
  }

}