/*
* Copyright (c) 2004, The University of Sheffield.
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June 1991 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
*
* Mike Dowman 08-04-2004
*
* $Id: MyStringReader.java 7452 2006-06-15 14:45:17 +0000 (Thu, 15 Jun 2006) ian_roberts $
*
*/
package gate.creole.ml.svmlight;
/**
 * Holds the whole contents of a file in memory as a single string, together
 * with a cursor (an index into that string), and provides methods for skipping
 * white space and comment lines and for reading white-space delimited items and
 * <code>featureNumber:featureValue</code> pairs starting at the cursor.
 * <p>
 * Only the space and tab characters count as white space within a line, and
 * only <code>'\n'</code> is treated as a line terminator.
 */
class MyStringReader {

  /** When true, the contents read by the constructor are printed to stdout. */
  static final boolean DEBUG = false;

  /** The complete contents of the file that was read. */
  String inputFile;

  /** Index into {@link #inputFile} of the next character to be processed. */
  int indexInInputFile;

  /**
   * Initialises the string reader by reading a whole file and putting it in a
   * string for easy access. Initialises the index to point at the beginning of
   * the string. The reader is not closed by this constructor.
   *
   * @param reader The file reader to read the file from.
   * @throws java.io.IOException if there's an error reading the file.
   */
  public MyStringReader(java.io.FileReader reader) throws java.io.IOException {
    StringBuffer wholeFile = new StringBuffer();
    char[] buffer = new char[4096];
    int numberOfCharsRead;
    // Read until read() reports the end of the stream (-1). Reader.ready() is
    // not an end-of-file test: it may return false while data still remains,
    // which would silently truncate the input.
    while ((numberOfCharsRead = reader.read(buffer, 0, buffer.length)) != -1) {
      wholeFile.append(buffer, 0, numberOfCharsRead);
    }
    inputFile = wholeFile.toString();
    indexInInputFile = 0;
    if (DEBUG) {
      System.out.println("A MyStringReader has been initialised with the "+
          "following contents:\n"+inputFile);
    }
  }

  /**
   * Skip over any lines which are just comments, that is lines on which the
   * first non-white space character is the comment symbol (#). On return the
   * current position is at the first non-white space character of the first
   * line that is not a comment (which may be a new line character), or at the
   * end of the file.
   */
  public void skipLeadingComments() {
    while (true) {
      skipWhiteSpace();
      if (indexInInputFile < inputFile.length()
          && inputFile.charAt(indexInInputFile) == '#') {
        skipToStartOfNextLine();
      } else {
        return;
      }
    }
  }

  /**
   * Skip over any white space (spaces or tabs).
   */
  public void skipWhiteSpace() {
    while (indexInInputFile < inputFile.length()
        && isSpaceOrTab(inputFile.charAt(indexInInputFile))) {
      ++indexInInputFile;
    }
  }

  /**
   * Skip over any white space, including new lines.
   */
  public void skipBlankLinesAndWhiteSpace() {
    while (indexInInputFile < inputFile.length()
        && isSpaceTabOrNewLine(inputFile.charAt(indexInInputFile))) {
      ++indexInInputFile;
    }
  }

  /**
   * Move on to the first character after the end of the line (or to the
   * position just after the end of the file if there are no more lines).
   */
  public void skipToStartOfNextLine() {
    // Test the result of indexOf before adding one to it: indexOf returns -1
    // when there is no further new line, and -1 + 1 would otherwise rewind the
    // current position to the start of the file.
    int indexOfNewLine = inputFile.indexOf('\n', indexInInputFile);
    if (indexOfNewLine == -1) {
      indexInInputFile = inputFile.length();
    } else {
      indexInInputFile = indexOfNewLine + 1;
    }
  }

  /**
   * See if we've reached the end of the file.
   *
   * @return true if we've got to the end of the file, false otherwise.
   */
  public boolean endOfFileReached() {
    return indexInInputFile >= inputFile.length();
  }

  /**
   * Read a string from the file, starting at the first non-white space
   * character at or after the current position, and ending at the last
   * non-white space character after that. The current position is left just
   * after the item that was read.
   *
   * @return The item read, which is never empty.
   * @throws gate.util.GateRuntimeException if only white space remains before
   * the end of the current line or the end of the file.
   */
  public String readItem() {
    skipWhiteSpace();
    // First make sure that there is an item to read.
    if (endOfFileReached()
        || inputFile.charAt(indexInInputFile) == '\n') {
      throw new gate.util.GateRuntimeException("Error when reading from file"+
          " at character index "+indexInInputFile+". \nEnd of line or end of file "+
          "reached unexpectedly because the file has an invalid format.");
    }
    int indexOfStartOfItem = indexInInputFile;
    skipToWhiteSpaceOrEndOfLineOrEndOfFile();
    return inputFile.substring(indexOfStartOfItem, indexInInputFile);
  }

  /**
   * Move on the current position to the end of the line or the file, or the
   * first white space character encountered.
   */
  private void skipToWhiteSpaceOrEndOfLineOrEndOfFile() {
    while (indexInInputFile < inputFile.length()
        && !isSpaceTabOrNewLine(inputFile.charAt(indexInInputFile))) {
      ++indexInInputFile;
    }
  }

  /**
   * Try to read a feature value pair of the form
   * <code>featureNumber:featureValue</code>, with no white space on either
   * side of the colon.
   *
   * @return The feature value pair or null if no feature value pair was read
   * because only white space remained on the current line.
   * @throws gate.util.GateRuntimeException if there's something other than a
   * valid feature-value pair or just white space at the current position in
   * the line.
   * @throws NumberFormatException if the text before the colon is not a valid
   * int, or the text after it is not a valid double.
   */
  public FeatureValuePair readFeatureValuePair()
      throws gate.util.GateRuntimeException {
    skipWhiteSpace();
    // If there's nothing more to be read on the current line just return
    // signalling that.
    if (endOfFileReached() || inputFile.charAt(indexInInputFile) == '\n') {
      return null;
    }

    // Now we're committed to reading a feature-value pair, so if we can't it's
    // an error.
    FeatureValuePair featureValuePair = new FeatureValuePair();

    // Read the feature number, which starts at the current location, and
    // finishes just before the colon.
    int indexOfStartOfFeatureNumber = indexInInputFile;
    skipToColon();
    featureValuePair.featureNumber = Integer.parseInt(
        inputFile.substring(indexOfStartOfFeatureNumber, indexInInputFile));

    // Move on to the first character after the colon.
    ++indexInInputFile;

    // The colon may have been the very last character of the file, so check
    // for the end of the file before looking at the next character.
    if (endOfFileReached()
        || isSpaceTabOrNewLine(inputFile.charAt(indexInInputFile))) {
      throw new gate.util.GateRuntimeException(
          "Error when reading from file at"
          + " character index " + indexInInputFile +
          " due to the file having an \n" +
          "invalid format. There must always be a \nfeature"
          + " value immediately after a colon in a feature value pair.");
    }

    // Now read the feature value.
    int indexOfStartOfFeatureValue = indexInInputFile;
    skipToWhiteSpaceOrEndOfLineOrEndOfFile();
    featureValuePair.featureValue = Double.parseDouble(
        inputFile.substring(indexOfStartOfFeatureValue, indexInInputFile));
    return featureValuePair;
  }

  /**
   * Move on the current position from the start of a feature value pair to the
   * colon in a feature value pair.
   *
   * @throws gate.util.GateRuntimeException if white space, the end of the line
   * or the end of the file is found before a colon.
   */
  private void skipToColon() {
    while (true) {
      // Guard against being called when already at (or past) the end of the
      // file, so that charAt below can never go out of bounds.
      if (indexInInputFile >= inputFile.length()) {
        throw colonNotFound();
      }
      char current = inputFile.charAt(indexInInputFile);
      if (current == ':') {
        return;
      }
      // If we find white space, or the end of the file or line before we find
      // a colon, there must be an error in the format of the input file.
      if (isSpaceTabOrNewLine(current)
          || indexInInputFile == inputFile.length() - 1) {
        throw colonNotFound();
      }
      // So long as we've not found white space, just move on the index.
      ++indexInInputFile;
    }
  }

  /**
   * Builds the exception reporting that no colon was found in a feature-value
   * pair, quoting the current position.
   */
  private gate.util.GateRuntimeException colonNotFound() {
    return new gate.util.GateRuntimeException(
        "Error when reading from file at character index " +
        indexInInputFile +
        " due \nto file having an invalid format. A colon (:) was not found " +
        "in a feature-value pair.");
  }

  /** Tests whether a character is a space or a tab. */
  private static boolean isSpaceOrTab(char c) {
    return c == ' ' || c == '\t';
  }

  /** Tests whether a character is a space, a tab or a new line character. */
  private static boolean isSpaceTabOrNewLine(char c) {
    return c == ' ' || c == '\t' || c == '\n';
  }
}