GATE.ac.uk - gate/plugins/Gazetteer_LKB/src/com/ontotext/kim/model/ParsingFrame.java

package com.ontotext.kim.model;

import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.collections.Transformer;
import org.apache.commons.collections.TransformerUtils;

import com.ontotext.kim.model.AliasCacheImpl.Stats;


/**
 * This class is used as a text parsing tool by classes
 * <code>AliasCacheImpl</code> and <code>KimLookupParser</code> and produces
 * special output for the class <code>HashedAlias</code>.<br>
 * The class provides means to parse a string into sequence of plain numeric
 * and letter lexemes delimited by optional non-alpha-numeric character
 * sequences of arbitrary length. This is the base for creation of a Hashed
 * Alias and also the base for searching it in a text.<br>
 * The class offers a frame view over the parsed string. The frame spans one or
 * more sequential alpha-numeric lexemes (ANL) and their non-alpha-numeric
 * context (surrounding punctuation and white spaces).<br>
 * The frame contains three parts:<br>
 *  * middle - this is a part of the initial string which starts in the 
 * beginning of the first ANL of the frame and ends in the end of the last ANL
 * of the frame.<br>
 *  * prefix - this is the non-alpha-numeric sequence before the first ANL of
 * the frame.<br>
 *  * suffix - this is the non-alpha-numeric sequence after the last ANL of the
 * frame.<br>
 * The class has a couple of methods for manipulating the frame. The first one
 * is for extending the frame by one lexeme. The second one is for moving
 * the frame start by one lexeme, which resets the frame length to one
 * lexeme.<br>
 * The class has retrieval methods that get the current frame offset and
 * the lengths of the three parts. There are dedicated methods for calculation
 * of the two Hash Codes specific to a Hashed Alias.
 * 
 * @author danko
 *
 */
public class ParsingFrame {
	
	/** Compile-time switch selecting the letter-lexeme part of the parsing
	 * pattern: when <b>true</b> a letter lexeme is either optional upper-case
	 * letters followed by at least one lower-case letter, or a run of
	 * upper-case letters only; when <b>false</b> any run of upper-case and
	 * lower-case letters is a single lexeme. */
	public static final boolean SPLIT_AT_SMALL_TO_CAPITAL_CASE_CHANGE = false;
	//================================================
	// ParsingFrame: Lexeme parsing regular expression
	//================================================
	/** This is the Reg-Ex pattern that is used to search for the ANLs.
	 * Group 1 is the alpha-numeric lexeme itself, group 2 is the (possibly
	 * empty) non-alpha-numeric sequence that follows it. */
	private static final Pattern LEXEME_MATCH_PAT = Pattern.compile(
			"(" +
			// Number only lexemes
			"(?:\\d)+|" +
			
			(SPLIT_AT_SMALL_TO_CAPITAL_CASE_CHANGE 
					? "(?:\\p{javaUpperCase}{0,}\\p{javaLowerCase}{1,})|(?:\\p{javaUpperCase}{1,})"			    
					: "(?:[\\p{javaUpperCase}\\p{javaLowerCase}]+)" ) + 
			")"
			// Non-Alpha-Numeric suffix
			+ "([^\\d\\p{javaUpperCase}\\p{javaLowerCase}]*)"
	);
	/** This is the delimiting character used for the normalized form
	 *  of the parsed string. The normalized form is used to calculate
	 *  the H1 hash value of the <code>HashAlias</code> */
	private static final String H1_DELIMITER = "^";

	/** Transformation applied to the text right before each of the two hash
	 * codes is calculated. Initialized with the transformer returned by
	 * <code>TransformerUtils.nopTransformer()</code>. */
	public static Transformer frameTT = TransformerUtils.nopTransformer();

	//================================================
	// ParsingFrame: Parse result buffering
	//================================================
	/** This class represents a single alpha-numeric lexeme (ANL) and its
	 * non-alpha-numeric context. */
	private static class ParsingBufferElement {
		/** Length of the non-alpha-numeric sequence before the lexeme. */
		public final int pref;
		/** Length of the lexeme. */
		public final int midd;
		/** Length of the non-alpha-numeric sequence after the lexeme. */
		public final int suff;
		/** Text of the lexeme. */
		public final String middTxt;
		/** Offset in the source string at which the prefix of this
		 * element starts. */
		public final int offset;
		public ParsingBufferElement(int pref, int midd, int suff,
				String middTxt, int offset) {
			this.pref = pref;  this.midd = midd;
			this.suff = suff;  this.middTxt = middTxt; this.offset = offset;
		}
	}
	/** Stand-in element used by <code>parseOne</code> when there is nothing
	 * to parse, so the frame metrics are reset to zero lengths. */
	private final static ParsingBufferElement EMPTY_ELEMENT =
		new ParsingBufferElement(0,0,0,"",0);

	/** This is the parsing buffer that stores the results from parsing of 
	 * the input text. The result is stored as a sequence of ANL elements
	 * with the sizes of their non-alpha-numeric prefix and suffix. */
	private final ArrayList<ParsingBufferElement> parsingBuffer =
		new ArrayList<ParsingBufferElement>();

	//================================================
	// ParsingFrame: The definition data of the frame
	//================================================
	/** Index of the last buffer element covered by the frame. */
	private int parsingIx = -1;
	/** Index of the first buffer element covered by the frame. */
	private int frameIx = 0;
	/** Index at which the next <code>parseOne</code> call restarts the
	 * frame, or a negative value when no restart is pending. */
	private int restartIx = 0;
	private boolean frameCanExpand = true;

	private int prefix = 0;
	private int middle = 0;
	private int suffix = 0;
	private final String source;
	private String normalizedAlias = "";
	private int lexemeCount = 0;

	//================================================
	// ParsingFrame: class constructor
	//================================================
	/** The constructor parses the source string with the regular expression
	 * based <code>java.util.regex.Matcher</code> and stores the results in
	 * the parsing buffer <code>parsingBuffer</code>.
	 * @param source - the String to be processed through the
	 * <code>ParsingFrame</code> class
	 */
	public ParsingFrame (String source) {
		Stats.markIt(-1);
		this.source = source;
		Matcher lexMatcher = LEXEME_MATCH_PAT.matcher(source);
		// Length of the non-alpha-numeric run preceding the current lexeme;
		// negative until the first lexeme is found.
		int lastSuff = -1;
		while (lexMatcher.find()) {
			// Everything before the first match is the prefix of the
			// first element.
			if (lastSuff < 0)
				lastSuff = lexMatcher.start();
			int suffLen = lexMatcher.end(2) - lexMatcher.start(2);
			parsingBuffer.add( new ParsingBufferElement(
					lastSuff,
					lexMatcher.end(1) - lexMatcher.start(1),
					suffLen,
					lexMatcher.group(1),
					lexMatcher.start() - lastSuff));
			// The suffix of this element is the prefix of the next one
			lastSuff = suffLen;
		}

		Stats.markIt(7);
	}

	//================================================
	// ParsingFrame: LookUp and Annotation data
	//================================================
	// A hash value of 0 doubles as "not calculated yet"
	private int aliasHash1 = 0;
	private int aliasHash2 = 0;
	private int aliasOffset1 = 0;
	private int aliasOffset2 = 0;
	// Last effective prefix/suffix lengths; -1 forces a recalculation
	private int oldPref=-1;
	private int oldSuff=-1;
	/** Drops the cached hash codes and recalculates the alias offsets for
	 * the full prefix and suffix of the current frame. */
	private void resetLAD() {
		aliasHash1 = 0;
		aliasHash2 = 0;
		oldPref=-1;
		oldSuff=-1;
		this.setNewPrefSufLen(prefix, suffix);
	}
	/** Retrieves the source offset at which the first element of the frame
	 * starts (including its non-alpha-numeric prefix).
	 * @return the offset, or 0 when the parsing buffer is empty
	 */
	private int frameOffset() {
		return parsingBuffer.isEmpty()? 0: parsingBuffer.get(frameIx).offset;
	}
	/** This method changes the effective lengths of the non-alpha-numeric
	 * prefix and suffix. This change affects the results of methods:<br>
	 * <code>getAliasHash2</code>, <code>getLength</code>,
	 * <code>getAliasStart</code> and <code>getAliasEnd</code>.<br>
	 * The last values of the prefix and suffix are stored so if called
	 * again with the same values - no recalculations will be performed.<br>
	 * Lengths larger than the actual prefix or suffix of the frame are
	 * reduced to the actual length. Negative lengths are not validated.
	 * @param prefLen - the new effective prefix length.
	 * @param suffLen - the new effective suffix length.
	 */
	public void setNewPrefSufLen(int prefLen, int suffLen) {
		if (oldPref!=prefLen) {
			prefLen = Math.min(prefLen, prefix);
			aliasOffset1 = frameOffset() + prefix - prefLen;
		}
		// prefLen is compared after the clamping above, so a request that
		// clamps to the previous value does not trigger a recalculation
		if (oldPref!=prefLen || oldSuff!=suffLen) {
			suffLen = Math.min(suffLen, suffix);
			aliasOffset2 = aliasOffset1 + prefLen + middle + suffLen;

			aliasHash2 = 0;  // Recalculation will be done when and if required
		}
		oldPref=prefLen;
		oldSuff=suffLen;
	}

	//================================================
	// ParsingFrame: Frame manipulation methods
	//================================================
	/** Method attempts to extend the frame to cover one more parsing-buffer
	 * element. If a restart is pending (initially, or after a successful
	 * <code>moveOne</code>) the frame is instead re-created with a single
	 * element at the restart position. If successful - recalculates the
	 * frame state metrics.  
	 * @return <b>true</b> if the content of the frame was extended with
	 * one more buffer element
	 */
	public boolean parseOne() {
		if (!frameCanMove())  return false;

		Stats.markIt(-1);
		ParsingBufferElement pbe = EMPTY_ELEMENT;
		boolean oneParsed = false;
		if (restartIx >= 0) {
			frameIx = restartIx;
			lexemeCount = 0;
			oneParsed = (restartIx < parsingBuffer.size());
			if (oneParsed)
				parsingIx = restartIx;
		}
		else {
			oneParsed = ((parsingIx+1) < parsingBuffer.size());
			if (oneParsed)
				parsingIx++;
		}

		if (oneParsed) {
			pbe = parsingBuffer.get(parsingIx);
			lexemeCount++;
		}
		Stats.markIt(8);

		Stats.markIt(-1);
		if (restartIx >= 0) {
			// Fresh frame: take the metrics of the single element
			prefix = pbe.pref;
			middle = pbe.midd;
			suffix = pbe.suff;
			normalizedAlias = H1_DELIMITER + pbe.middTxt + H1_DELIMITER;
		}
		else {
			// Extended frame: the old suffix becomes part of the middle
			middle += suffix + pbe.midd;
			suffix = pbe.suff;
			normalizedAlias += pbe.middTxt + H1_DELIMITER;
		}

		// Allows for preliminary detection of parse ending
		frameCanExpand = oneParsed && ((parsingIx+1) < parsingBuffer.size());
		restartIx = -1;

		resetLAD();
		Stats.markIt(9);

		return oneParsed;
	}

	/** A shortcut method that extends the frame to cover the whole input
	 * string */
	public void parseAll() {
		// NOTE(review): the loop is driven by frameCanExpand only, so it does
		// nothing when a restart is pending while frameCanExpand is false -
		// confirm this is intended.
		while (frameCanExpand)  parseOne();
	}

	/** The method moves the start of the frame - one buffer element forward
	 * and shrinks the size of the frame to one element. The move only takes
	 * effect on the next call of <code>parseOne</code>. Nothing is changed
	 * if a restart is already pending or if the frame already starts at
	 * the last buffer element. */
	public void moveOne() {
		if (restartIx >= 0)  return;
		// Calculate the offset for new parsing start
		restartIx = frameIx + 1;

		// Conditions under which the frame-movement attempt fails
		if (restartIx >= parsingBuffer.size())
			restartIx = -1;
	}

	//==================================================
	// ParsingFrame: Extraction of frame dynamic flags
	//==================================================
	/** @return <b>true</b> if there is at least one more buffer element
	 * after the end of the frame */
	public boolean frameCanExpand() { return frameCanExpand; }
	/** @return <b>true</b> if the frame can be expanded or a restart of the
	 * frame is pending, i.e. <code>parseOne</code> will do some work */
	public boolean frameCanMove() { return frameCanExpand || restartIx >= 0; }

	//==================================================
	// ParsingFrame: Extraction of frame state metrics
	//==================================================
	// All retrieval methods below throw IllegalStateException while a
	// restart of the frame is pending (see checkValid).

	/** Retrieves the <code>HashedAlias</code> related Hash-Code-1. It is 
	 * calculated over the normalized form of the underlying text. A text
	 * transformation with externally provided transformation logic is 
	 * performed prior to hash-code calculation.
	 * @return - the value of the hash-code.
	 */
	public int getAliasHash1() {
		checkValid();
		if (aliasHash1==0)
			aliasHash1 = frameTT.transform(normalizedAlias).hashCode(); 
		return aliasHash1;
	}
	/** Retrieves the <code>HashedAlias</code> related Hash-Code-2. It is 
	 * calculated over the plain underlying text. A text
	 * transformation with externally provided transformation logic is 
	 * performed prior to hash-code calculation.
	 * @return - the value of the hash-code.
	 */
	public int getAliasHash2() {
		checkValid();
		if (aliasHash2==0)
			aliasHash2 = frameTT.transform(source.substring(
					aliasOffset1, aliasOffset2)).hashCode();
		return aliasHash2;
	}
	/** Retrieves the length of the non-alpha-numeric prefix of the frame
	 * @return non-alpha-numeric prefix length
	 */
	public int getPrefixLen() { checkValid(); return prefix; }
	/** Retrieves the length of the middle part of the frame
	 * @return middle part length
	 */
	public int getMiddleLen() { checkValid(); return middle; }
	/** Retrieves the length of the non-alpha-numeric suffix of the frame
	 * @return non-alpha-numeric suffix length
	 */
	public int getSuffixLen() { checkValid(); return suffix; }
	/** Retrieves the offset in the original input text of the frame start
	 * @return frame start offset
	 */
	public int getAliasStart() { checkValid(); return aliasOffset1; }
	/** Retrieves the offset in the original input text of the frame end
	 * @return frame end offset
	 */
	public int getAliasEnd() { checkValid(); return aliasOffset2; }
	/** Retrieves the length of the text corresponding to the current frame
	 * @return frame text length
	 */
	public int getLength() { checkValid(); return aliasOffset2-aliasOffset1; }
	/** Retrieves the count of ANLs in the frame
	 * @return ANL count
	 */
	public int getLexemeCount() { checkValid(); return lexemeCount; }

	/** Guards the retrieval methods: while a restart is pending the frame
	 * metrics do not describe the frame that is about to be parsed.
	 * @throws IllegalStateException if a restart of the frame is pending
	 */
	private void checkValid() {
		if (restartIx >= 0)
			throw new IllegalStateException("No parsed content in the Frame!");
	}

	//========================================
	// Methods overridden for Testing purposes
	//========================================
	/** Two frames are equal when they start at the same source offset,
	 * cover the same number of lexemes and have the same Hash-Code-1.
	 * A frame with a pending restart has no parsed content and is equal
	 * only to itself. The method never throws, also for frames over
	 * a source without any lexemes.
	 */
	@Override
	public boolean equals(Object o) {
		if (this == o)  return true;
		if (!(o instanceof ParsingFrame))  return false;

		ParsingFrame pf = (ParsingFrame) o;
		// getAliasHash1() would throw for a frame without parsed content
		if (pf.restartIx >= 0 || this.restartIx >= 0)  return false;

		return (pf.frameOffset() == this.frameOffset() &&
			pf.lexemeCount == this.lexemeCount &&
			pf.getAliasHash1() == this.getAliasHash1() );
	}

	/** Constant hash code: it is trivially consistent with
	 * <code>equals</code> and does not change while the frame is moved. */
	@Override
	public int hashCode() {
		return 1;
	}
	
	/** Builds a diagnostic description of the frame: normalized text,
	 * frame offset / source length, the prefix-middle-suffix lengths and
	 * the expand / relocate flags. */
	@Override
	public String toString() {
		return this.normalizedAlias +
		"(" + frameOffset() + "/" +
		source.length() + ")" +
		prefix + "-" + middle + "-" + suffix +
		((frameCanExpand)? " CAN-expand": " NO-expand") + " " +
		((restartIx>=0)? ("RELOCATE="+restartIx): "");
	}
}