GATE.ac.uk - gate/plugins/Tagger_NP_Chunking/src/mark/chunking/Rule.java

/************************************************************************
 *         Copyright (C) 2004-2009 The University of Sheffield          *
 *       Developed by Mark Greenwood <m.greenwood@dcs.shef.ac.uk>       *
 *                                                                      *
 * This program is free software; you can redistribute it and/or modify *
 * it under the terms of the GNU Lesser General Public License as       *
 * published by the Free Software Foundation; either version 2.1 of the *
 * License, or (at your option) any later version.                      *
 *                                                                      *
 * This program is distributed in the hope that it will be useful,      *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of       *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        *
 * GNU General Public License for more details.                         *
 *                                                                      *
 * You should have received a copy of the GNU Lesser General Public     *
 * License along with this program; if not, write to the Free Software  *
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.            *
 ************************************************************************/

package mark.chunking;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * This class encapsulates a single chunking rule, providing methods to
 * see if it matches against a sentence at a given token and to see if
 * it fits within a sentence.
 * <p>
 * A rule is built from one line of text which is split at single spaces:
 * the first piece is the template (a sequence of parts, each a type
 * letter T, W or P followed by one or more offsets), the last piece is
 * the chunk tag to assign, and the pieces in between are consumed, in
 * order, as the value for each part of the template.
 **/
public class Rule
{
	/**
	 * A Pattern which will split the rule name into type and offsets.
	 * Each match is one part: a type letter followed by its offsets.
	 **/
	private static final Pattern pt = Pattern.compile("(T|W|P)(_?[0-3])+");

	/**
	 * A Pattern which will split up a sequence of offsets. Each offset
	 * is a digit from 0 to 3, optionally preceded by '_' which stands
	 * in for a minus sign.
	 **/
	private static final Pattern po = Pattern.compile("_?[0-3]");

	/**
	 * The new chunk tag that is assigned if this rule matches the
	 * input text.
	 **/
	private final String outTag;

	/**
	 * The full line of the rules file which created this rule,
	 * used mainly for the toString() method.
	 **/
	private final String rule;

	/**
	 * A List to hold the types (T, W or P)
	 * of the parts of the rule.
	 **/
	private final List<String> types = new ArrayList<String>();

	/**
	 * A List to hold the offsets for the
	 * parts of the rule (one inner list per part).
	 **/
	private final List<List<Integer>> offsets = new ArrayList<List<Integer>>();

	/**
	 * A List to hold the values for the
	 * parts of the rule.
	 **/
	private final List<String> values = new ArrayList<String>();

	/**
	 * The smallest offset used within this rule, or
	 * Integer.MAX_VALUE if the rule has no parts.
	 **/
	private final int begin;

	/**
	 * The largest offset used within this rule, or
	 * Integer.MIN_VALUE if the rule has no parts.
	 **/
	private final int end;

	/**
	 * Builds a rule from its textual definition.
	 * @param rule the defining text: a template, then one value per
	 *             part of the template, then the new chunk tag, all
	 *             separated by single spaces.
	 **/
	public Rule(String rule)
	{
		//store a copy of the defining text
		this.rule = rule;

		//split the rule into pieces at the spaces
		String[] parts = rule.split(" ");

		//store the last part of the rule as the out tag
		outTag = parts[parts.length-1];

		//Use the Pattern to split the template type
		//into the different token/offsets
		Matcher mt = pt.matcher(parts[0]);

		//We know that the first value is at position 1
		//in the split array
		int index = 1;

		//track the extreme offsets locally so that the
		//fields can be assigned exactly once
		int smallest = Integer.MAX_VALUE;
		int largest = Integer.MIN_VALUE;

		while (mt.find())
		{
			//while there are still parts to process,
			//get the next one
			String to = mt.group();

			//store the type of this part
			types.add(to.substring(0,1));

			//store the value associated with this part of the rule
			values.add(parts[index++]);

			//create a new list to hold the offsets
			//for this part
			List<Integer> ofs = new ArrayList<Integer>();

			//split the offsets into separate parts
			Matcher mo = po.matcher(to.substring(1));

			while (mo.find())
			{
				//get the next offset and parse it as an int
				//(we have to replace '_' by '-' first for it to work)
				int offset = Integer.parseInt(mo.group().replace('_','-'));

				//widen the known range of offsets if necessary
				if (offset < smallest) smallest = offset;
				if (offset > largest) largest = offset;

				//store the offset in the list
				ofs.add(Integer.valueOf(offset));
			}

			//store the list of offsets for this part
			offsets.add(ofs);
		}

		begin = smallest;
		end = largest;
	}

	/**
	 * Simply returns true if this rule matches against the sentence at
	 * a given position. This method makes no alterations to the
	 * tags assigned to any specific offset.
	 * <p>
	 * Every part of the rule must match; a part matches if its value
	 * is found at any one of its offsets in the relevant list.
	 * @param currentToken the index within the sentence of the token
	 *                     upon which the rule is centered.
	 * @param words an ordered List of the words within the sentence.
	 * @param tags an ordered List of the chunk tags within the sentence.
	 * @param pos an ordered List of the POS tags within the sentence.
	 * @return true if the rule matches the input sentence, false otherwise.
	 **/
	public boolean match(int currentToken, List<String> words, List<String> tags, List<String> pos)
	{
		//if the rule doesn't fit within the sentence then it can never
		//match so simply return false
		if (!withinSentence(words.size(), currentToken)) return false;

		//loop through all the parts of this rule
		for (int i = 0 ; i < types.size() ; ++i)
		{
			//get the current type
			String type = types.get(i);

			//pick the list this part is tested against
			List<String> working;

			if (type.equals("T"))
			{
				//"T" parts are tested against the chunk tags
				working = tags;
			}
			else if (type.equals("W"))
			{
				//"W" parts are tested against the words
				working = words;
			}
			else
			{
				//the template pattern only allows T, W or P
				//so this must be "P": test against the POS tags
				working = pos;
			}

			//as soon as one part fails the whole rule fails, so
			//there is no point looking at the remaining parts
			if (!matchesAnyOffset(working, currentToken, offsets.get(i), values.get(i))) return false;
		}

		//every part of the rule matched
		return true;
	}

	/**
	 * Checks whether a value occurs at any of the given offsets
	 * relative to the current token, stopping at the first hit.
	 * @param working the list (words, chunk tags or POS tags) to look in.
	 * @param currentToken the index upon which the rule is centered.
	 * @param ofs the offsets, relative to currentToken, to inspect.
	 * @param value the value that must be found at one of the offsets.
	 * @return true if at least one offset holds the value, false otherwise.
	 **/
	private static boolean matchesAnyOffset(List<String> working, int currentToken, List<Integer> ofs, String value)
	{
		for (int j = 0 ; j < ofs.size() ; ++j)
		{
			int offset = ofs.get(j).intValue();

			if (working.get(currentToken+offset).equals(value)) return true;
		}

		return false;
	}

	/**
	 * Simply returns the new chunk tag to use if this rule matched.
	 * @return the new chunk tag.
	 **/
	public String getNewTag()
	{
		//simply return the out tag
		return outTag;
	}

	/**
	 * A method which allows you to check that this rule fits within
	 * the sentence when centered on a specific token, i.e. that both
	 * the smallest and the largest offset land on a valid index.
	 * A rule without any parts never fits.
	 * @param numTokens the total number of tokens in the sentence.
	 * @param currentToken the index of the token upon which the
	 *        rule is going to be centered.
	 * @return true if the rule fits within the sentence, false otherwise.
	 **/
	public boolean withinSentence(int numTokens, int currentToken)
	{
		//with no parts begin/end still hold their sentinel values,
		//so reject explicitly rather than relying on int overflow
		if (types.isEmpty()) return false;

		int start = currentToken + begin;
		int finish = currentToken + end;

		if (start < 0 || start >= numTokens) return false;

		return finish >= 0 && finish < numTokens;
	}

	/**
	 * Returns the line of the rules file from which this rule was built.
	 * @return the defining text of this rule.
	 **/
	@Override public String toString()
	{
		//simply return the line of the rules file
		return rule;
	}
}