GATE.ac.uk - gate/plugins/Tagger_NP_Chunking/src/mark/chunking/Chunker.java

/************************************************************************
 *         Copyright (C) 2004-2009 The University of Sheffield          *
 *       Developed by Mark Greenwood <m.greenwood@dcs.shef.ac.uk>       *
 *                                                                      *
 * This program is free software; you can redistribute it and/or modify *
 * it under the terms of the GNU Lesser General Public License as       *
 * published by the Free Software Foundation; either version 2.1 of the *
 * License, or (at your option) any later version.                      *
 *                                                                      *
 * This program is distributed in the hope that it will be useful,      *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of       *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        *
 * GNU General Public License for more details.                         *
 *                                                                      *
 * You should have received a copy of the GNU Lesser General Public     *
 * License along with this program; if not, write to the Free Software  *
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.            *
 ************************************************************************/

package mark.chunking;

import gate.util.BomStrippingInputStreamReader;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

public class Chunker
{
	private List<Rule> rules = new ArrayList<Rule>();

	public static void main(String args[]) throws Exception
	{
		Chunker c = new Chunker((new File(args[0])).toURI().toURL());

		BufferedReader in = new BufferedReader(new FileReader(args[1]));

		String line = in.readLine();

		Map<String,String> chunkTags = new HashMap<String,String>();

		while (line != null)
		{
			if (!line.trim().equals(""))
			{
				String[] tags = line.split(" ");
				chunkTags.put(tags[0],tags[1]);
			}

			line = in.readLine();
		}

		in.close();

		in = new BomStrippingInputStreamReader(System.in);

		line = in.readLine();

		while (line != null)
		{
			String[] tokens = line.split(" ");

			List<String> wl = new ArrayList<String>();
			List<String> tl = new ArrayList<String>();
			List<String> pl = new ArrayList<String>();

			for (int i = 0 ; i < tokens.length ; ++i)
			{
				String[] data = tokens[i].split("/");

				wl.add(data[0]);
				pl.add(data[1]);

				String ct = chunkTags.get(data[1]);

				if (ct == null) ct = "I";

				tl.add(ct);
			}

			tl = c.chunkSentence(wl,tl,pl);

			boolean inBaseNP = false;
			boolean lineBegin = true;

			for (int i = 0 ; i < wl.size() ; ++i)
			{
				String ct = tl.get(i);

				if (inBaseNP)
				{
					if (ct.equals("B"))
					{
						System.out.print(" ] [");
					}
					else if (ct.equals("O"))
					{
						System.out.print(" ]");
						inBaseNP = false;
					}
				}
				else
				{
					if (ct.equals("B") || ct.equals("I"))
					{
						if (!lineBegin) System.out.print(" ");
						lineBegin = false;
						System.out.print("[");
						inBaseNP = true;
					}
				}
				if (!lineBegin) System.out.print(" ");
				lineBegin = false;
				System.out.print(wl.get(i) + "/" + pl.get(i));
			}

			if (inBaseNP)
			{
				System.out.print("]");
			}

			System.out.println();

			line = in.readLine();
		}
	}

	/**
	 * The only constructor that reads the rules from a URL.
	 * @param u the URL of the rules file.
	 **/
	public Chunker(URL u) throws IOException
	{
		//Open up the rules file read for reading
		BufferedReader in = new BomStrippingInputStreamReader(u.openStream());

		//read in the first rule from the file
		String rule = in.readLine();

		while (rule != null)
		{
			//while there are still rules to process...

			if (!rule.trim().equals(""))
			{
				//create and add a rule to the list of rules
				rules.add(new Rule(rule));
			}

			//read in the next rule;
			rule = in.readLine();
		}
	}

	/**
	 * This is the method which does all the work and returns
	 * an updated set of chunk tags.
	 * @param words an ordered List of the words within the sentence.
	 * @param tags an ordered List of the chunk tags within the sentence.
	 * @param pos an ordered List of the POS tags within the sentence.
	 * @return an ordered List of the updated chunk tags for the sentence.
	 **/
	public List<String> chunkSentence(List<String> words, List<String> tags, List<String> pos)
	{
		//add the word/pos/tag that represents the end of
		//the sentence, cos some of the rules match against
		//the end of the sentence
		words.add("ZZZ");
		pos.add("ZZZ");
		tags.add("Z");

		//Get an iterator over the rules and loop
		//through them...
		Iterator<Rule> it = rules.iterator();
		while (it.hasNext())
		{
			//create an empty list to hold the new
			//chunk tags for this iterations
			List<String> newTags = new ArrayList<String>();

			//get the next rule we are going to apply
			Rule r = it.next();

			//loop over all the words in the sentence
			for (int i = 0 ; i < words.size() ; ++i)
			{
				if (r.match(i,words,tags,pos))
				{
					//if the rule matches against the current
					//word in the sentence then and the new tag
					//from the rule to the new tag list
					newTags.add(r.getNewTag());
				}
				else
				{
					//the rule didn't match so simply copy the
					//chunk tag that was already assigned
					newTags.add(tags.get(i));
				}
			}

			//now replace the old tags with the new ones ready
			//for running the next rule, this stops rule-chaining
			tags = newTags;
		}

		//remove the last token from each list as these
		//are not part of the original input sentence
		words.remove(words.size()-1);
		pos.remove(pos.size()-1);
		tags.remove(tags.size()-1);

		//return the final updated chunk tag lists
		return tags;
	}
}