GATE.ac.uk - releases/gate-5.1-beta2-build3402-ALL/plugins/Parser

/*
 *  Minipar.java
 *
 *  Copyright (c) 1998-2004, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Niraj Aswani
 *
 *  $Id: Minipar.java 10104 2008-12-23 11:48:42Z ian_roberts $
 */

package minipar;

import java.io.*;
import java.util.*;
import java.net.*;

import gate.*;
import gate.creole.*;
import gate.util.*;
import gate.event.*;
import gate.gui.MainFrame;

/**
 * This class is the implementation of the resource Minipar
 */
public class Minipar extends AbstractLanguageAnalyser implements
		ProcessingResource {

	/**
	 * Name of the temporary file, which is populated with the text of GATE
	 * document in order to process it with Minipar
	 */
	public static final String GATETEXTFILE = "GATESentences";
  
  /**
   * The Minipar executable limits the length of a sentence to 1024 characters.
   * If a sentence is longer than the maximal length it is not sent to MINIPAR7
   */
  public static final long maxSentenceLength = 1024;

	/**
	 * URL of the minipar Data Directory that contains lexicons etc.
	 */
	private URL miniparDataDir;

	/**
	 * URL of the minipar Binary file, it can be either exe or linux,
	 * distributed along with Minipar in GATE.
	 */
	private URL miniparBinary;

	/**
	 * Minipar creates tokens. This variable provides the type name for it.
	 */
	private String annotationTypeName;

	/**
	 * Name of the inputAnnotationSet
	 */
	private String annotationInputSetName;

	/**
	 * Name of the outputAnnotationSet
	 */
	private String annotationOutputSetName;

	/**
	 * Get the MiniparDataDir value.
	 */
	public URL getMiniparDataDir() {
		return miniparDataDir;
	}

	/**
	 * Get the AnnotationTypeName, new annotations are created with this name
	 */
	public String getAnnotationTypeName() {
		return this.annotationTypeName;
	}

	/**
	 * Set the AnnotationTypeName, new annotations are created with this name
	 */
	public void setAnnotationTypeName(String aTypeName) {
		this.annotationTypeName = aTypeName;
	}

	/**
	 * Get the AnnotationInputSetName, source of the annotations to be taken
	 * from and to work on
	 */
	public String getAnnotationInputSetName() {
		return this.annotationInputSetName;
	}

	/**
	 * Get the AnnotationOutputSetName, annotations to be created under this
	 * annotationSet
	 */
	public String getAnnotationOutputSetName() {
		return this.annotationOutputSetName;
	}

	/**
	 * Get the AnnotationInputSetName, source of the annotations to be taken
	 * from and to work on
	 */
	public void setAnnotationInputSetName(String inputSet) {
		this.annotationInputSetName = inputSet;
	}

	/**
	 * Get the AnnotationOutputSetName, annotations to be created under this
	 * annotationSet
	 */
	public void setAnnotationOutputSetName(String outputSet) {
		this.annotationOutputSetName = outputSet;
	}

	/**
	 * Set the MiniparDataDirectory.. This is the directory that Minipar uses to
	 * collect the data for its internal processing. Default location is
	 * minipar_home/data
	 */
	public void setMiniparDataDir(URL newMiniparDataDir) {
		this.miniparDataDir = newMiniparDataDir;
	}

	/**
	 * This is the url of MiniparBinary It should be somewhere located on the
	 * drive where the user has execution rights
	 */
	public URL getMiniparBinary() {
		return miniparBinary;
	}

	/**
	 * This is the url of MiniparBinary It should be somewhere located on the
	 * drive where the user has execution rights
	 */
	public void setMiniparBinary(URL newMiniparBinary) {
		this.miniparBinary = newMiniparBinary;
	}

	/**
	 * Init method is called when the resource is instantiated in GATE. This
	 * checks for the supported operating systems and the mandotary init-time
	 * parameters.
	 */
	public Resource init() throws ResourceInstantiationException {
		// we need to check the operating system
		// And to detect the underlying Operating system
		String osName = System.getProperty("os.name").toLowerCase();
		// Detecting Linux and Windows
		if (osName.toLowerCase().indexOf("linux") == -1
				&& osName.toLowerCase().indexOf("windows") == -1) {
			throw new ResourceInstantiationException(
					"This PR can only be instantiated on Windows/Linux Machine");
		}
		return super.init();
	}

	/**
	 * Minipar Binary file takes a file as an argument, which has one sentence
	 * written on one line. It takes one sentence at a time and parses them one
	 * by one.
	 * 
	 * @return The list containing annotations of type *Sentence*
	 * @throws ExecutionException
	 */
	private ArrayList saveGateSentences(File gateTextFile) throws ExecutionException {

		AnnotationSet allAnnotations;

		// get sentences from document
		allAnnotations = (annotationInputSetName == null || annotationInputSetName
				.equals("")) ? document.getAnnotations() : document
				.getAnnotations(annotationInputSetName);

		if (allAnnotations == null || allAnnotations.size() == 0) {
			throw new ExecutionException(
					"Document doesn't have sentence annotations. "
							+ "please run tokenizer, sentence splitter "
							+ "and then Minipar");
		}

		// Get all "sentences"
		List sentences = new ArrayList((Set) allAnnotations
				.get(SENTENCE_ANNOTATION_TYPE));

		// create an ArrayList to hold all sentences
		ArrayList allSentences = new ArrayList(sentences);

		// sort all sentences by start offset
		Collections.sort(allSentences, new gate.util.OffsetComparator());

    // only the sentences shorter than maxSentenceLength are sent to Minipar
    ArrayList sentencesSent =  new ArrayList(allSentences.size());
    
		// save sentence strings to file for Minipar
		ListIterator sentenceIterator = allSentences.listIterator();
		try {
			String sentenceString = null;
			FileWriter fw = new FileWriter(gateTextFile);
			PrintWriter pw = new PrintWriter(fw, true);
			while (sentenceIterator.hasNext()) {
				Annotation sentence = (Annotation) sentenceIterator.next();
				sentenceString = null;
        Long startOffset = sentence.getStartNode().getOffset();
        Long endOffset = sentence.getEndNode().getOffset();
        // check that the length is inferior to the limit
        long length = endOffset.longValue()-startOffset.longValue();
        if (length>=maxSentenceLength) continue;
				try {
					sentenceString = getDocument().getContent().getContent(
                  startOffset,endOffset).toString();
				} catch (InvalidOffsetException ioe) {
					throw new GateRuntimeException(
							"Invalid offset of the annotation");
				}
				sentenceString = sentenceString.replaceAll("\r", " ");
				sentenceString = sentenceString.replaceAll("\n", " ");
				pw.println(sentenceString);
        sentencesSent.add(sentence);
			}
			fw.close();
		} catch (java.io.IOException except) {
			throw new ExecutionException(except);
		}

		return sentencesSent;
	}

	/**
	 * Core of the wrapper. GATETEXTFILE is processed with Minipar. Minipar
	 * given a text file, binary executable and the location of data directory,
	 * returns a parse, which is then mapped over the GATE document.
	 * 
	 * @param allSentences -
	 *            GATE sentence annotations
	 * @throws ExecutionException
	 */
	private void runMinipar(ArrayList allSentences, File gateTextFile) throws ExecutionException {

		// this should be the miniparBinary + "-p " + getMiniparDataDir +
		// GATETEXTFILE
		File binary = Files.fileFromURL(getMiniparBinary());
		File dataFile = Files.fileFromURL(getMiniparDataDir());
		//String cmdline = binary.getAbsolutePath() + " -p "
		//		+ dataFile.getAbsolutePath() + " -file "
		//		+ gateTextFile.getAbsolutePath();
		String[] cmdline = new String[5];
		cmdline[0] = binary.getAbsolutePath();
		cmdline[1] = "-p";
		cmdline[2] = dataFile.getAbsolutePath();
		cmdline[3] = "-file";
		cmdline[4] = gateTextFile.getAbsolutePath();
		// run minipar and save output
		try {
			String line;
			Process p = Runtime.getRuntime().exec(cmdline);
			BufferedReader input = new BufferedReader(new InputStreamReader(p
					.getInputStream()));

			// this has ArrayList as its each element
			// this element consists of all annotations for that particular
			// sentence
			ArrayList sentenceTokens = new ArrayList();

			// this will have an annotation for each line begining with a number
			ArrayList tokens = new ArrayList();
			outer: while ((line = input.readLine()) != null) {
				WordToken wt = new WordToken();
				// so here whatever we get in line
				// is of our interest only if it begins with any number
				// each line is deliminated with a tab sign
				String[] output = line.split("\t");
				if (output.length < 5)
					continue;
				for (int i = 0; i < output.length; i++) {
					// we ignore case 2 and 3 and 6 and after.. because we don't
					// want
					// that information
					switch (i) {
					case 0:
						// this is a word number
						try {
							int number = Integer.parseInt(output[i].trim());
							// yes this is correct line
							// we need to check if the line number is 1
							// it may be the begining of new sentence
							if (number == 1 && tokens.size() > 0) {
								// we need to add tokens to the sentenceTokens
								sentenceTokens.add(tokens);
								tokens = new ArrayList();
							}
						} catch (NumberFormatException infe) {
							// if we are here, there is something wrong with
							// number
							// ignore this line and continue with next line
							continue outer;
						}
						break;
					case 1:
						// this is the actual word (Token.string)
						wt.word = output[i];
						break;
					case 4:
						// this should be the number and if it is not
						// then we leave it and do not add any head
						try {
							int head = Integer.parseInt(output[i].trim());
							// yes this is the correct head number
							wt.headNumber = head;
						} catch (NumberFormatException nfe) {
							// if we are here, there is something wrong with
							// number
							// ignore this and make headNumber -1 letter on to
							// remember that we don't want headnumber to be
							// inserted as a
							// feature
							wt.headNumber = -1;
						}
						break;
					case 5:
						// this is the relation between head and the current
						// node
						wt.relationWithHead = output[i];
						break;
					default:
						break;
					}
				}

				// here we have parsed the one line and thus now we should add
				// it to the
				// tokens for letter use
				tokens.add(wt);
			}
			if (tokens.size() > 0) {
				sentenceTokens.add(tokens);
			}
			input.close();

			// ok so here we have all the information we need from the minipar
			// in
			// local variables
			// ok so first we would create annotation for each word Token

			AnnotationSet annotSet = (annotationOutputSetName == null || annotationOutputSetName
					.equals("")) ? document.getAnnotations() : document
					.getAnnotations(annotationOutputSetName);

			// size of the sentenceTokens and the allSentences would be always
			// same
			for (int i = 0; i < sentenceTokens.size(); i++) {
				tokens = (ArrayList) sentenceTokens.get(i);

				// we need this to generate the offsets
				Annotation sentence = (Annotation) allSentences.get(i);
				String sentenceString = document.getContent().getContent(
						sentence.getStartNode().getOffset(),
						sentence.getEndNode().getOffset()).toString();

				sentenceString = sentenceString.replaceAll("\r", " ");
				sentenceString = sentenceString.replaceAll("\n", " ");

				// this will hold the position from where it should start
				// searching for
				// the token text
				int startOffset = sentence.getStartNode().getOffset()
						.intValue();

				int index = -1;
				for (int j = 0; j < tokens.size(); j++) {
					// each item here is a separate word token
					WordToken wt = (WordToken) tokens.get(j);
					// ok so find out the offsets
					int stOffset = sentenceString.toLowerCase().indexOf(
							wt.word.toLowerCase(), index)
							+ startOffset;
					int enOffset = stOffset + wt.word.length();
					FeatureMap map = Factory.newFeatureMap();
					map.put("word", wt.word);
					Integer id = annotSet.add(new Long(stOffset), new Long(
							enOffset), annotationTypeName, map);
					wt.annotation = annotSet.get(id);
					index = enOffset - startOffset;
				}
			}

			// now we need to create the children nodes
			for (int i = 0; i < sentenceTokens.size(); i++) {
				tokens = (ArrayList) sentenceTokens.get(i);

				for (int j = 0; j < tokens.size(); j++) {
					WordToken wt = (WordToken) tokens.get(j);
					// read the head node
					// find out the respective word token for that head node
					// and add the current node as its child
					if (wt.headNumber > 0) {
						WordToken headToken = (WordToken) tokens
								.get(wt.headNumber - 1);
						headToken.children.add(wt.annotation.getId());
					}
				}
			}

			// and finally we need to add features to the annotations
			// now we need to create the children nodes
			for (int i = 0; i < sentenceTokens.size(); i++) {
				tokens = (ArrayList) sentenceTokens.get(i);

				for (int j = 0; j < tokens.size(); j++) {
					// for every wordtoken,
					// we look for its head
					// and create annotations
					// the annotation will have the type of relation string
					// and as a features
					// it will have one head ID and one child ID
					// head ID is the id of head
					// and child ID is the id of wt
					WordToken wt = (WordToken) tokens.get(j);
					FeatureMap map = Factory.newFeatureMap();
					if (wt.headNumber > 0) {
						Annotation headAnn = ((WordToken) tokens
								.get(wt.headNumber - 1)).annotation;
						map.put("head_id", headAnn.getId());
						map.put("child_id", wt.annotation.getId());
						map.put("head_word", ((WordToken) tokens
								.get(wt.headNumber - 1)).word);
						map.put("child_word", wt.word);
						// so create the new annotation
						int stOffset1 = headAnn.getStartNode().getOffset()
								.intValue();
						int stOffset2 = wt.annotation.getStartNode()
								.getOffset().intValue();
						int enOffset1 = headAnn.getEndNode().getOffset()
								.intValue();
						int enOffset2 = wt.annotation.getEndNode().getOffset()
								.intValue();
						stOffset1 = stOffset1 < stOffset2 ? stOffset1
								: stOffset2;
						enOffset1 = enOffset1 > enOffset2 ? enOffset1
								: enOffset2;
						annotSet.add(new Long(stOffset1), new Long(enOffset1),
								wt.relationWithHead, map);
					}
				}
			}

			// and finally make the sentenceTokens and tokens to null
			tokens = null;
			sentenceTokens = null;
		} catch (Exception exception) {
			exception.printStackTrace();
			throw new ExecutionException(exception);
		}
	} // end of runMinipar()

	/**
	 * Execute method is called whenever user clicks on the "RUN" button in GATE
	 * GUI. It should be called after the instantiation of the resource. This
	 * method first checks for all mandatory runtime parameters, if not provided
	 * prompts user by throwing an executionexception. For optional parameters
	 * it considers the default values. this method initially given a document,
	 * converts the document text into a text file. This text file is then sent
	 * to runMinipar function in order to parse it with Minipar.
	 */
	public void execute() throws ExecutionException {
		if (document == null)
			throw new ExecutionException("No document to process!");
		if (getMiniparBinary() == null)
			throw new ExecutionException(
					"Please provide the URL for Minipar Binary");
		if (getMiniparDataDir() == null)
			throw new ExecutionException(
					"Minipar requires the location of its data directory "
							+ "(By default it is %Minipar_Home%/data");

    // generate a temporary file for the current document
    File gateTempFile = null;
    try {
    gateTempFile =  File.createTempFile( GATETEXTFILE, ".txt");
    gateTempFile.deleteOnExit();
    }
    catch ( java.io.IOException e){
      throw new ExecutionException("Impossible to generate temp file!");
    }
		// obtain GATE sentence annotations as a list
		ArrayList allSentences = saveGateSentences(gateTempFile);
		// finally runMinipar
		runMinipar(allSentences,gateTempFile);
	}

	/**
	 * WorkToken subclass is used as an internal data structure to hold
	 * temporary information returned by the minipar parser. This information is
	 * then mapped over the GATE annotations.
	 */
	private class WordToken {
		/**
		 * Word is the string, that represents the token in minipar
		 */
		String word;

		/**
		 * Each token in minipar is given a number. This contains the number of
		 * the headToken
		 */
		int headNumber;

		/**
		 * This stores the relation of head word with its children
		 */
		String relationWithHead;

		/**
		 * Contains other instances of WordTokens which have been identified as
		 * children of the headword
		 */
		ArrayList children = new ArrayList();

		/**
		 * GATE annotation that represents the WordToken in GATE
		 */
		gate.Annotation annotation;
	}

}