GATE.ac.uk - releases/gate-5.1-beta2-build3402-ALL/plugins/Keyphrase_Extraction_Algorithm/src/kea/KEAKeyphraseExtractor.java

/*
 *    KEAKeyphraseExtractor.java
 *    Copyright (C) 2001 Eibe Frank
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
package kea;

import java.io.*;
import java.util.*;

import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.OptionHandler;
import weka.core.Utils;
import weka.core.FastVector;
import weka.core.Option;

/**
 * Extracts keyphrases from the documents in a given directory.
 * Assumes that the file names for the documents end with ".txt".
 * Puts extracted keyphrases into corresponding files ending with
 * ".key" (if those are not already present). Optionally an encoding
 * for the documents/keyphrases can be defined (e.g. for Chinese
 * text). Documents for which ".key" exists, are used for evaluation.
 *
 * Valid options are:<p>
 *
 * -l "directory name"<br>
 * Specifies name of directory.<p>
 *
 * -m "model name"<br>
 * Specifies name of model.<p>
 *
 * -e "encoding"<br>
 * Specifies encoding.<p>
 *
 * -n "number of phrases"<br>
 * Specifies number of phrases to be output (default: 5).<p>
 *
 * -d<br>
 * Turns debugging mode on.<p>
 *
 * -a<br>
 * Also write stemmed phrase and score into ".key" file.<p>
 *
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version 1.0
 */
public class KEAKeyphraseExtractor implements OptionHandler {

  /** Marker value of the encoding option meaning "use the platform default". */
  private static final String DEFAULT_ENCODING = "default";

  /** Number of phrases that is extracted when the -n option is not given. */
  private static final int DEFAULT_NUM_PHRASES = 5;

  /** Size of the character buffer used when reading documents. */
  private static final int READ_BUFFER_SIZE = 8192;

  /** Name of directory */
  String m_dirName = null;

  /** Name of model */
  String m_modelName = null;

  /** Encoding ("default" selects the platform default encoding) */
  String m_encoding = DEFAULT_ENCODING;

  /** Debugging mode? */
  boolean m_debug = false;

  /** The KEA filter object (set by {@link #loadModel()}) */
  KEAFilter m_KEAFilter = null;

  /** The number of phrases to extract. */
  int m_numPhrases = DEFAULT_NUM_PHRASES;

  /** Also write stemmed phrase and score into .key file. */
  boolean m_AdditionalInfo = false;

  /**
   * Get the value of AdditionalInfo.
   *
   * @return true if the stemmed phrase and the score are also written
   * into the ".key" files.
   */
  public boolean getAdditionalInfo() {

    return m_AdditionalInfo;
  }

  /**
   * Set the value of AdditionalInfo.
   *
   * @param newAdditionalInfo Value to assign to AdditionalInfo.
   */
  public void setAdditionalInfo(boolean newAdditionalInfo) {

    m_AdditionalInfo = newAdditionalInfo;
  }

  /**
   * Get the value of numPhrases.
   *
   * @return the number of phrases that is extracted per document.
   */
  public int getNumPhrases() {

    return m_numPhrases;
  }

  /**
   * Set the value of numPhrases.
   *
   * @param newnumPhrases Value to assign to numPhrases.
   */
  public void setNumPhrases(int newnumPhrases) {

    m_numPhrases = newnumPhrases;
  }

  /**
   * Get the value of debug.
   *
   * @return true if debugging output is written to standard error.
   */
  public boolean getDebug() {

    return m_debug;
  }

  /**
   * Set the value of debug.
   *
   * @param newdebug Value to assign to debug.
   */
  public void setDebug(boolean newdebug) {

    m_debug = newdebug;
  }

  /**
   * Get the value of encoding.
   *
   * @return the name of the encoding, or "default" for the platform
   * default.
   */
  public String getEncoding() {

    return m_encoding;
  }

  /**
   * Set the value of encoding.
   *
   * @param newencoding Value to assign to encoding.
   */
  public void setEncoding(String newencoding) {

    m_encoding = newencoding;
  }

  /**
   * Get the value of modelName.
   *
   * @return the name of the file the model is loaded from.
   */
  public String getModelName() {

    return m_modelName;
  }

  /**
   * Set the value of modelName.
   *
   * @param newmodelName Value to assign to modelName.
   */
  public void setModelName(String newmodelName) {

    m_modelName = newmodelName;
  }

  /**
   * Get the value of dirName.
   *
   * @return the name of the directory containing the documents.
   */
  public String getDirName() {

    return m_dirName;
  }

  /**
   * Set the value of dirName.
   *
   * @param newdirName Value to assign to dirName.
   */
  public void setDirName(String newdirName) {

    m_dirName = newdirName;
  }

  /**
   * Parses a given list of options controlling the behaviour of this object.
   * Valid options are:<p>
   *
   * -l "directory name"<br>
   * Specifies name of directory (required).<p>
   *
   * -m "model name"<br>
   * Specifies name of model (required).<p>
   *
   * -e "encoding"<br>
   * Specifies encoding.<p>
   *
   * -n "number of phrases"<br>
   * Specifies number of phrases to be output (default: 5).<p>
   *
   * -d<br>
   * Turns debugging mode on.<p>
   *
   * -a<br>
   * Also write stemmed phrase and score into ".key" file.<p>
   *
   * @param options the list of options as an array of strings
   * @exception Exception if a required option is missing, the number of
   * phrases is not a non-negative integer, or an option is not supported
   */
  public void setOptions(String[] options) throws Exception {

    String dirName = Utils.getOption('l', options);
    if (dirName.length() > 0) {
      setDirName(dirName);
    } else {
      setDirName(null);
      throw new Exception("Name of directory required argument.");
    }

    String modelName = Utils.getOption('m', options);
    if (modelName.length() > 0) {
      setModelName(modelName);
    } else {
      setModelName(null);
      throw new Exception("Name of model required argument.");
    }

    String encoding = Utils.getOption('e', options);
    if (encoding.length() > 0) {
      setEncoding(encoding);
    } else {
      setEncoding(DEFAULT_ENCODING);
    }

    String numPhrases = Utils.getOption('n', options);
    if (numPhrases.length() > 0) {
      int parsed;
      try {
        parsed = Integer.parseInt(numPhrases);
      } catch (NumberFormatException e) {
        // Same exception type as before, but with a message that names
        // the offending option.
        throw new NumberFormatException("Number of phrases (-n) must be "
                                        + "an integer, got: " + numPhrases);
      }
      if (parsed < 0) {
        // A negative count would only fail much later, when the result
        // array is allocated; reject it here instead.
        throw new IllegalArgumentException("Number of phrases (-n) must "
                                           + "not be negative, got: "
                                           + parsed);
      }
      setNumPhrases(parsed);
    } else {
      setNumPhrases(DEFAULT_NUM_PHRASES);
    }

    setDebug(Utils.getFlag('d', options));
    setAdditionalInfo(Utils.getFlag('a', options));
    Utils.checkForRemainingOptions(options);
  }

  /**
   * Gets the current option settings. Options whose value is currently
   * unset (null) are left out, so that the literal text "null" is never
   * handed back to setOptions as if it were a real value. The array is
   * padded with empty strings to a fixed length of 10.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  public String [] getOptions() {

    String [] options = new String [10];
    int current = 0;

    String dirName = getDirName();
    if (dirName != null) {
      options[current++] = "-l";
      options[current++] = dirName;
    }
    String modelName = getModelName();
    if (modelName != null) {
      options[current++] = "-m";
      options[current++] = modelName;
    }
    String encoding = getEncoding();
    if (encoding != null) {
      options[current++] = "-e";
      options[current++] = encoding;
    }
    options[current++] = "-n";
    options[current++] = "" + getNumPhrases();
    if (getDebug()) {
      options[current++] = "-d";
    }
    if (getAdditionalInfo()) {
      options[current++] = "-a";
    }

    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options
   */
  public Enumeration listOptions() {

    Vector newVector = new Vector(6);

    newVector.addElement(new Option(
              "\tSpecifies name of directory.",
              "l", 1, "-l <directory name>"));
    newVector.addElement(new Option(
              "\tSpecifies name of model.",
              "m", 1, "-m <model name>"));
    newVector.addElement(new Option(
              "\tSpecifies encoding.",
              "e", 1, "-e <encoding>"));
    newVector.addElement(new Option(
              "\tSpecifies number of phrases to be output (default: 5).",
              "n", 1, "-n <number of phrases>"));
    newVector.addElement(new Option(
              "\tTurns debugging mode on.",
              "d", 0, "-d"));
    newVector.addElement(new Option(
              "\tAlso write stemmed phrase and score into \".key\" file.",
              "a", 0, "-a"));

    return newVector.elements();
  }

  /**
   * Collects the stems of the file names, i.e. the names of all files in
   * the directory that end with ".txt", with that suffix removed.
   *
   * @return a table whose keys are the stems (each mapped to a Double
   * with value 0)
   * @exception Exception if the directory name is unset or the directory
   * cannot be listed
   */
  public Hashtable collectStems() throws Exception {

    String[] files = null;
    try {
      if (m_dirName != null) {
        files = new File(m_dirName).list();
      }
    } catch (Exception e) {
      // Keep the original message but do not lose the underlying cause.
      throw new Exception("Problem opening directory " + m_dirName, e);
    }
    // File.list() signals "not a directory" or an I/O error by returning
    // null rather than by throwing.
    if (files == null) {
      throw new Exception("Problem opening directory " + m_dirName);
    }

    Hashtable stems = new Hashtable();
    for (int i = 0; i < files.length; i++) {
      String name = files[i];
      if (name.endsWith(".txt")) {
        stems.put(name.substring(0, name.length() - 4), new Double(0));
      }
    }
    return stems;
  }

  /**
   * Extracts keyphrases for every document whose stem is a key of the
   * given table. For each stem the ".txt" file and, if present, the
   * ".key" file are read and passed through the KEA filter. If no ".key"
   * file exists yet, the top ranked phrases are written to a new one.
   * For documents whose output carries a class value, the number of
   * correct phrases is recorded; average and standard deviation are
   * printed to standard error at the end.
   *
   * @param stems table whose keys are the document stems (Strings)
   * @exception Exception if there are no stems, no model has been loaded,
   * or the filter or the output file fails
   */
  public void extractKeyphrases(Hashtable stems) throws Exception {

    Vector stats = new Vector();

    // Check whether there is actually any data
    if (stems.size() == 0) {
      throw new Exception("Couldn't find any data!");
    }
    // Fail early instead of with a NullPointerException in the loop
    if (m_KEAFilter == null) {
      throw new Exception("No model loaded; call loadModel() first.");
    }

    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector)null));
    atts.addElement(new Attribute("keyphrases", (FastVector)null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Extract keyphrases
    Enumeration elem = stems.keys();
    while (elem.hasMoreElements()) {
      String str = (String)elem.nextElement();
      File txtFile = new File(m_dirName + "/" + str + ".txt");
      File keyFile = new File(m_dirName + "/" + str + ".key");
      double[] newInst = new double[2];

      // Document text; an unreadable document becomes a missing value
      try {
        newInst[0] =
          (double)data.attribute(0).addStringValue(readFile(txtFile));
      } catch (Exception e) {
        if (m_debug) {
          System.err.println("Can't read document " + str + ".txt");
        }
        newInst[0] = Instance.missingValue();
      }

      // Existing keyphrases; normally absent for documents to be processed
      try {
        newInst[1] =
          (double)data.attribute(1).addStringValue(readFile(keyFile));
      } catch (Exception e) {
        if (m_debug) {
          System.err.println("No keyphrases for stem " + str + ".");
        }
        newInst[1] = Instance.missingValue();
      }

      data.add(new Instance(1.0, newInst));
      m_KEAFilter.input(data.instance(0));
      data = data.stringFreeStructure();
      if (m_debug) {
        System.err.println("-- Document: " + str);
      }

      // Slot the filter output into position according to its rank
      Instance[] topRankedInstances = new Instance[m_numPhrases];
      Instance inst;
      while ((inst = m_KEAFilter.output()) != null) {
        int index = (int)inst.value(m_KEAFilter.getRankIndex()) - 1;
        if (index >= 0 && index < topRankedInstances.length) {
          topRankedInstances[index] = inst;
        }
      }
      if (m_debug) {
        System.err.println("-- Keyphrases and feature values:");
      }

      // Existing ".key" files are never overwritten
      PrintWriter printer = null;
      if (!keyFile.exists()) {
        printer = openKeyWriter(keyFile);
      }

      double numExtracted = 0, numCorrect = 0;
      try {
        for (int i = 0; i < topRankedInstances.length; i++) {
          Instance top = topRankedInstances[i];
          if (top == null) {
            continue;
          }
          int classIndex = top.numAttributes() - 1;
          if (!top.isMissing(classIndex)) {
            numExtracted += 1.0;
            // Only a present class value can be correct. A missing value
            // is NaN, and (int)NaN is 0, which must not be compared with
            // an attribute value index.
            if ((int)top.value(classIndex) ==
                top.attribute(classIndex).indexOfValue("True")) {
              numCorrect += 1.0;
            }
          }
          if (printer != null) {
            printer.print(top.
                          stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));
            if (m_AdditionalInfo) {
              printer.print("\t");
              printer.print(top.
                            stringValue(m_KEAFilter.getStemmedPhraseIndex()));
              printer.print("\t");
              printer.print(Utils.
                            doubleToString(top.
                                           value(m_KEAFilter.
                                                 getProbabilityIndex()), 4));
            }
            printer.println();
          }
          if (m_debug) {
            System.err.println(top);
          }
        }
      } finally {
        if (printer != null) {
          // Closing the writer flushes it and closes the file stream
          printer.close();
          // PrintWriter never throws on write errors; report them here
          if (printer.checkError()) {
            System.err.println("Problem writing keyphrases to " + keyFile);
          }
        }
      }

      if (numExtracted > 0) {
        if (m_debug) {
          System.err.println("-- " + numCorrect + " correct");
        }
        stats.addElement(new Double(numCorrect));
      }
    }

    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
      st[i] = ((Double)stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    System.err.println("Avg. number of correct keyphrases: " +
                       Utils.doubleToString(avg, 2) + " +/- " +
                       Utils.doubleToString(stdDev, 2));
    System.err.println("Based on " + stats.size() + " documents");
    m_KEAFilter.batchFinished();
  }

  /**
   * Tells whether the platform default encoding is to be used. An unset
   * (null) encoding is treated like "default".
   */
  private boolean usesDefaultEncoding() {

    return m_encoding == null || m_encoding.equals(DEFAULT_ENCODING);
  }

  /**
   * Reads the complete contents of a file using the configured encoding.
   * The underlying file stream is always closed again.
   */
  private String readFile(File file) throws IOException {

    InputStream in = new FileInputStream(file);
    try {
      Reader reader = usesDefaultEncoding()
        ? new InputStreamReader(in)
        : new InputStreamReader(in, m_encoding);
      StringBuffer contents = new StringBuffer();
      char[] buffer = new char[READ_BUFFER_SIZE];
      int numRead;
      while ((numRead = reader.read(buffer)) != -1) {
        contents.append(buffer, 0, numRead);
      }
      return contents.toString();
    } finally {
      in.close();
    }
  }

  /**
   * Opens a writer on the given ".key" file using the configured
   * encoding. The file stream is closed again if the writer cannot be
   * created (e.g. because the encoding is not supported).
   */
  private PrintWriter openKeyWriter(File keyFile) throws IOException {

    FileOutputStream out = new FileOutputStream(keyFile);
    boolean opened = false;
    try {
      PrintWriter writer;
      if (usesDefaultEncoding()) {
        writer = new PrintWriter(out);
      } else {
        writer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
      }
      opened = true;
      return writer;
    } finally {
      if (!opened) {
        out.close();
      }
    }
  }

  /**
   * Loads the extraction model (a serialized KEAFilter) from the file
   * given by the model name. The file is closed again even if reading
   * the model fails.
   *
   * NOTE: this uses Java's native deserialization, which must only be
   * applied to model files from a trusted source.
   *
   * @exception Exception if the file cannot be read or does not contain
   * a KEAFilter
   */
  public void loadModel() throws Exception {

    InputStream fileStream = new FileInputStream(m_modelName);
    try {
      ObjectInputStream in =
        new ObjectInputStream(new BufferedInputStream(fileStream));
      m_KEAFilter = (KEAFilter)in.readObject();
    } finally {
      fileStream.close();
    }
  }

  /**
   * The main method. Parses the options, loads the model and extracts
   * keyphrases for all documents in the directory. On any failure the
   * error and a list of the available options are printed to standard
   * error.
   *
   * @param ops the command line options
   */
  public static void main(String[] ops) {

    KEAKeyphraseExtractor kmb = new KEAKeyphraseExtractor();
    try {
      kmb.setOptions(ops);
      System.err.print("Extracting keyphrases with options: ");
      String[] optionSettings = kmb.getOptions();
      for (int i = 0; i < optionSettings.length; i++) {
        System.err.print(optionSettings[i] + " ");
      }
      System.err.println();
      kmb.loadModel();
      kmb.extractKeyphrases(kmb.collectStems());
    } catch (Exception e) {
      e.printStackTrace();
      System.err.println(e.getMessage());
      System.err.println("\nOptions:\n");
      Enumeration enumeration = kmb.listOptions();
      while (enumeration.hasMoreElements()) {
        Option option = (Option) enumeration.nextElement();
        System.err.println(option.synopsis());
        System.err.println(option.description());
      }
    }
  }
}