/* * KEAKeyphraseExtractor.java * Copyright (C) 2001 Eibe Frank * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ package kea; import java.io.*; import java.util.*; import weka.core.Attribute; import weka.core.Instance; import weka.core.Instances; import weka.core.OptionHandler; import weka.core.Utils; import weka.core.FastVector; import weka.core.Option; /** * Extracts keyphrases from the documents in a given directory. * Assumes that the file names for the documents end with ".txt". * Puts extracted keyphrases into corresponding files ending with * ".key" (if those are not already present). Optionally an encoding * for the documents/keyphrases can be defined (e.g. for Chinese * text). Documents for which ".key" exists, are used for evaluation. * * Valid options are:<p> * * -l "directory name"<br> * Specifies name of directory.<p> * * -m "model name"<br> * Specifies name of model.<p> * * -e "encoding"<br> * Specifies encoding.<p> * * -n <br> * Specifies number of phrases to be output (default: 5).<p> * * -d<br> * Turns debugging mode on.<p> * * -a<br> * Also write stemmed phrase and score into ".key" file.<p> * * @author Eibe Frank (eibe@cs.waikato.ac.nz) * @version 1.0 */ public class KEAKeyphraseExtractor implements OptionHandler { /** Name of directory */ String m_dirName = null; /** Name of model */ String m_modelName = null; /** Encoding */ String m_encoding = "default"; /** Debugging mode? */ boolean m_debug = false; /** The KEA filter object */ KEAFilter m_KEAFilter = null; /** The number of phrases to extract. */ int m_numPhrases = 5; /** Also write stemmed phrase and score into .key file. */ boolean m_AdditionalInfo = false; /** * Get the value of AdditionalInfo. * * @return Value of AdditionalInfo. */ public boolean getAdditionalInfo() { return m_AdditionalInfo; } /** * Set the value of AdditionalInfo. * * @param newAdditionalInfo Value to assign to AdditionalInfo. */ public void setAdditionalInfo(boolean newAdditionalInfo) { m_AdditionalInfo = newAdditionalInfo; } /** * Get the value of numPhrases. * * @return Value of numPhrases. */ public int getNumPhrases() { return m_numPhrases; } /** * Set the value of numPhrases. * * @param newnumPhrases Value to assign to numPhrases. */ public void setNumPhrases(int newnumPhrases) { m_numPhrases = newnumPhrases; } /** * Get the value of debug. * * @return Value of debug. */ public boolean getDebug() { return m_debug; } /** * Set the value of debug. * * @param newdebug Value to assign to debug. */ public void setDebug(boolean newdebug) { m_debug = newdebug; } /** * Get the value of encoding. * * @return Value of encoding. */ public String getEncoding() { return m_encoding; } /** * Set the value of encoding. * * @param newencoding Value to assign to encoding. */ public void setEncoding(String newencoding) { m_encoding = newencoding; } /** * Get the value of modelName. * * @return Value of modelName. */ public String getModelName() { return m_modelName; } /** * Set the value of modelName. * * @param newmodelName Value to assign to modelName. */ public void setModelName(String newmodelName) { m_modelName = newmodelName; } /** * Get the value of dirName. * * @return Value of dirName. */ public String getDirName() { return m_dirName; } /** * Set the value of dirName. * * @param newdirName Value to assign to dirName. */ public void setDirName(String newdirName) { m_dirName = newdirName; } /** * Parses a given list of options controlling the behaviour of this object. * Valid options are:<p> * * -l "directory name"<br> * Specifies name of directory.<p> * * -m "model name"<br> * Specifies name of model.<p> * * -e "encoding"<br> * Specifies encoding.<p> * * -n<br> * Specifies number of phrases to be output (default: 5).<p> * * -d<br> * Turns debugging mode on.<p> * * -a<br> * Also write stemmed phrase and score into ".key" file.<p> * * @param options the list of options as an array of strings * @exception Exception if an option is not supported */ public void setOptions(String[] options) throws Exception { String dirName = Utils.getOption('l', options); if (dirName.length() > 0) { setDirName(dirName); } else { setDirName(null); throw new Exception("Name of directory required argument."); } String modelName = Utils.getOption('m', options); if (modelName.length() > 0) { setModelName(modelName); } else { setModelName(null); throw new Exception("Name of model required argument."); } String encoding = Utils.getOption('e', options); if (encoding.length() > 0) { setEncoding(encoding); } else { setEncoding("default"); } String numPhrases = Utils.getOption('n', options); if (numPhrases.length() > 0) { setNumPhrases(Integer.parseInt(numPhrases)); } else { setNumPhrases(5); } setDebug(Utils.getFlag('d', options)); setAdditionalInfo(Utils.getFlag('a', options)); Utils.checkForRemainingOptions(options); } /** * Gets the current option settings. * * @return an array of strings suitable for passing to setOptions */ public String [] getOptions() { String [] options = new String [10]; int current = 0; options[current++] = "-l"; options[current++] = "" + (getDirName()); options[current++] = "-m"; options[current++] = "" + (getModelName()); options[current++] = "-e"; options[current++] = "" + (getEncoding()); options[current++] = "-n"; options[current++] = "" + (getNumPhrases()); if (getDebug()) { options[current++] = "-d"; } if (getAdditionalInfo()) { options[current++] = "-a"; } while (current < options.length) { options[current++] = ""; } return options; } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options */ public Enumeration listOptions() { Vector newVector = new Vector(6); newVector.addElement(new Option( "\tSpecifies name of directory.", "l", 1, "-l <directory name>")); newVector.addElement(new Option( "\tSpecifies name of model.", "m", 1, "-m <model name>")); newVector.addElement(new Option( "\tSpecifies encoding.", "e", 1, "-e <encoding>")); newVector.addElement(new Option( "\tSpecifies number of phrases to be output (default: 5).", "n", 1, "-n")); newVector.addElement(new Option( "\tTurns debugging mode on.", "d", 0, "-d")); newVector.addElement(new Option( "\tAlso write stemmed phrase and score into \".key\" file.", "a", 0, "-a")); return newVector.elements(); } /** * Collects the stems of the file names. */ public Hashtable collectStems() throws Exception { Hashtable stems = new Hashtable(); try { File dir = new File(m_dirName); String[] files = dir.list(); for (int i = 0; i < files.length; i++) { if (files[i].endsWith(".txt")) { String stem = files[i].substring(0, files[i].length() - 4); if (!stems.containsKey(stem)) { stems.put(stem, new Double(0)); } } } } catch (Exception e) { throw new Exception("Problem opening directory " + m_dirName); } return stems; } /** * Builds the model from the files */ public void extractKeyphrases(Hashtable stems) throws Exception { Vector stats = new Vector(); // Check whether there is actually any data if (stems.size() == 0) { throw new Exception("Couldn't find any data!"); } FastVector atts = new FastVector(2); atts.addElement(new Attribute("doc", (FastVector)null)); atts.addElement(new Attribute("keyphrases", (FastVector)null)); Instances data = new Instances("keyphrase_training_data", atts, 0); // Extract keyphrases Enumeration elem = stems.keys(); while (elem.hasMoreElements()) { String str = (String)elem.nextElement(); double[] newInst = new double[2]; try { File txt = new File(m_dirName + "/" + str + ".txt"); InputStreamReader is; if (!m_encoding.equals("default")) { is = new InputStreamReader(new FileInputStream(txt), m_encoding); } else { is = new InputStreamReader(new FileInputStream(txt)); } StringBuffer txtStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { txtStr.append((char)c); } newInst[0] = (double)data.attribute(0).addStringValue(txtStr.toString()); } catch (Exception e) { if (m_debug) { System.err.println("Can't read document " + str + ".txt"); } newInst[0] = Instance.missingValue(); } try { File key = new File(m_dirName + "/" + str + ".key"); InputStreamReader is; if (!m_encoding.equals("default")) { is = new InputStreamReader(new FileInputStream(key), m_encoding); } else { is = new InputStreamReader(new FileInputStream(key)); } StringBuffer keyStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { keyStr.append((char)c); } newInst[1] = (double)data.attribute(1).addStringValue(keyStr.toString()); } catch (Exception e) { if (m_debug) { System.err.println("No keyphrases for stem " + str + "."); } newInst[1] = Instance.missingValue(); } data.add(new Instance(1.0, newInst)); m_KEAFilter.input(data.instance(0)); data = data.stringFreeStructure(); if (m_debug) { System.err.println("-- Document: " + str); } Instance[] topRankedInstances = new Instance[m_numPhrases]; Instance inst; while ((inst = m_KEAFilter.output()) != null) { int index = (int)inst.value(m_KEAFilter.getRankIndex()) - 1; if (index < m_numPhrases) { topRankedInstances[index] = inst; } } if (m_debug) { System.err.println("-- Keyphrases and feature values:"); } FileOutputStream out = null; PrintWriter printer = null; File key = new File(m_dirName + "/" + str + ".key"); if (!key.exists()) { out = new FileOutputStream(m_dirName + "/" + str + ".key"); if (!m_encoding.equals("default")) { printer = new PrintWriter(new OutputStreamWriter(out, m_encoding)); } else { printer = new PrintWriter(out); } } double numExtracted = 0, numCorrect = 0; for (int i = 0; i < m_numPhrases; i++) { if (topRankedInstances[i] != null) { if (!topRankedInstances[i]. isMissing(topRankedInstances[i].numAttributes() - 1)) { numExtracted += 1.0; } if ((int)topRankedInstances[i]. value(topRankedInstances[i].numAttributes() - 1) == topRankedInstances[i]. attribute(topRankedInstances[i].numAttributes() - 1). indexOfValue("True")) { numCorrect += 1.0; } if (printer != null) { printer.print(topRankedInstances[i]. stringValue(m_KEAFilter.getUnstemmedPhraseIndex())); if (m_AdditionalInfo) { printer.print("\t"); printer.print(topRankedInstances[i]. stringValue(m_KEAFilter.getStemmedPhraseIndex())); printer.print("\t"); printer.print(Utils. doubleToString(topRankedInstances[i]. value(m_KEAFilter. getProbabilityIndex()), 4)); } printer.println(); } if (m_debug) { System.err.println(topRankedInstances[i]); } } } if (numExtracted > 0) { if (m_debug) { System.err.println("-- " + numCorrect + " correct"); } stats.addElement(new Double(numCorrect)); } if (printer != null) { printer.flush(); printer.close(); out.close(); } } double[] st = new double[stats.size()]; for (int i = 0; i < stats.size(); i++) { st[i] = ((Double)stats.elementAt(i)).doubleValue(); } double avg = Utils.mean(st); double stdDev = Math.sqrt(Utils.variance(st)); System.err.println("Avg. number of correct keyphrases: " + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2)); System.err.println("Based on " + stats.size() + " documents"); m_KEAFilter.batchFinished(); } /** * Loads the extraction model from the file. */ public void loadModel() throws Exception { BufferedInputStream inStream = new BufferedInputStream(new FileInputStream(m_modelName)); ObjectInputStream in = new ObjectInputStream(inStream); m_KEAFilter = (KEAFilter)in.readObject(); in.close(); } /** * The main method. */ public static void main(String[] ops) { KEAKeyphraseExtractor kmb = new KEAKeyphraseExtractor(); try { kmb.setOptions(ops); System.err.print("Extracting keyphrases with options: "); String[] optionSettings = kmb.getOptions(); for (int i = 0; i < optionSettings.length; i++) { System.err.print(optionSettings[i] + " "); } System.err.println(); kmb.loadModel(); kmb.extractKeyphrases(kmb.collectStems()); } catch (Exception e) { e.printStackTrace(); System.err.println(e.getMessage()); System.err.println("\nOptions:\n"); Enumeration enumeration = kmb.listOptions(); while (enumeration.hasMoreElements()) { Option option = (Option) enumeration.nextElement(); System.err.println(option.synopsis()); System.err.println(option.description()); } } } }