/*
 *    KEAFilter.java
 *    Copyright (C) 2000, 2001 Eibe Frank
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

package kea;

import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.StringTokenizer;
import java.util.Vector;

import weka.classifiers.Classifier;
import weka.classifiers.bayes.NaiveBayesSimple;
import weka.classifiers.meta.FilteredClassifier;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.supervised.attribute.Discretize;

/**
 * This filter converts the incoming data into data appropriate for
 * keyphrase classification. It assumes that the dataset contains two
 * string attributes. The first attribute should contain the text of a
 * document. The second attribute should contain the keyphrases
 * associated with that document (if present). 
 *
 * The filter converts every instance (i.e. document) into a set of
 * instances, one for each word-based n-gram in the document. The
 * string attribute representing the document is replaced by some
 * numeric features, the estimated probability of each n-gram being a
 * keyphrase, and the rank of this phrase in the document according to
 * the probability. Each new instance also has a class value
 * associated with it. The class is "true" if the n-gram is a true
 * keyphrase, and "false" otherwise. Of course, if the input document
 * doesn't come with author-assigned keyphrases, the class values for
 * that document will be missing.  
 *
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version 1.0
 */
@SuppressWarnings({"serial","rawtypes","unchecked","cast","unused"})
public class KEAFilter extends Filter implements OptionHandler {
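
  /*
   * Illustrative usage sketch (not part of the original source; the
   * attribute names and the loop over documents are assumptions). The
   * filter buffers its input until batchFinished() is called, because
   * the global dictionaries and the classifier are built from the
   * whole training batch:
   *
   *   FastVector atts = new FastVector(2);
   *   atts.addElement(new Attribute("document", (FastVector)null));
   *   atts.addElement(new Attribute("keyphrases", (FastVector)null));
   *   Instances data = new Instances("documents", atts, 0);
   *
   *   KEAFilter filter = new KEAFilter();
   *   filter.setInputFormat(data);
   *   for (String[] doc : documents) {   // assumed {text, keyphrases} pairs
   *     double[] vals = new double[2];
   *     vals[0] = data.attribute(0).addStringValue(doc[0]);
   *     vals[1] = data.attribute(1).addStringValue(doc[1]);
   *     filter.input(new Instance(1.0, vals));
   *   }
   *   filter.batchFinished();
   *   Instance converted;
   *   while ((converted = filter.output()) != null) {
   *     // one output instance per candidate n-gram, carrying its
   *     // TFxIDF value, first occurrence, probability and rank
   *   }
   */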

  /** Index of attribute containing the documents */
  private int m_DocumentAtt = 0;

  /** Index of attribute containing the keyphrases */
  private int m_KeyphrasesAtt = 1;

  /** The maximum length of phrases */
  private int m_MaxPhraseLength = 3;

  /** The minimum length of phrases */
  private int m_MinPhraseLength = 1;

  /** Is keyphrase frequency attribute being used? */
  private boolean m_KFused = false;

  /** Flag for debugging mode */
  private boolean m_Debug = false;

  /** Determines whether internal periods are allowed */
  private boolean m_DisallowInternalPeriods = false;

  /** The minimum number of occurrences of a phrase */
  private int m_MinNumOccur = 2;

  /** The number of features describing a phrase */
  private int m_NumFeatures = 2;

  /* Indices of attributes in m_ClassifierData */
  private int m_TfidfIndex = 0;
  private int m_FirstOccurIndex = 1;
  private int m_KeyFreqIndex = 2;

  /** The punctuation filter used by this filter */
  private KEAPhraseFilter m_PunctFilter = null;

  /** The numbers filter used by this filter */
  private NumbersFilter m_NumbersFilter = null;
 
  /** The actual classifier used to compute probabilities */
  private Classifier m_Classifier = null;

  /** The dictionary containing the document frequencies */
  private HashMap m_Dictionary = null;

  /** The dictionary containing the keyphrases */
  private HashMap m_KeyphraseDictionary = null;

  /** The number of documents in the global frequencies corpus */
  private int m_NumDocs = 0;

  /** Template for the classifier data */
  private Instances m_ClassifierData = null;

  /** The stemmer to be used */
  private Stemmer m_Stemmer = new IteratedLovinsStemmer();

  /** The list of stop words to be used */
  private Stopwords m_Stopwords = new StopwordsEnglish();

  /** Determines whether check for proper nouns is performed */
  private boolean m_CheckForProperNouns = true;

  /**
   * Gets whether the check for proper nouns is performed.
   * @return true if candidate phrases are checked for proper nouns.
   */
  public boolean getCheckForProperNouns() {
    return m_CheckForProperNouns;
  }

  /**
   * Sets whether the check for proper nouns is performed.
   * @param newCheckForProperNouns the new value.
   */
  public void setCheckForProperNouns(boolean newCheckForProperNouns) {
    this.m_CheckForProperNouns = newCheckForProperNouns;
  }
  
  /**
   * Gets the stopwords handler to be used.
   * @return the stopwords handler.
   */
  public Stopwords getStopwords() {
    return m_Stopwords;
  }

  /**
   * Sets the stopwords handler to be used.
   * @param newStopwords the new stopwords handler.
   */
  public void setStopwords(Stopwords newStopwords) {
    this.m_Stopwords = newStopwords;
  }

  
  /**
   * Get the Stemmer value.
   * @return the Stemmer value.
   */
  public Stemmer getStemmer() {

    return m_Stemmer;
  }

  /**
   * Set the Stemmer value.
   * @param newStemmer The new Stemmer value.
   */
  public void setStemmer(Stemmer newStemmer) {

    this.m_Stemmer = newStemmer;
  }

  /**
   * Get the value of MinNumOccur.
   *
   * @return Value of MinNumOccur.
   */
  public int getMinNumOccur() {
    
    return m_MinNumOccur;
  }
  
  /**
   * Set the value of MinNumOccur.
   *
   * @param newMinNumOccur Value to assign to MinNumOccur.
   */
  public void setMinNumOccur(int newMinNumOccur) {
    
    m_MinNumOccur = newMinNumOccur;
  }
  
  /**
   * Get the value of MaxPhraseLength.
   *
   * @return Value of MaxPhraseLength.
   */
  public int getMaxPhraseLength() {
    
    return m_MaxPhraseLength;
  }
  
  /**
   * Set the value of MaxPhraseLength.
   *
   * @param newMaxPhraseLength Value to assign to MaxPhraseLength.
   */
  public void setMaxPhraseLength(int newMaxPhraseLength) {
    
    m_MaxPhraseLength = newMaxPhraseLength;
  }
  
  /**
   * Get the value of MinPhraseLength.
   *
   * @return Value of MinPhraseLength.
   */
  public int getMinPhraseLength() {
    
    return m_MinPhraseLength;
  }
  
  /**
   * Set the value of MinPhraseLength.
   *
   * @param newMinPhraseLength Value to assign to MinPhraseLength.
   */
  public void setMinPhraseLength(int newMinPhraseLength) {
    
    m_MinPhraseLength = newMinPhraseLength;
  }

  /**
   * Returns the index of the stemmed phrases in the output ARFF file.
   */
  public int getStemmedPhraseIndex() {

    return m_DocumentAtt;
  }

  /**
   * Returns the index of the unstemmed phrases in the output ARFF file.
   */
  public int getUnstemmedPhraseIndex() {

    return m_DocumentAtt + 1;
  }

  /**
   * Returns the index of the phrases' probabilities in the output ARFF file.
   */
  public int getProbabilityIndex() {

    int index = m_DocumentAtt + 4;

    if (m_Debug) {
      if (m_KFused) {
	index++;
      }
    }
    return index;
  }

  /**
   * Returns the index of the phrases' ranks in the output ARFF file.
   */
  public int getRankIndex() {

    return getProbabilityIndex() + 1;
  }

  /**
   * Get the value of DocumentAtt.
   *
   * @return Value of DocumentAtt.
   */
  public int getDocumentAtt() {
    
    return m_DocumentAtt;
  }
  
  /**
   * Set the value of DocumentAtt.
   *
   * @param newDocumentAtt Value to assign to DocumentAtt.
   */
  public void setDocumentAtt(int newDocumentAtt) {
    
    m_DocumentAtt = newDocumentAtt;
  }
  
  /**
   * Get the value of KeyphraseAtt.
   *
   * @return Value of KeyphraseAtt.
   */
  public int getKeyphrasesAtt() {
    
    return m_KeyphrasesAtt;
  }
  
  /**
   * Set the value of KeyphrasesAtt.
   *
   * @param newKeyphrasesAtt Value to assign to KeyphrasesAtt.
   */
  public void setKeyphrasesAtt(int newKeyphrasesAtt) {
    
    m_KeyphrasesAtt = newKeyphrasesAtt;
  }
  
  
  /**
   * Get the value of Debug.
   *
   * @return Value of Debug.
   */
  public boolean getDebug() {
    
    return m_Debug;
  }
  
  /**
   * Set the value of Debug.
   *
   * @param newDebug Value to assign to Debug.
   */
  public void setDebug(boolean newDebug) {
    
    m_Debug = newDebug;
  }

  /**
   * Sets whether keyphrase frequency attribute is used.
   */
  public void setKFused(boolean flag) {

    m_KFused = flag;
    if (flag) {
      m_NumFeatures = 3;
    } else {
      m_NumFeatures = 2;
    }
  }

  /**
   * Gets whether keyphrase frequency attribute is used.
   */
  public boolean getKFused() {

    return m_KFused;
  }

  /**
   * Gets whether internal periods are disallowed in phrases.
   *
   * @return true if internal periods are disallowed
   */
  public boolean getDisallowInternalPeriods() {

    return m_DisallowInternalPeriods;
  }

  /**
   * Sets whether internal periods are disallowed in phrases.
   *
   * @param disallow true if internal periods should be disallowed
   */
  public void setDisallowInternalPeriods(boolean disallow) {

    m_DisallowInternalPeriods = disallow;
  }

  /**
   * Parses a given list of options controlling the behaviour of this object.
   * Valid options are:<p>
   *
   * -K<br>
   * Specifies whether keyphrase frequency statistic is used.<p>
   *
   * -M length<br>
   * Sets the maximum phrase length (default: 3).<p>
   *
   * -L length<br>
   * Sets the minimum phrase length (default: 1).<p>
   *
   * -D<br>
   * Turns debugging mode on.<p>
   *
   * -I index<br>
   * Sets the index of the attribute containing the documents (default: 0).<p>
   *
   * -J index<br>
   * Sets the index of the attribute containing the keyphrases (default: 1).<p>
   *
   * -P<br>
   * Disallow internal periods <p>
   *
   * -O number<br>
   * The minimum number of times a phrase needs to occur (default: 2). <p>
   *
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {

    setKFused(Utils.getFlag('K', options));
    setDebug(Utils.getFlag('D', options));
    String docAttIndexString = Utils.getOption('I', options);
    if (docAttIndexString.length() > 0) {
      setDocumentAtt(Integer.parseInt(docAttIndexString) - 1);
    } else {
      setDocumentAtt(0);
    }
    String keyphraseAttIndexString = Utils.getOption('J', options);
    if (keyphraseAttIndexString.length() > 0) {
      setKeyphrasesAtt(Integer.parseInt(keyphraseAttIndexString) - 1);
    } else {
      setKeyphrasesAtt(1);
    }
    String maxPhraseLengthString = Utils.getOption('M', options);
    if (maxPhraseLengthString.length() > 0) {
      setMaxPhraseLength(Integer.parseInt(maxPhraseLengthString));
    } else {
      setMaxPhraseLength(3);
    }
    String minPhraseLengthString = Utils.getOption('L', options);
    if (minPhraseLengthString.length() > 0) {
      setMinPhraseLength(Integer.parseInt(minPhraseLengthString));
    } else {
      setMinPhraseLength(1);
    }
    String minNumOccurString = Utils.getOption('O', options);
    if (minNumOccurString.length() > 0) {
      setMinNumOccur(Integer.parseInt(minNumOccurString));
    } else {
      setMinNumOccur(2);
    }
    setDisallowInternalPeriods(Utils.getFlag('P', options));
   }

  /**
   * Gets the current settings of the filter.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  public String [] getOptions() {

    String [] options = new String [13];
    int current = 0;

    if (getKFused()) {
      options[current++] = "-K";
    }
    if (getDebug()) {
      options[current++] = "-D";
    }
    options[current++] = "-I"; 
    options[current++] = "" + (getDocumentAtt() + 1);
    options[current++] = "-J"; 
    options[current++] = "" + (getKeyphrasesAtt() + 1);
    options[current++] = "-M"; 
    options[current++] = "" + (getMaxPhraseLength());
    options[current++] = "-L"; 
    options[current++] = "" + (getMinPhraseLength());
    options[current++] = "-O"; 
    options[current++] = "" + (getMinNumOccur());

    if (getDisallowInternalPeriods()) {
      options[current++] = "-P";
    }

    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }

  /**
   * Returns an enumeration describing the available options
   *
   * @return an enumeration of all the available options
   */
  public Enumeration listOptions() {

    Vector newVector = new Vector(8);

    newVector.addElement(new Option(
	      "\tSpecifies whether keyphrase frequency statistic is used.",
              "K", 0, "-K"));
    newVector.addElement(new Option(
	      "\tSets the maximum phrase length (default: 3).",
              "M", 1, "-M <length>"));
    newVector.addElement(new Option(
	      "\tSets the minimum phrase length (default: 1).",
              "L", 1, "-L <length>"));
    newVector.addElement(new Option(
	      "\tTurns debugging mode on.",
              "D", 0, "-D"));
    newVector.addElement(new Option(
	      "\tSets the index of the document attribute (default: 0).",
              "I", 1, "-I <index>"));
    newVector.addElement(new Option(
	      "\tSets the index of the keyphrase attribute (default: 1).",
              "J", 1, "-J <index>"));
    newVector.addElement(new Option(
	      "\tDisallow internal periods.",
              "P", 0, "-P"));
    newVector.addElement(new Option(
	      "\tSets the minimum number of occurrences (default: 2).",
              "O", 1, "-O <number>"));

    return newVector.elements();
  }

  /**
   * Returns a string describing this filter
   *
   * @return a description of the filter suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "Converts incoming data into data appropriate for " +
      "keyphrase classification.";
  }

  /**
   * Sets the format of the input instances.
   *
   * @param instanceInfo an Instances object containing the input
   * instance structure (any instances contained in the object are
   * ignored - only the structure is required).
   * @return true if the outputFormat may be collected immediately 
   */
  public boolean setInputFormat(Instances instanceInfo) throws Exception {

    if (instanceInfo.classIndex() >= 0) {
      throw new Exception("Don't know what do to if class index set!");
    }
    if (!instanceInfo.attribute(m_KeyphrasesAtt).isString() ||
	!instanceInfo.attribute(m_DocumentAtt).isString()) {
      throw new Exception("Keyphrase attribute and document attribute " +
			  "need to be string attributes.");
    }
    m_PunctFilter = new KEAPhraseFilter();
    int[] arr = new int[1];
    arr[0] = m_DocumentAtt;
    m_PunctFilter.setAttributeIndicesArray(arr);
    m_PunctFilter.setInputFormat(instanceInfo);
    m_PunctFilter.setDisallowInternalPeriods(getDisallowInternalPeriods());
    m_NumbersFilter = new NumbersFilter();
    m_NumbersFilter.setInputFormat(m_PunctFilter.getOutputFormat());
    super.setInputFormat(m_NumbersFilter.getOutputFormat());
    return false;
  }

  /**
   * Input an instance for filtering. Ordinarily the instance is processed
   * and made available for output immediately. Some filters require all
   * instances be read before producing output.
   *
   * @param instance the input instance
   * @return true if the filtered instance may now be
   * collected with output().
   * @exception Exception if the input instance was not of the correct 
   * format or if there was a problem with the filtering.
   */
  public boolean input(Instance instance) throws Exception {

    if (getInputFormat() == null) {
      throw new Exception("No input instance format defined");
    }
    if (m_NewBatch) {
      resetQueue();
      m_NewBatch = false;
    }

    if (m_Debug) {
      System.err.println("-- Reading instance");
    }

    m_PunctFilter.input(instance);
    m_PunctFilter.batchFinished();
    instance = m_PunctFilter.output();

    m_NumbersFilter.input(instance);
    m_NumbersFilter.batchFinished();
    instance = m_NumbersFilter.output();
    
    if (m_Dictionary == null) {
      bufferInput(instance);
      return false;
    } else {
      FastVector vector = convertInstance(instance, false);
      Enumeration enumeration = vector.elements();
      while (enumeration.hasMoreElements()) {
	Instance inst = (Instance)enumeration.nextElement();
	push(inst);
      }
      return true;
    }
  }

  /**
   * Signify that this batch of input to the filter is finished. 
   * If the filter requires all instances prior to filtering,
   * output() may now be called to retrieve the filtered instances.
   *
   * @return true if there are instances pending output
   * @exception Exception if no input structure has been defined
   */
  public boolean batchFinished() throws Exception {

    if (getInputFormat() == null) {
      throw new Exception("No input instance format defined");
    }
    if (m_Dictionary == null) {
      buildGlobalDictionaries();
      buildClassifier();
      convertPendingInstances();
    } 
    flushInput();
    m_NewBatch = true;
    return (numPendingOutput() != 0);
  }

  /**
   * Builds the global dictionaries.
   */
  private void buildGlobalDictionaries() throws Exception {

    if (m_Debug) {
      System.err.println("--- Building global dictionaries");
    }
    
    // Build dictionary of n-grams with associated
    // document frequencies
    m_Dictionary = new HashMap();
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      String str = getInputFormat().instance(i).stringValue(m_DocumentAtt);
      HashMap hash = getPhrasesForDictionary(str);
      Iterator it = hash.keySet().iterator();
      while (it.hasNext()) {
	String phrase = (String)it.next();
	Counter counter = (Counter)m_Dictionary.get(phrase);
	if (counter == null) {
	  m_Dictionary.put(phrase, new Counter());
	} else {
	  counter.increment();
	}
      }
    }
    
    if (m_KFused) {
      
      // Build dictionary of n-grams that occur as keyphrases
      // with associated keyphrase frequencies
      m_KeyphraseDictionary = new HashMap();
      for (int i = 0; i < getInputFormat().numInstances(); i++) {
	String str = getInputFormat().instance(i).stringValue(m_KeyphrasesAtt);
	HashMap hash = getGivenKeyphrases(str, false);
	if (hash != null) {
	  Iterator it = hash.keySet().iterator();
	  while (it.hasNext()) {
	    String phrase = (String)it.next();
	    Counter counter = (Counter)m_KeyphraseDictionary.get(phrase);
	    if (counter == null) {
	      m_KeyphraseDictionary.put(phrase, new Counter());
	    } else {
	      counter.increment();
	    }
	  }
	}
      }
    } else {
      m_KeyphraseDictionary = null;
    }
    
    // Set the number of documents in the global corpus
    m_NumDocs = getInputFormat().numInstances();
  }
 
  /**
   * Builds the classifier.
   */
  private void buildClassifier() throws Exception {
    
    // Generate input format for classifier
    FastVector atts = new FastVector();
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
      if (i == m_DocumentAtt) {
	atts.addElement(new Attribute("TFxIDF"));
	atts.addElement(new Attribute("First_occurrence"));
	if (m_KFused) {
	  atts.addElement(new Attribute("Keyphrase_frequency"));
	}
      } else if (i == m_KeyphrasesAtt) {
	FastVector vals = new FastVector(2);
	vals.addElement("False");
	vals.addElement("True");
	atts.addElement(new Attribute("Keyphrase?", vals));
      } 
    }
    m_ClassifierData = new Instances("ClassifierData", atts, 0);
    m_ClassifierData.setClassIndex(m_NumFeatures);
    
    if (m_Debug) {
      System.err.println("--- Converting instances for classifier");
    }
    
    // Convert pending input instances into data for classifier
    for(int i = 0; i < getInputFormat().numInstances(); i++) {
      Instance current = getInputFormat().instance(i);

      // Get the key phrases for the document
      String keyphrases = current.stringValue(m_KeyphrasesAtt);
      HashMap hashKeyphrases = getGivenKeyphrases(keyphrases, false);
      HashMap hashKeysEval = getGivenKeyphrases(keyphrases, true);

      // Get the phrases for the document
      HashMap hash = new HashMap();
      int length = getPhrases(hash, current.stringValue(m_DocumentAtt));

      // Compute the feature values for each phrase and
      // add the instance to the data for the classifier
      Iterator it = hash.keySet().iterator();
      while (it.hasNext()) {
	String phrase = (String)it.next();
	FastVector phraseInfo = (FastVector)hash.get(phrase);
	double[] vals =  featVals(phrase, phraseInfo, true,
				  hashKeysEval, hashKeyphrases, length);
	Instance inst = new Instance(current.weight(), vals);
	m_ClassifierData.add(inst);
      }
    }
    
    if (m_Debug) {
      System.err.println("--- Building classifier");
    }
    
    // Build classifier
    FilteredClassifier fclass = new FilteredClassifier();
    fclass.setClassifier(new NaiveBayesSimple());
    fclass.setFilter(new Discretize());
    m_Classifier = fclass;
    m_Classifier.buildClassifier(m_ClassifierData);

    if (m_Debug) {
      System.err.println(m_Classifier);
    }

    // Save space
    m_ClassifierData = new Instances(m_ClassifierData, 0);
  }

  /** 
   * Computes the feature values for a given phrase.
   */
  private double[] featVals(String phrase, FastVector phraseInfo, 
			    boolean training, HashMap hashKeysEval,
			    HashMap hashKeyphrases, int length) {

    // Compute feature values
    Counter counterLocal = (Counter)phraseInfo.elementAt(1);
    double[] newInst = new double[m_NumFeatures + 1];
	  
    // Compute TFxIDF
    Counter counterGlobal = (Counter)m_Dictionary.get(phrase);
    double localVal = counterLocal.value(), globalVal = 0;
    if (counterGlobal != null) {
      globalVal = counterGlobal.value();
      if (training) {
	globalVal = globalVal - 1;
      }
    }
    
    // Just divide by length to get an approximation of the probability
    // that a phrase in the document is our phrase
    newInst[m_TfidfIndex] = (localVal / ((double)length)) *
      (-Math.log((globalVal + 1)/ ((double)m_NumDocs + 1)));
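
    // In other words, the computation above is
    //
    //   TFxIDF = (localVal / length) * -log((globalVal + 1) / (m_NumDocs + 1))
    //
    // where localVal is the phrase's frequency in this document, length
    // the document's length in words, and globalVal the phrase's document
    // frequency in the corpus (excluding the current document during
    // training); the +1 terms smooth the estimate for phrases that do
    // not occur in the global dictionary.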
	  
    // Compute first occurrence
    Counter counterFirst = (Counter)phraseInfo.elementAt(0);
    newInst[m_FirstOccurIndex] = (double)counterFirst.value() /
      (double)length;
    
    // Is keyphrase frequency attribute being used?
    if (m_KFused) {
      Counter keyphraseC = (Counter)m_KeyphraseDictionary.get(phrase);
      if ((training) && (hashKeyphrases != null) &&
	  (hashKeyphrases.containsKey(phrase))) {
	newInst[m_KeyFreqIndex] = keyphraseC.value() - 1;
      } else {
	if (keyphraseC != null) {
	  newInst[m_KeyFreqIndex] = keyphraseC.value();
	} else {
	  newInst[m_KeyFreqIndex] = 0;
	}
      }
    }

    // Compute class value
    String phraseInEvalFormat = evalFormat((String)phraseInfo.elementAt(2));
    if (hashKeysEval == null) { // no author-assigned keyphrases
      newInst[m_NumFeatures] = Instance.missingValue();
    } else if (!hashKeysEval.containsKey(phraseInEvalFormat)) {
      newInst[m_NumFeatures] = 0; // No keyphrase
    } else {
      hashKeysEval.remove(phraseInEvalFormat);
      newInst[m_NumFeatures] = 1; // Keyphrase
    }
    return newInst;
  }

  /**
   * Sets output format and converts pending input instances.
   */
  private void convertPendingInstances() throws Exception {

    if (m_Debug) {
      System.err.println("--- Converting pending instances");
    }
    
    // Create output format for filter
    FastVector atts = new FastVector();
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
      if (i == m_DocumentAtt) {
	atts.addElement(new Attribute("N-gram", (FastVector)null));
	atts.addElement(new Attribute("N-gram-original", (FastVector)null));
	atts.addElement(new Attribute("TFxIDF"));
	atts.addElement(new Attribute("First_occurrence"));
	if (m_Debug) {
	  if (m_KFused) {
	    atts.addElement(new Attribute("Keyphrase_frequency"));
	  }
	}
	atts.addElement(new Attribute("Probability"));
	atts.addElement(new Attribute("Rank"));
      } else if (i == m_KeyphrasesAtt) {
	FastVector vals = new FastVector(2);
	vals.addElement("False");
	vals.addElement("True");
	atts.addElement(new Attribute("Keyphrase?", vals));
      } else {
	atts.addElement(getInputFormat().attribute(i));
      }
    }
    Instances outFormat = new Instances("KEAdata", atts, 0);
    setOutputFormat(outFormat);
    
    // Convert pending input instances into output data
    for(int i = 0; i < getInputFormat().numInstances(); i++) {
      Instance current = getInputFormat().instance(i);
      FastVector vector = convertInstance(current, true);
      Enumeration enumeration = vector.elements();
      while (enumeration.hasMoreElements()) {
	Instance inst = (Instance)enumeration.nextElement();
	push(inst);
      }
    }
  } 

  /**
   * Converts an instance.
   */
  private FastVector convertInstance(Instance instance, boolean training) 
    throws Exception {

    FastVector vector = new FastVector();

    if (m_Debug) {
      System.err.println("-- Converting instance");
    }

    // Get the key phrases for the document
    HashMap hashKeyphrases = null;
    HashMap hashKeysEval = null;
    if (!instance.isMissing(m_KeyphrasesAtt)) {
      String keyphrases = instance.stringValue(m_KeyphrasesAtt);
      hashKeyphrases = getGivenKeyphrases(keyphrases, false);
      hashKeysEval = getGivenKeyphrases(keyphrases, true);
    }

    // Get the phrases for the document
    HashMap hash = new HashMap();
    int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));

    // Compute number of extra attributes
    int numFeatures = 5;
    if (m_Debug) {
      if (m_KFused) {
	numFeatures = numFeatures + 1;
      }
    } 

    // Set indices of key attributes
    int phraseAttIndex = m_DocumentAtt;
    int tfidfAttIndex = m_DocumentAtt + 2;
    int distAttIndex = m_DocumentAtt + 3;
    int probsAttIndex = m_DocumentAtt + numFeatures - 1;

    // Go through the phrases and convert them into instances
    Iterator it = hash.keySet().iterator();
    while (it.hasNext()) {
      String phrase = (String)it.next();
      FastVector phraseInfo = (FastVector)hash.get(phrase);
      double[] vals =  featVals(phrase, phraseInfo, training,
				hashKeysEval, hashKeyphrases, length);
      Instance inst = new Instance(instance.weight(), vals);
      inst.setDataset(m_ClassifierData);

      // Get probability of phrase being key phrase
      double[] probs = m_Classifier.distributionForInstance(inst);
      double prob = probs[1];

      // Compute attribute values for final instance
      double[] newInst = 
	new double[instance.numAttributes() + numFeatures];
      int pos = 0;
      for (int i = 0; i < instance.numAttributes(); i++) {
	if (i == m_DocumentAtt) {
	    
	  // Add phrase
	  int index = outputFormatPeek().attribute(pos).
	    addStringValue(phrase);
	  newInst[pos++] = index;
	  
	  // Add original version
	  index = outputFormatPeek().attribute(pos).
	    addStringValue((String)phraseInfo.elementAt(2));
	  newInst[pos++] = index;

	  // Add TFxIDF
	  newInst[pos++] = inst.value(m_TfidfIndex);
	    
	  // Add distance
	  newInst[pos++] = inst.value(m_FirstOccurIndex);

	  // Add other features
	  if (m_Debug) {
	    if (m_KFused) {
	      newInst[pos++] = inst.value(m_KeyFreqIndex);
	    }
	  }

	  // Add probability 
	  probsAttIndex = pos;
	  newInst[pos++] = prob;

	  // Set rank to missing (computed below)
	  newInst[pos++] = Instance.missingValue();
	} else if (i == m_KeyphrasesAtt) {
	  newInst[pos++] = inst.classValue();
	} else {
	  newInst[pos++] = instance.value(i);
	}
      }
      Instance ins = new Instance(instance.weight(), newInst);
      ins.setDataset(outputFormatPeek());
      vector.addElement(ins);
    }

    // Add dummy instances for keyphrases that don't occur
    // in the document
    if (hashKeysEval != null) {
      Iterator phrases = hashKeysEval.keySet().iterator();
      while (phrases.hasNext()) {
	String phrase = (String)phrases.next();
	double[] newInst = 
	  new double[instance.numAttributes() + numFeatures];
	int pos = 0;
	for (int i = 0; i < instance.numAttributes(); i++) {
	  if (i == m_DocumentAtt) {
	    
	    // Add phrase
	    int index = outputFormatPeek().attribute(pos).
	      addStringValue(phrase);
	    newInst[pos++] = (double)index;
	    
	    // Add original version
	    index = outputFormatPeek().attribute(pos).
	      addStringValue((String)hashKeysEval.get(phrase));
	    newInst[pos++] = (double)index;
	    
	    // Add TFxIDF
	    newInst[pos++] = Instance.missingValue();
	    
	    // Add distance
	    newInst[pos++] = Instance.missingValue();
	      
	    // Add other features
	    if (m_Debug) {
	      if (m_KFused) {
		newInst[pos++] = Instance.missingValue();
	      }
	    }

	    // Add probability and rank
	    newInst[pos++] = -Double.MAX_VALUE;
	    newInst[pos++] = Instance.missingValue();
	  } else if (i == m_KeyphrasesAtt) {
	    newInst[pos++] = 1; // Keyphrase
	  } else {
	    newInst[pos++] = instance.value(i);
	  } 
	}
	Instance inst = new Instance(instance.weight(), newInst);
	inst.setDataset(outputFormatPeek());
	vector.addElement(inst);
      }
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
      vals[i] = ((Instance)vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
      newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort)
    for (int i = 0; i < vals.length; i++) {
      vals[i] = -((Instance)vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
      newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
      vals[i] = 1 - ((Instance)vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
      newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than their superphrases and set their rank to
    // Integer.MAX_VALUE so that they are effectively discarded
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
      Instance currentInstance = (Instance)vector.elementAt(i);

      // Shortcut: if the phrase is very unlikely, give it the lowest
      // possible rank and continue
      if (Utils.grOrEq(vals[i], 1.0)) {
	currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
	continue;
      }

      // Otherwise scan forward to the last phrase in the list that has
      // the same probability, TFxIDF value, and distance as the current
      // phrase, so that all superphrases with identical feature values
      // are considered in the search below.
      int startInd = i;
      while (startInd < vals.length) {
	Instance inst = (Instance)vector.elementAt(startInd);
	if ((inst.value(tfidfAttIndex) != 
	     currentInstance.value(tfidfAttIndex)) ||
	    (inst.value(probsAttIndex) != 
	     currentInstance.value(probsAttIndex)) ||
	    (inst.value(distAttIndex) !=
	     currentInstance.value(distAttIndex))) {
	  break;
	}
	startInd++;
      }
      String val = currentInstance.stringValue(phraseAttIndex);
      boolean foundSuperphrase = false;
      for (int j = startInd - 1; j >= 0; j--) {
	if (j != i) {
	  Instance candidate = (Instance)vector.elementAt(j);
	  String potSuperphrase = candidate.stringValue(phraseAttIndex);
	  if (val.length() <= potSuperphrase.length()) {
	    if (KEAFilter.contains(val, potSuperphrase)) {
	      foundSuperphrase = true;
	      break;
	    }
	  }
	}
      }
      if (foundSuperphrase) {
	currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
      } else {
	currentInstance.setValue(probsAttIndex + 1, rank++);
      }
    }
    return vector;
  }
    
  /** 
   * Checks whether one phrase is a subphrase of another phrase.
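   * Matching respects word boundaries: for example, "machine" and
   * "learning" are both subphrases of "machine learning", but "chine"
   * is not.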
   */
  private static boolean contains(String sub, String sup) {

    int i = 0;
    while (i + sub.length() - 1 < sup.length()) {
      int j;
      for (j = 0; j < sub.length(); j++) {
	if (sub.charAt(j) != sup.charAt(i + j)) {
	  break;
	}
      }
      if (j == sub.length()) {
	if ((i + j) < sup.length()) {
	  if (sup.charAt(i + j) == ' ') {
	    return true;
	  } else {
	    return false;
	  }
	} else {
	  return true;
	}
      }

      // Skip forward to next space
      do {
	i++;
      } while ((i < sup.length()) && (sup.charAt(i) != ' '));
      i++;
    }
    return false;
  }

  /**
   * Returns a hashtable. Fills the hashtable
   * with the stemmed n-grams occurring in the given string
   * (as keys) and the number of times each occurs.
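   *
   * For example, with m_MinPhraseLength = 1, m_MaxPhraseLength = 2 and
   * no stop words involved, the line "cut energy costs" produces the
   * (stemmed) keys "cut", "energy", "costs", "cut energy" and
   * "energy costs".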
   */
  private HashMap getPhrasesForDictionary(String str) {

    String[] buffer = new String[m_MaxPhraseLength];
    HashMap hash = new HashMap();

    StringTokenizer tok = new StringTokenizer(str, "\n");
    while (tok.hasMoreTokens()) {
      String phrase = tok.nextToken();
      int numSeen = 0;
      StringTokenizer wordTok = new StringTokenizer(phrase, " ");
      while (wordTok.hasMoreTokens()) {
	String word = wordTok.nextToken();
	
	// Store word in buffer
	for (int i = 0; i < m_MaxPhraseLength - 1; i++) {
	  buffer[i] = buffer[i + 1];
	}
	buffer[m_MaxPhraseLength - 1] = word;
	
	// How many are buffered?
	numSeen++;
	if (numSeen > m_MaxPhraseLength) {
	  numSeen = m_MaxPhraseLength;
	}
	
	// Don't consider phrases that end with a stop word
	if (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - 1])) {
	  continue;
	}
	
	// Loop through buffer and add phrases to hashtable
	StringBuffer phraseBuffer = new StringBuffer();
	for (int i = 1; i <= numSeen; i++) {
	  if (i > 1) {
	    phraseBuffer.insert(0, ' ');
	  }
	  phraseBuffer.insert(0, buffer[m_MaxPhraseLength - i]);
	  
	  // Don't consider phrases that begin with a stop word
	  if ((i > 1) && 
	      (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - i]))) {
	    continue;
	  }
	  
	  // Only consider phrases with minimum length
	  if (i >= m_MinPhraseLength) {
	    
	    // Stem string
	    String orig = phraseBuffer.toString();
	    String internal = internalFormat(orig);
	    Counter count = (Counter)hash.get(internal);
	    if (count == null) {
	      hash.put(internal, new Counter());
	    } else {
	      count.increment();
	    }
	    
	    // Add components if phrase is single word before
	    // conversion into internal format (i.e. error-correcting)
	    /*if ((orig.indexOf(' ') == -1) &&
	      (internal.indexOf(' ') != -1)) {
	      StringTokenizer tokW = new StringTokenizer(internal, " ");
	      while (tokW.hasMoreTokens()) {
	      String comp = (String)tokW.nextToken();
	      Counter countW = (Counter)hash.get(comp);
	      if (countW == null) {
	      hash.put(comp, new Counter());
	      } else {
	      countW.increment();
	      }
	      }
	      }*/
	  }
	}
      }
    }
    return hash;
  }

  /**
   * Expects an empty hashtable. Fills the hashtable
   * with the stemmed n-grams occurring in the given string
   * (as keys). Stores the position, the number of occurrences,
   * and the most commonly occurring original version of
   * each n-gram.
   *
   * N-grams that occur fewer than m_MinNumOccur times are not used.
   *
   * Returns the total number of words (!) in the string.
   */
  private int getPhrases(HashMap hash, String str) {

    String[] buffer = new String[m_MaxPhraseLength];
    
    StringTokenizer tok = new StringTokenizer(str, "\n");
    int pos = 1; 
    while (tok.hasMoreTokens()) {
      String phrase = tok.nextToken();
      int numSeen = 0;
      StringTokenizer wordTok = new StringTokenizer(phrase, " ");
      while (wordTok.hasMoreTokens()) {
	String word = wordTok.nextToken();
	
	// Store word in buffer
	for (int i = 0; i < m_MaxPhraseLength - 1; i++) {
	  buffer[i] = buffer[i + 1];
	}
	buffer[m_MaxPhraseLength - 1] = word;
	
	// How many are buffered?
	numSeen++;
	if (numSeen > m_MaxPhraseLength) {
	  numSeen = m_MaxPhraseLength;
	}
	
	// Don't consider phrases that end with a stop word
	if (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - 1])) {
	  pos++;
	  continue;
	}
	
	// Loop through buffer and add phrases to hashtable
	StringBuffer phraseBuffer = new StringBuffer();
	for (int i = 1; i <= numSeen; i++) {
	  if (i > 1) {
	    phraseBuffer.insert(0, ' ');
	  }
	  phraseBuffer.insert(0, buffer[m_MaxPhraseLength - i]);
	  
	  // Don't consider phrases that begin with a stop word
	  if ((i > 1) && 
	      (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - i]))) {
	    continue;
	  }
	  
	  // Only consider phrases with minimum length
	  if (i >= m_MinPhraseLength) {
	    
	    // Stem string
	    String phrStr = phraseBuffer.toString();
	    String internal = internalFormat(phrStr);
	    FastVector vec = (FastVector)hash.get(internal);
	    if (vec == null) {
	      vec = new FastVector(3);
	      
	      // HashMap for storing all versions
	      HashMap secHash = new HashMap();
	      secHash.put(phrStr, new Counter());
	      
	      // Update hashtable with all the info
	      vec.addElement(new Counter(pos + 1 - i));
	      vec.addElement(new Counter());
	      vec.addElement(secHash);
	      hash.put(internal, vec);
	    } else {
	      
	      // Update number of occurrences
	      ((Counter)((FastVector)vec).elementAt(1)).increment();
	      
	      // Update hashtable storing different versions
	      HashMap secHash = (HashMap)vec.elementAt(2);
	      Counter count = (Counter)secHash.get(phrStr);
	      if (count == null) {
		secHash.put(phrStr, new Counter());
	      } else {
		count.increment();
	      }
	    }
	  }
	}
	pos++;
      }
    }
    
    // Replace secondary hashtables with the most commonly occurring
    // version (the canonical form) of each phrase. Delete phrases that
    // occur too rarely and single words that are proper nouns.
    Iterator phrases = hash.keySet().iterator();
    while (phrases.hasNext()) {
      String phrase = (String)phrases.next();
      FastVector info = (FastVector)hash.get(phrase);

      // Occurring less than m_MinNumOccur?
      if (((Counter)((FastVector)info).elementAt(1)).value() < m_MinNumOccur) {
	phrases.remove();
	continue;
      }

      // Get canonical form
      String canForm = canonicalForm((HashMap)info.elementAt(2));
      if (canForm == null) {
	phrases.remove();
      } else {
	info.setElementAt(canForm, 2);
      }
    }
    return pos;
  }

  /**
   * Creates the canonical form of a phrase.
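   *
   * The canonical form is the most frequently occurring surface version
   * of the phrase. Returns null if the phrase has only one version and
   * that version looks like a proper noun (when the proper-noun check
   * is enabled).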
   */
  private String canonicalForm(HashMap secHash) {

    int max = 0; String bestVersion = null;
    boolean allFullyHyphenated = true;
    int num = 0;
    Iterator versions = secHash.keySet().iterator();
    while (versions.hasNext()) {
      num++;
      String version = (String)versions.next();

      // Are all the words joined up?
      if (!isFullyHyphenated(version)) {
	allFullyHyphenated = false;
      }

      // Check for how often this version occurs
      Counter count = (Counter)secHash.get(version);
      if (count.value() > max) {
	max = count.value();
	bestVersion = version;
      }
    }
    if ((getCheckForProperNouns()) && (num == 1) && properNoun(bestVersion)) {
    //if (allProperNouns) {
      return null;
    } else {
      if (isFullyHyphenated(bestVersion) &&
	  !allFullyHyphenated) {
	bestVersion = bestVersion.replace('-', ' ');
      }
      return bestVersion;
    }
  }

  /**
   * Checks whether the given phrase is
   * fully hyphenated.
   */
  private boolean isFullyHyphenated(String str) {
    
    return (str.indexOf(' ') == -1);
  }

  /**
   * Checks whether the given string is a 
   * proper noun.
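   * A potential proper noun is a single word that starts with an
   * upper-case character and contains at least one non-upper-case
   * character: e.g. "Waikato" qualifies, but "KEA" and "kea" do not.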
   *
   * @return true if it is a potential proper noun
   */
  private static boolean properNoun(String str) {

    // Is it more than one word?
    if (str.indexOf(' ') != -1) {
      return false;
    }

    // Does it start with an upper-case character?
    if (Character.isLowerCase(str.charAt(0))) {
      return false;
    } 

    // Is there at least one character that's
    // not upper-case?
    for (int i = 1; i < str.length(); i++) {
      /*if (Character.isUpperCase(str.charAt(i))) {
	return false;
	}*/
      if (!Character.isUpperCase(str.charAt(i))) {
	return true;
      }
    }
    //return true;
    return false;
  }

  /**
   * Generates the evaluation format of a phrase.
   */
  private String evalFormat(String str) {
    
    return m_Stemmer.stemString(str);
  }

  /** 
   * Generates the internal format of a phrase.
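   * Hyphens, slashes and ampersands are replaced by spaces before
   * stemming, so that e.g. "case-based" is treated like "case based".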
   */
  private String internalFormat(String str) {

    // Remove some non-alphanumeric characters
    str = str.replace('-', ' ');
    str = str.replace('/', ' ');
    str = str.replace('&', ' ');

    // Stem string
    return m_Stemmer.stemString(str);
  }

  /**
   * Gets all the phrases in the given string and puts them into the
   * hashtable.  Also stores the original version of the stemmed
   * phrase in the hash table.  
   */
  private HashMap getGivenKeyphrases(String str,
				       boolean forEval) {

    HashMap hash = new HashMap();
    
    StringTokenizer tok = new StringTokenizer(str, "\n");
    while (tok.hasMoreTokens()) {
      String orig = tok.nextToken();
      orig = orig.trim();
      if (orig.length() > 0) {
	String modified;
	if (!forEval) {
	  modified = internalFormat(orig);
	} else {
	  modified = evalFormat(orig);
	}
	if (!hash.containsKey(modified)) {
	  hash.put(modified, orig);
	} else {
	  if (forEval) {
	    System.err.println("WARNING: stem of author-assigned keyphrase " +
			       orig + " matches existing stem (skipping it)!");
	  }
	}
      }
    }
    if (hash.size() == 0) {
      return null;
    } else {
      return hash;
    }
  }

  /**
   * Main method for testing this class.
   *
   * @param argv should contain arguments to the filter: use -h for help
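   *
   * An illustrative invocation, assuming the standard Weka filter
   * I/O options -i and -o (the file names are placeholders):
   *
   *   java kea.KEAFilter -i documents.arff -o keadata.arff -K -M 3 -L 1 -O 2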
   */
  public static void main(String [] argv) {
    
    try {
      if (Utils.getFlag('b', argv)) {
	Filter.batchFilterFile(new KEAFilter(), argv);
      } else {
	Filter.filterFile(new KEAFilter(), argv);
      }
    } catch (Exception ex) {
      System.out.println(ex.getMessage());
    }
  }
}