// GATE.ac.uk - releases/gate-5.1-beta2-build3402-ALL/plugins/Keyphrase_Extraction

/*
 *    Kea.java
 * 
 *    Copyright (c) 1998-2005, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Valentin Tablan, 3/Feb/2005
 *
 *  $Id: Kea.java 6558 2005-02-03 17:28:52 +0000 (Thu, 03 Feb 2005) valyt $
 */
 
package gate.creole.kea;

import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import javax.swing.JFileChooser;
import javax.swing.JOptionPane;
import kea.KEAFilter;
import weka.core.*;
import gate.*;
import gate.creole.*;
import gate.gui.ActionsPublisher;
import gate.gui.MainFrame;
import gate.util.Err;
import gate.util.InvalidOffsetException;

/**
 * This is a wrapper for using the KEA Keyphrase extractor
 * (<a href="http://www.nzdl.org/Kea/">http://www.nzdl.org/Kea/</a>)
 * within the GATE Language Engineering
 * architecture (<a href="http://gate.ac.uk">http://gate.ac.uk</a>).
 * It exposes KEA as a GATE Processing Resource that has two functioning modes:
 * <UL>
 *  <LI>Training mode: when keyphrases (marked as annotations on documents) are
 *  collected and a model is built.
 *  <LI>Application mode: when a built model is applied on documents to the end
 *  of extracting keyphrases.
 * </UL>
 */
public class Kea extends AbstractLanguageAnalyser implements ActionsPublisher{

  /**
   * Default (no-argument) constructor, required by GATE. Does nothing; the
   * actual set-up of the model and of the GUI actions is done by
   * {@link #init()}.
   */
  public Kea() {
  }

  /**
   * Returns the GUI actions published by this PR. The list is created by
   * {@link #init()} and holds a "Save model" and a "Load model" action.
   * @return the list of actions, or <tt>null</tt> if {@link #init()} has not
   * been called yet.
   */
  public List getActions() {
    return this.actions;
  }

  /**
   * Executes this PR. Depending on the state of the {@link #trainingMode}
   * switch it will either train a model or apply it over the documents.<br>
   * Training consists of collecting keyphrase annotations from the input
   * annotation set of the input documents. The first time a trained model is
   * required (either application mode has started or the model is being saved)
   * the actual model ({@link #keaFilter}) will be constructed.<br>
   * The application mode consists of using a trained model to generate
   * keyphrase annotations on the output annotation set of the input documents.
   * @throws ExecutionException if no document has been set, or if any error
   * is raised while (re)initialising, training or applying the KEA filter.
   */
  public void execute() throws gate.creole.ExecutionException {
    //fail with a meaningful message rather than with a NullPointerException
    if(document == null){
      throw new ExecutionException("No document to process!");
    }

    boolean training = trainingMode.booleanValue();

    //reinitialise the KEA filter if already trained
    if(training && trainingFinished){
      //retraining started with a used model
      System.out.println("Reinitialising KEA model...");
      try{
        initModel();
      }catch(Exception e){
        throw new ExecutionException(e);
      }
    }

    //get the clear text from the document.
    String text = document.getContent().toString();
    //generate the first attribute: the text
    //this will be used for both training and application modes.
    double[] newInst = new double[2];
    newInst[0] = (double)data.attribute(0).addStringValue(text);

    if(training){
      //training mode -> we need to collect the keyphrases
      //find the input annotation set.
      AnnotationSet annSet = inputAS == null || inputAS.length() == 0 ?
                             document.getAnnotations() :
                             document.getAnnotations(inputAS);
      //extract the keyphrase annotations
      AnnotationSet kpSet = annSet.get(keyphraseAnnotationType);
      if(kpSet != null && kpSet.size() > 0){
        //use a set to avoid repetitions
        Set keyPhrases = new HashSet();
        //accumulates the value of the second attribute: the distinct
        //keyphrases, one per line
        StringBuffer keyPhrasesBuf = new StringBuffer();

        Iterator keyPhraseIter = kpSet.iterator();
        while(keyPhraseIter.hasNext()){
          //get one keyphrase annotation
          Annotation aKeyPhrase = (Annotation)keyPhraseIter.next();
          try{
            //get the string for the keyphrase annotation
            String keyPhraseStr = document.getContent().
                                  getContent(aKeyPhrase.getStartNode().getOffset(),
                                             aKeyPhrase.getEndNode().getOffset()).
                                  toString();
            //if the keyphrase has not been seen before add it to the string
            //for the second attribute
            if(keyPhrases.add(keyPhraseStr)){
              if(keyPhrasesBuf.length() > 0) keyPhrasesBuf.append('\n');
              keyPhrasesBuf.append(keyPhraseStr);
            }
          }catch(InvalidOffsetException ioe){
            throw new ExecutionException(ioe);
          }
        }
        //all the keyphrases have been enumerated -> create the second attribute
        newInst[1] = (double)data.attribute(1).
                     addStringValue(keyPhrasesBuf.toString());
      }else{
        //no keyphrase annotations
        newInst[1] = Instance.missingValue();
        System.out.println("No keyphrases in document: " + document.getName());
      }
      //hand the new training instance over to the filter
      feedToFilter(newInst);
    }else{
      //application mode -> we need to generate keyphrases
      //build the model if not already done
      if(!trainingFinished) finishTraining();

      //the keyphrases are unknown: this is what the model has to produce
      newInst[1] = Instance.missingValue();
      feedToFilter(newInst);

      //extract the output from the model, keeping the best ranked phrases
      int maxPhrases = phrasesToExtract.intValue();
      Instance[] topRankedInstances = new Instance[maxPhrases];
      Instance inst;
      while((inst = keaFilter.output()) != null){
        //the rank value is used as a 1-based position in the result array
        int index = (int)inst.value(keaFilter.getRankIndex()) - 1;
        //the lower bound guards against a rank that converts to 0 or less,
        //which would otherwise raise an ArrayIndexOutOfBoundsException
        if(index >= 0 && index < maxPhrases){
          topRankedInstances[index] = inst;
        }
      }
      //annotate the document with the results -> create a list with all the
      //keyphrases found by KEA
      List phrases = new ArrayList();
      for(int i = 0; i < topRankedInstances.length; i++){
        if(topRankedInstances[i] != null){
          phrases.add(topRankedInstances[i].
                      stringValue(keaFilter.getUnstemmedPhraseIndex()));
        }
      }
      try{
        //add the actual annotations on the document
        annotateKeyPhrases(phrases);
      }catch(Exception e){
        throw new ExecutionException(e);
      }
    }//application mode
  }//execute

  /**
   * Wraps the given attribute values into an instance, adds it to the
   * dataset, passes it to the KEA filter and then replaces the dataset with
   * its string-free structure (see Weka's
   * <tt>Instances.stringFreeStructure()</tt>), ready for the next document.
   * @param values the two attribute values (document text, keyphrases).
   * @throws ExecutionException if the filter rejects the instance; in that
   * case the dataset is left as it was after adding the instance.
   */
  private void feedToFilter(double[] values) throws ExecutionException{
    data.add(new Instance(1.0, values));
    try{
      keaFilter.input(data.instance(0));
    }catch(Exception e){
      throw new ExecutionException(e);
    }
    data = data.stringFreeStructure();
  }

  /**
   * Annotates the document with all the occurrences of keyphrases from a List.
   * Uses the java.util.regex package to search for occurrences of keyphrases:
   * the phrases are quoted literally and joined into a single
   * case-insensitive alternation. Each match is added to the output
   * annotation set as an annotation of type {@link #keyphraseAnnotationType}
   * and the list itself is stored as the "KEA matched keyphrases" document
   * feature. Nothing is done if the list is <tt>null</tt> or empty.
   * <br>
   * Note: regex alternatives are tried in list order, so a phrase that is a
   * prefix of a later phrase wins at the positions where both would match.
   * @param phrases the list of keyphrases (Strings); <tt>null</tt> and empty
   * entries are ignored when searching.
   * @throws Exception if the annotations cannot be added to the document.
   */
  protected void annotateKeyPhrases(List phrases) throws Exception{
    if(phrases == null || phrases.isEmpty()) return;
    //create a pattern: phrase1|phrase2|..., every phrase quoted literally
    StringBuffer patternBuf = new StringBuffer();
    Iterator phraseIter = phrases.iterator();
    while(phraseIter.hasNext()){
      String phrase = (String)phraseIter.next();
      //an empty alternative would match at every offset of the document
      if(phrase == null || phrase.length() == 0) continue;
      if(patternBuf.length() > 0) patternBuf.append('|');
      //Pattern.quote() also copes with phrases that contain "\E", which
      //would terminate a hand-written \Q...\E block too early
      patternBuf.append(Pattern.quote(phrase));
    }

    if(patternBuf.length() > 0){
      Pattern pattern = Pattern.compile(patternBuf.toString(),
                                        Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

      Matcher matcher = pattern.matcher(document.getContent().toString());
      //find the output annotation set
      AnnotationSet outputSet = outputAS == null || outputAS.length() == 0 ?
                                document.getAnnotations() :
                                document.getAnnotations(outputAS);
      while(matcher.find()){
        //add the new annotation over the matched span
        outputSet.add(Long.valueOf(matcher.start()),
                      Long.valueOf(matcher.end()),
                      keyphraseAnnotationType,
                      Factory.newFeatureMap());
      }
    }
    document.getFeatures().put("KEA matched keyphrases", phrases);
  }//protected void annotateKeyPhrases(List phrases)


  /**
   * Action used to save a trained model. If the model is not built yet it will
   * be built before being saved. The model ({@link #keaFilter}) is written
   * with Java serialisation through a GZIP stream to a file chosen by the
   * user.
   */
  protected class SaveModelAction extends javax.swing.AbstractAction{
    public SaveModelAction(){
      super("Save model");
      putValue(SHORT_DESCRIPTION, "Saves the KEA model to a file");
    }

    /**
     * Starts a low-priority background thread that asks for the target file
     * and writes the model to it. Failures are reported in a dialog and on
     * the GATE error stream.
     * @param evt the triggering event (not used).
     */
    public void actionPerformed(java.awt.event.ActionEvent evt){
      //we need to use a new thread to avoid blocking the GUI
      Runnable runnable = new Runnable(){
        public void run(){
          //get the file to save to
          JFileChooser fileChooser = MainFrame.getFileChooser();
          fileChooser.setFileFilter(fileChooser.getAcceptAllFileFilter());
          fileChooser.setFileSelectionMode(JFileChooser.FILES_ONLY);
          fileChooser.setMultiSelectionEnabled(false);
          if(fileChooser.showSaveDialog(null) != JFileChooser.APPROVE_OPTION){
            //the user has cancelled
            return;
          }
          File file = fileChooser.getSelectedFile();
          //the outermost stream opened so far; whatever is left in here when
          //the try block exits still needs closing
          OutputStream out = null;
          try{
            MainFrame.lockGUI("Saving KEA model...");
            if(!trainingFinished) finishTraining();
            //zip and save the model; the streams are opened one at a time so
            //that the file is still closed if a wrapping constructor fails
            out = new FileOutputStream(file.getCanonicalPath(), false);
            out = new GZIPOutputStream(out);
            ObjectOutputStream oos = new ObjectOutputStream(out);
            out = oos;
            oos.writeObject(keaFilter);
            oos.flush();
            oos.close();
            out = null;
          }catch(Exception e){
            //release the GUI before showing the error dialog
            MainFrame.unlockGUI();
            reportError(e);
          }finally{
            MainFrame.unlockGUI();
            if(out != null) try{
              out.close();
            }catch(IOException ioe){
              reportError(ioe);
            }
          }
        }
      };
      Thread thread = new Thread(runnable, "ModelSaver(serialisation)");
      thread.setPriority(Thread.MIN_PRIORITY);
      thread.start();
    }

    /**
     * Shows an error dialog for the given exception and prints its stack
     * trace to the GATE error stream.
     * @param e the problem to report.
     */
    private void reportError(Exception e){
      JOptionPane.showMessageDialog(MainFrame.getInstance(),
                      "Error!\n"+
                       e.toString(),
                       "Gate", JOptionPane.ERROR_MESSAGE);
      e.printStackTrace(Err.getPrintWriter());
    }
  }

  /**
   * Action for loading a saved model. Once loaded the model will be marked as
   * trained. The file is expected to contain a GZIP-compressed, Java
   * serialised {@link KEAFilter}, as written by {@link SaveModelAction}.
   */
  protected class LoadModelAction extends javax.swing.AbstractAction{
    public LoadModelAction(){
      super("Load model");
      putValue(SHORT_DESCRIPTION, "Loads a KEA model from a file");
    }

    /**
     * Starts a low-priority background thread that asks for the model file
     * and reads the model from it. Failures are reported in a dialog and on
     * the GATE error stream; in that case the current model is kept.
     * @param evt the triggering event (not used).
     */
    public void actionPerformed(java.awt.event.ActionEvent evt){
      Runnable runnable = new Runnable(){
        public void run(){
          //get the file to load from.
          JFileChooser fileChooser = MainFrame.getFileChooser();
          fileChooser.setFileFilter(fileChooser.getAcceptAllFileFilter());
          fileChooser.setFileSelectionMode(JFileChooser.FILES_ONLY);
          fileChooser.setMultiSelectionEnabled(false);
          if(fileChooser.showOpenDialog(null) != JFileChooser.APPROVE_OPTION){
            //the user has cancelled
            return;
          }
          File file = fileChooser.getSelectedFile();
          //the outermost stream opened so far; closed in the finally block
          InputStream in = null;
          try{
            MainFrame.lockGUI("Loading model...");
            //unzip and load the model; the streams are opened one at a time
            //so that the file is still closed if a wrapping constructor fails
            in = new FileInputStream(file);
            in = new GZIPInputStream(in);
            ObjectInputStream ois = new ObjectInputStream(in);
            in = ois;
            //NOTE(security): Java deserialisation can run code contained in
            //the stream, so only model files from trusted sources should be
            //loaded here.
            KEAFilter loadedFilter = (KEAFilter)ois.readObject();
            //install the model and mark it as trained in one go, so that a
            //later failure cannot leave a loaded model flagged as untrained
            keaFilter = loadedFilter;
            trainingFinished = true;
          }catch(Exception e){
            //release the GUI before showing the error dialog
            MainFrame.unlockGUI();
            reportError(e);
          }finally{
            MainFrame.unlockGUI();
            if(in != null) try{
              in.close();
            }catch(IOException ioe){
              reportError(ioe);
            }
          }
        }
      };
      Thread thread = new Thread(runnable, "ModelLoader(serialisation)");
      thread.setPriority(Thread.MIN_PRIORITY);
      thread.start();
    }

    /**
     * Shows an error dialog for the given exception and prints its stack
     * trace to the GATE error stream.
     * @param e the problem to report.
     */
    private void reportError(Exception e){
      JOptionPane.showMessageDialog(MainFrame.getInstance(),
                      "Error!\n"+
                       e.toString(),
                       "Gate", JOptionPane.ERROR_MESSAGE);
      e.printStackTrace(Err.getPrintWriter());
    }
  }

  /**
   * Sets the annotation type to be used for keyphrases. Annotations of this
   * type are read from the input annotation set during training and created
   * in the output annotation set during application.
   * @param keyphraseAnnotationType the name of the annotation type.
   */
  public void setKeyphraseAnnotationType(String keyphraseAnnotationType) {
    this.keyphraseAnnotationType = keyphraseAnnotationType;
  }

  /**
   * Gets the annotation type to be used for keyphrases.
   * @return the name of the annotation type.
   */
  public String getKeyphraseAnnotationType() {
    return this.keyphraseAnnotationType;
  }

  /**
   * Sets the name for the input annotation set, i.e. the set the keyphrase
   * annotations are read from in training mode.
   * @param inputAS the annotation set name; <tt>null</tt> or the empty
   * string select the default annotation set of the document.
   */
  public void setInputAS(String inputAS) {
    this.inputAS = inputAS;
  }

  /**
   * Gets the name for the input annotation set.
   * @return the annotation set name; may be <tt>null</tt> or empty, which
   * stand for the default annotation set.
   */
  public String getInputAS() {
    return this.inputAS;
  }

  /**
   * Sets the name for the output annotation set, i.e. the set the generated
   * keyphrase annotations are added to in application mode.
   * @param outputAS the annotation set name; <tt>null</tt> or the empty
   * string select the default annotation set of the document.
   */
  public void setOutputAS(String outputAS) {
    this.outputAS = outputAS;
  }

  /**
   * Gets the name for the output annotation set.
   * @return the annotation set name; may be <tt>null</tt> or empty, which
   * stand for the default annotation set.
   */
  public String getOutputAS() {
    return this.outputAS;
  }

  /**
   * Initialises this KEA Processing Resource: prints a short banner, runs the
   * superclass initialisation, creates a fresh (untrained) model and builds
   * the list of GUI actions (save model, load model).
   * @return this resource.
   * @throws ResourceInstantiationException if the model cannot be created.
   */
  public Resource init() throws gate.creole.ResourceInstantiationException {
    String[] banner = new String[]{
      "\nThis is KEA (automatic keyphrase extraction)",
      "Details at http://www.nzdl.org/Kea/\n"
    };
    for(int i = 0; i < banner.length; i++){
      System.out.println(banner[i]);
    }

    super.init();

    try{
      initModel();
    }catch(Exception e){
      throw new ResourceInstantiationException(e);
    }

    //the actions offered on the GUI: save first, load second
    List guiActions = new ArrayList();
    guiActions.add(new SaveModelAction());
    guiActions.add(new LoadModelAction());
    actions = guiActions;
    return this;
  }

  /**
   * Initialises the KEA model: creates a new filter and an empty dataset with
   * two string attributes ("doc" and "keyphrases"), copies the current
   * parameter values of this PR to the filter and marks the model as not
   * trained.
   * @throws Exception if the filter does not accept the dataset format.
   */
  protected void initModel()throws Exception{
    KEAFilter freshFilter = new KEAFilter();
    keaFilter = freshFilter;

    //dataset header: the document text and its keyphrases, both as strings
    FastVector attributes = new FastVector(2);
    atts = attributes;
    attributes.addElement(new Attribute("doc", (FastVector)null));
    attributes.addElement(new Attribute("keyphrases", (FastVector)null));
    data = new Instances("keyphrase_training_data", attributes, 0);

    //copy the parameters of this PR over to the filter
    boolean noInternalPeriods = getDisallowInternalPeriods().booleanValue();
    freshFilter.setDisallowInternalPeriods(noInternalPeriods);
    boolean kFrequency = getUseKFrequency().booleanValue();
    freshFilter.setKFused(kFrequency);
    int maxLength = getMaxPhraseLength().intValue();
    freshFilter.setMaxPhraseLength(maxLength);
    int minLength = getMinPhraseLength().intValue();
    freshFilter.setMinPhraseLength(minLength);
    int minOccurrences = getMinNumOccur().intValue();
    freshFilter.setMinNumOccur(minOccurrences);

    freshFilter.setInputFormat(data);
    trainingFinished = false;
  }

  /**
   * Stops the training phase and builds the actual model. Does nothing if
   * the model is already marked as trained.
   * @throws ExecutionException if the filter fails to complete the batch.
   */
  protected void finishTraining() throws ExecutionException{
    if(!trainingFinished){
      try{
        keaFilter.batchFinished();
      }catch(Exception e){
        throw new ExecutionException(e);
      }
      //get rid of the instances left in the filter's output queue
      while(keaFilter.output() != null){
        //nothing to do, the instance is simply discarded
      }
      trainingFinished = true;
    }
  }


  /**
   * Sets the maximum length for a keyphrase. The value is passed to the KEA
   * filter when the model is (re)initialised.
   * @param maxPhraseLength the maximum phrase length.
   */
  public void setMaxPhraseLength(Integer maxPhraseLength) {
    this.maxPhraseLength = maxPhraseLength;
  }

  /**
   * Gets the maximum length for a keyphrase.
   * @return the maximum phrase length.
   */
  public Integer getMaxPhraseLength() {
    return this.maxPhraseLength;
  }

  /**
   * Sets the minimum length for a keyphrase. The value is passed to the KEA
   * filter when the model is (re)initialised.
   * @param minPhraseLength the minimum phrase length.
   */
  public void setMinPhraseLength(Integer minPhraseLength) {
    this.minPhraseLength = minPhraseLength;
  }

  /**
   * Gets the minimum length for a keyphrase.
   * @return the minimum phrase length.
   */
  public Integer getMinPhraseLength() {
    return this.minPhraseLength;
  }

  /**
   * Sets whether periods are disallowed inside keyphrases. The value is
   * passed to the KEA filter when the model is (re)initialised.
   * @param dissallowInternalPeriods <tt>true</tt> to disallow internal
   * periods.
   */
  public void setDisallowInternalPeriods(Boolean dissallowInternalPeriods) {
    this.disallowInternalPeriods = dissallowInternalPeriods;
  }

  /**
   * Gets whether periods are disallowed inside keyphrases.
   * @return <tt>true</tt> if internal periods are disallowed.
   */
  public Boolean getDisallowInternalPeriods() {
    return this.disallowInternalPeriods;
  }

  /**
   * Sets whether the keyphrase frequency statistic is used. The value is
   * passed to the KEA filter when the model is (re)initialised.
   * @param useKFrequency <tt>true</tt> to use the statistic.
   */
  public void setUseKFrequency(Boolean useKFrequency) {
    this.useKFrequency = useKFrequency;
  }

  /**
   * Gets whether the keyphrase frequency statistic is used.
   * @return <tt>true</tt> if the statistic is used.
   */
  public Boolean getUseKFrequency() {
    return this.useKFrequency;
  }

  /**
   * Sets the minimum number of times a phrase needs to occur. The value is
   * passed to the KEA filter when the model is (re)initialised.
   * @param minNumOccur the minimum number of occurrences.
   */
  public void setMinNumOccur(Integer minNumOccur) {
    this.minNumOccur = minNumOccur;
  }

  /**
   * Gets the minimum number of times a phrase needs to occur.
   * @return the minimum number of occurrences.
   */
  public Integer getMinNumOccur() {
    return this.minNumOccur;
  }

  /**
   * Gets the functioning mode of this PR.
   * @return <tt>true</tt> for training mode, <tt>false</tt> for application
   * mode.
   */
  public Boolean getTrainingMode() {
    return this.trainingMode;
  }

  /**
   * Sets the functioning mode of this PR; the value is read at each
   * execution.
   * @param trainingMode <tt>true</tt> for training mode, <tt>false</tt> for
   * application mode.
   */
  public void setTrainingMode(Boolean trainingMode) {
    this.trainingMode = trainingMode;
  }

  /**
   * Sets how many keyphrases should be extracted for each input document in
   * application mode.
   * @param phrasesToExtract the number of top-ranked keyphrases to keep.
   */
  public void setPhrasesToExtract(Integer phrasesToExtract) {
    this.phrasesToExtract = phrasesToExtract;
  }

  /**
   * Gets how many keyphrases are extracted for each input document.
   * @return the number of top-ranked keyphrases to keep.
   */
  public Integer getPhrasesToExtract() {
    return this.phrasesToExtract;
  }

  /**
   * If <tt>true</tt> then the PR is in training mode and will collect
   * keyphrases from the input documents.
   * If <tt>false</tt> then the PR is in application mode and will generate
   * keyphrases on the input documents.
   */
  private Boolean trainingMode;

  /**
   * The annotation type used for the keyphrase annotations.
   */
  private String keyphraseAnnotationType;

  /**
   * The name of the input annotation set.
   */
  private String inputAS;

  /**
   * The name for the output annotation set.
   */
  private String outputAS;

  /**
   * The maximum length for a keyphrase (default 3).
   */
  private Integer maxPhraseLength;

  /**
   * The minimum length for a keyphrase (default 1).
   */
  private Integer minPhraseLength;

  /**
   * Should periods be disallowed inside keyphrases?
   */
  private Boolean disallowInternalPeriods;

  /**
   * Use keyphrase frequency statistic.
   */
  private Boolean useKFrequency;

  /**
   * The minimum number of times a phrase needs to occur (default: 2).
   */
  private Integer minNumOccur;

  /**
   * How many keyphrases should be extracted for each input document?
   */
  private Integer phrasesToExtract;

  /**
   * This flag is used to determine whether the model has been constructed or
   * not. During training mode the training data is simply collected and this
   * flag is set to <tt>false</tt>. The first time the trained model is
   * required (which could be either the first time the application mode is
   * started or when the model is being saved) the model is built from the
   * collected instances and this flag is set to <tt>true</tt>.<br>
   * If this flag is found to be <tt>true</tt> during the training phase (i.e.
   * there is an attempt to train an already trained model) then the current
   * model will be discarded and a new one will be created. The training will
   * be performed using the newly created model.
   */
  protected boolean trainingFinished;

  /**
   * The KEA filter object which incorporates the actual model.
   */
  protected  KEAFilter keaFilter = null;

  /**
   * Data structure used internally to define the dataset.
   */
  protected FastVector atts;

  /**
   * The dataset.
   */
  protected Instances data;

  /**
   * The list of GUI actions available from this PR on popup menus.
   */
  protected List actions;
}