// Source: releases/gate-7.1-build4485-ALL/plugins/TermRaider/src/gate/termraider/bank/TfIdfTermbank.java
 
/*
 *  Copyright (c) 2008-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  $Id: TfIdfTermbank.java 16296 2012-11-20 12:17:11Z adamfunk $
 */
package gate.termraider.bank;

import gate.creole.metadata.*;
import gate.gui.ActionsPublisher;
import gate.*;
import gate.termraider.bank.modes.IdfCalculation;
import gate.termraider.bank.modes.TfCalculation;
import gate.termraider.util.*;
import java.util.*;



@CreoleResource(name = "TfIdfTermbank",
        icon = "termbank-lr.png",
        comment = "TermRaider Termbank derived from vectors in document features")

/**
 * Termbank whose score for a term is the product of a term-frequency
 * function and a document-frequency function, each selected through a
 * CREOLE parameter ({@link TfCalculation} and {@link IdfCalculation}).
 *
 * <p>The raw counts are gathered one document at a time by
 * {@link #addData(Document)}; {@link #calculateScores()} then turns them
 * into scores, and {@link #resetScores()} discards everything again.
 */
public class TfIdfTermbank extends AbstractTermbank
    implements ActionsPublisher  {

  private static final long serialVersionUID = -8275690376233755995L;

  /* CREOLE parameters specific to this termbank */
  private TfCalculation tfCalculation;
  private IdfCalculation idfCalculation;

  /* Number of documents passed to addData since the last reset */
  private int documentCount;


  /**
   * Folds one document into the running statistics.
   *
   * <p>The document counter is advanced first.  Then, for each annotation
   * of the configured input types in the configured annotation set, the
   * corresponding term's frequency is incremented by one and the
   * document's source-or-name string is added to that term's set of
   * documents.
   *
   * @param document the document to take candidate annotations from
   */
  protected void addData(Document document) {
    documentCount += 1;
    String sourceId = Utilities.sourceOrName(document);
    AnnotationSet inputSet = document.getAnnotations(inputASName);
    AnnotationSet termAnnotations = inputSet.get(inputAnnotationTypes);

    for (Annotation termAnnotation : termAnnotations) {
      Term term = makeTerm(termAnnotation, document);
      incrementTermFreq(term, 1);

      // Make sure the term has a document set, then record this document.
      if (!termDocuments.containsKey(term)) {
        termDocuments.put(term, new HashSet<String>());
      }
      termDocuments.get(term).add(sourceId);
    }
  }


  /**
   * Computes a score for every term that has a frequency entry and
   * rebuilds the list of terms sorted by descending score.
   *
   * <p>Each score is the selected TF function of the term frequency
   * multiplied by the selected IDF function of the term's document
   * frequency and the total document count.  The unmodified product goes
   * into {@code rawTermScores}; the value returned by
   * {@code Utilities.normalizeScore} goes into {@code termScores}, which
   * is also what the ordering is based on.
   */
  protected void calculateScores() {
    for (Term term : termFrequencies.keySet()) {
      int termFrequency = termFrequencies.get(term);
      // document frequency = number of distinct sources seen for the term
      int documentFrequency = termDocuments.get(term).size();

      double tfPart = TfCalculation.calculate(tfCalculation, termFrequency);
      double idfPart = IdfCalculation.calculate(idfCalculation,
              documentFrequency, documentCount);
      double rawScore = tfPart * idfPart;

      rawTermScores.put(term, rawScore);
      termScores.put(term, Utilities.normalizeScore(rawScore));
    }

    termsByDescendingScore = new ArrayList<Term>(termScores.keySet());
    TermComparatorByDescendingScore byScore =
            new TermComparatorByDescendingScore(termScores);
    Collections.sort(termsByDescendingScore, byScore);

    if (debugMode) {
      System.out.println("Termbank: nbr of terms = " + termsByDescendingScore.size());
    }
  }


  /**
   * Replaces every statistics container with a fresh, empty one and sets
   * the document counter back to zero.
   */
  protected void resetScores() {
    documentCount = 0;

    // per-term maps
    termFrequencies = new HashMap<Term, Integer>();
    docFrequencies  = new HashMap<Term, Integer>();
    termDocuments   = new HashMap<Term, Set<String>>();
    rawTermScores   = new HashMap<Term, Double>();
    termScores      = new HashMap<Term, Double>();

    // sorted views
    termsByDescendingScore        = new ArrayList<Term>();
    termsByDescendingFrequency    = new ArrayList<Term>();
    termsByDescendingDocFrequency = new ArrayList<Term>();
  }



  /***** CREOLE PARAMETERS *****/

  /**
   * Selects the function applied to the term frequency when scoring.
   *
   * @param mode the term-frequency calculation to use
   */
  @CreoleParameter(comment = "term frequency calculation",
          defaultValue = "Logarithmic")
  public void setTfCalculation(TfCalculation mode) {
    tfCalculation = mode;
  }

  /**
   * @return the term-frequency calculation currently selected
   */
  public TfCalculation getTfCalculation() {
    return tfCalculation;
  }


  /**
   * Selects the function applied to the document frequency when scoring.
   *
   * @param mode the document-frequency calculation to use
   */
  @CreoleParameter(comment = "inverted document frequency calculation",
          defaultValue = "Logarithmic")
  public void setIdfCalculation(IdfCalculation mode) {
    idfCalculation = mode;
  }

  /**
   * @return the document-frequency calculation currently selected
   */
  public IdfCalculation getIdfCalculation() {
    return idfCalculation;
  }


  /**
   * Delegates to the superclass setter; redeclared here only so that the
   * CREOLE default value for this termbank is {@code "tfIdf"}.
   *
   * @param name the score property name, passed on to the superclass
   */
  @CreoleParameter(defaultValue = "tfIdf")
  public void setScoreProperty(String name) {
    super.setScoreProperty(name);
  }

}