Log in Help
Print
Home 〉 releases 〉 gate-8.4-build5748-ALL 〉 plugins 〉 TermRaider 〉 src 〉 gate 〉 termraider 〉 bank 〉 TfIdfTermbank.java
 
/*
 *  Copyright (c) 2008-2014, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  $Id: TfIdfTermbank.java 19329 2016-05-19 13:40:29Z adamfunk $
 */
package gate.termraider.bank;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.gui.ActionsPublisher;
import gate.termraider.modes.IdfCalculation;
import gate.termraider.modes.Normalization;
import gate.termraider.modes.TfCalculation;
import gate.termraider.util.ScoreType;
import gate.termraider.util.Term;
import gate.termraider.util.Utilities;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;


/**
 * Termbank whose score for each term is the product of a term-frequency
 * component and an inverse-document-frequency component, subsequently
 * normalized.
 *
 * <p>Term frequencies are counted over the candidate annotations of the
 * corpora given to this termbank; document frequencies are read from a
 * {@link DocumentFrequencyBank}, which is either supplied as a parameter or
 * created from the same corpora in {@link #prepare()}.</p>
 */
@CreoleResource(name = "TfIdfTermbank",
        icon = "termbank-lr.png",
        comment = "TermRaider Termbank derived from vectors in document features",
        helpURL = "http://gate.ac.uk/userguide/sec:creole:termraider:tfidf")
public class TfIdfTermbank extends AbstractTermbank
    implements ActionsPublisher  {

  private static final long serialVersionUID = 2256964300070167978L;
  
  /* EXTRA CREOLE PARAMETERS */
  /** Mode passed to {@code TfCalculation.calculate} for the tf component. */
  private TfCalculation tfCalculation;
  /** Mode passed to {@code IdfCalculation.calculate} for the idf component. */
  private IdfCalculation idfCalculation;
  /** Mode passed to {@code Normalization.calculate} for the final score. */
  private Normalization normalization;
  /** Source of reference document frequencies and reference document count. */
  private DocumentFrequencyBank docFreqSource;
  
  /* EXTRA DATA */
  /** Score types created and registered by {@link #initializeScoreTypes()}. */
  private ScoreType rawScoreST, termFrequencyST, localDocFrequencyST, refDocFrequencyST;
  
  
  /**
   * Counts the document and records every candidate annotation in it.
   *
   * <p>For each annotation of the configured types in the configured
   * annotation set, the term built from it has its {@code termFrequency}
   * score incremented by one and the document identifier is added to that
   * term's entry in {@code termDocuments}.</p>
   *
   * @param document the document to process
   * @param index value handed to {@code Utilities.docIdentifier} together
   *        with the document and {@code idDocumentFeature} to build the
   *        document identifier
   */
  protected void processDocument(Document document, int index) {
    documentCount++;
    String documentSource = Utilities.docIdentifier(document, idDocumentFeature, index);
    AnnotationSet candidates = document.getAnnotations(inputASName).get(inputAnnotationTypes);

    for (Annotation candidate : candidates) {
      Term term = makeTerm(candidate, document);
      Utilities.incrementScoreTermValue(scores, termFrequencyST, term, 1);
      Utilities.addToMapSet(termDocuments, term, documentSource);
    }
  }

  
  /**
   * Builds the list of score types kept by this termbank, in this order:
   * the score named by {@code scoreProperty}, the raw score
   * ({@code scoreProperty} plus {@code AbstractTermbank.RAW_SUFFIX}),
   * {@code termFrequency}, {@code localDocFrequency} and
   * {@code refDocFrequency}.
   */
  protected void initializeScoreTypes() {
    this.scoreTypes = new ArrayList<ScoreType>();
    this.scoreTypes.add(new ScoreType(scoreProperty));
    this.rawScoreST = new ScoreType(scoreProperty + AbstractTermbank.RAW_SUFFIX);
    this.scoreTypes.add(rawScoreST);
    this.termFrequencyST = new ScoreType("termFrequency");
    this.scoreTypes.add(termFrequencyST);
    this.localDocFrequencyST = new ScoreType("localDocFrequency");
    this.scoreTypes.add(localDocFrequencyST);
    this.refDocFrequencyST = new ScoreType("refDocFrequency");
    this.scoreTypes.add(refDocFrequencyST);
  }

  
  /**
   * Derives all remaining scores from the collected term frequencies.
   *
   * <p>For every term that has a term frequency, this records the term's
   * language code and type, stores its reference document frequency (from
   * {@code docFreqSource}) and its local document frequency (number of
   * document identifiers collected for it), then stores the raw score
   * {@code tf-component * idf-component} and its normalized value under the
   * default score type.</p>
   */
  protected void calculateScores() {
    // Both values are the same for every term, so look them up once.
    // NOTE(review): docFreqSource is dereferenced here even when there are no
    // terms; prepare() assigns it when unset and is presumably always run
    // first -- confirm against the superclass.
    Map<Term, Number> termFrequencies = scores.get(termFrequencyST);
    int n = docFreqSource.getDocumentCount();

    // entrySet() yields the frequency with the term, avoiding a second lookup.
    for (Map.Entry<Term, Number> entry : termFrequencies.entrySet()) {
      Term term = entry.getKey();
      this.languages.add(term.getLanguageCode());
      this.types.add(term.getType());
      
      int tf = entry.getValue().intValue();
      int df = docFreqSource.getFrequencyLax(term);
      Utilities.setScoreTermValue(scores, refDocFrequencyST, term, df);
      int localDF = this.termDocuments.get(term).size();
      Utilities.setScoreTermValue(scores, localDocFrequencyST, term, localDF);
      double rawScore = TfCalculation.calculate(tfCalculation, tf) * IdfCalculation.calculate(idfCalculation, df, n);
      Utilities.setScoreTermValue(scores, rawScoreST, term, rawScore);
      double normalized = Normalization.calculate(normalization, rawScore);
      Utilities.setScoreTermValue(scores, getDefaultScoreType(), term, normalized);
    }

    if (debugMode) {
      System.out.println("Termbank: nbr of terms = " + this.getTerms().size());
    }
  }
  
  
  /**
   * Discards all collected data: empties the term-to-documents map, zeroes
   * the document count, creates an empty score map for every registered
   * score type, and empties the sets of seen types and languages.
   */
  protected void resetScores() {
    termDocuments = new HashMap<Term, Set<String>>();
    documentCount = 0;
    scores = new HashMap<ScoreType, Map<Term,Number>>();
    for (ScoreType st : scoreTypes) {
      scores.put(st, new HashMap<Term, Number>());
    }
    types = new HashSet<String>();
    languages = new HashSet<String>();
  }


  /**
   * @return the number of documents processed since the scores were last
   *         reset
   */
  public int getDocCount() {
    return this.documentCount;
  }
  
  /***** CREOLE PARAMETERS *****/
  /**
   * Sets the bank that supplies reference document frequencies. When left
   * unset, {@link #prepare()} creates one from this termbank's corpora.
   *
   * @param dfb the document frequency bank to use, or {@code null}
   */
  @Optional
  @CreoleParameter(comment = "document frequency bank (unset = create from these corpora)")
  public void setDocFreqSource(DocumentFrequencyBank dfb) {
    this.docFreqSource = dfb;
  }
  
  /**
   * @return the bank currently used for reference document frequencies
   */
  public DocumentFrequencyBank getDocFreqSource() {
    return this.docFreqSource;
  }
  
  /**
   * Sets how raw scores are normalized (CREOLE default: {@code Sigmoid}).
   *
   * @param mode the normalization mode
   */
  @CreoleParameter(comment = "score normalization",
          defaultValue = "Sigmoid")
  public void setNormalization(Normalization mode) {
    this.normalization = mode;
  }
  
  /**
   * @return the normalization mode
   */
  public Normalization getNormalization() {
    return this.normalization;
  }
  

  /**
   * Sets how the term-frequency component is calculated (CREOLE default:
   * {@code Logarithmic}).
   *
   * @param mode the term frequency calculation mode
   */
  @CreoleParameter(comment = "term frequency calculation",
          defaultValue = "Logarithmic")
  public void setTfCalculation(TfCalculation mode) {
    this.tfCalculation = mode;
  }
  
  /**
   * @return the term frequency calculation mode
   */
  public TfCalculation getTfCalculation() {
    return this.tfCalculation;
  }
          

          
  /**
   * Sets how the inverse-document-frequency component is calculated (CREOLE
   * default: {@code LogarithmicScaled}).
   *
   * @param mode the inverse document frequency calculation mode
   */
  @CreoleParameter(comment = "inverted document frequency calculation",
          defaultValue = "LogarithmicScaled")
  public void setIdfCalculation(IdfCalculation mode) {
    this.idfCalculation = mode;
  }
  
  /**
   * @return the inverse document frequency calculation mode
   */
  public IdfCalculation getIdfCalculation() {
    return this.idfCalculation;
  }

  
  /* override default value from AbstractTermbank   */
  /**
   * Delegates to the superclass setter; redeclared only so that the CREOLE
   * default value becomes {@code tfIdf}.
   *
   * @param name the name of the score property
   */
  @Override
  @CreoleParameter(defaultValue = "tfIdf")
  public void setScoreProperty(String name) {
    super.setScoreProperty(name);
  }


  /**
   * Validates the corpora parameter and makes sure a document frequency bank
   * is available.
   *
   * <p>If no bank has been set, a new {@link DocumentFrequencyBank} resource
   * is created with this termbank's annotation set name, language feature,
   * annotation types, annotation feature, corpora and debug mode.</p>
   *
   * @throws ResourceInstantiationException if no corpora were given, or if
   *         creating the document frequency bank fails
   */
  protected void prepare() throws ResourceInstantiationException {
    if ( (corpora == null) || (corpora.size() == 0) ) {
      throw new ResourceInstantiationException("No corpora given");
    }
    
    // If no DFB is specified, we create one from the given corpora
    if (this.docFreqSource == null) {
      FeatureMap dfbParameters = Factory.newFeatureMap();
      dfbParameters.put("inputASName", this.inputASName);
      dfbParameters.put("languageFeature", this.languageFeature);
      dfbParameters.put("inputAnnotationTypes", this.inputAnnotationTypes);
      dfbParameters.put("inputAnnotationFeature", this.inputAnnotationFeature);
      dfbParameters.put("corpora", this.corpora);
      dfbParameters.put("debugMode", this.debugMode);

      DocumentFrequencyBank dfb = (DocumentFrequencyBank) Factory.createResource(DocumentFrequencyBank.class.getName(), dfbParameters);
      this.setDocFreqSource(dfb);
    }
  }


  /**
   * Collects summary figures as label/value strings: the number of local
   * documents, the number of reference documents reported by the document
   * frequency bank, and the number of terms with a default score.
   *
   * @return a new map from label to value
   */
  @Override
  public Map<String, String> getMiscDataForGui() {
    Map<String, String> result = new HashMap<String, String>();
    result.put("nbr of local documents", String.valueOf(this.documentCount));
    result.put("nbr of reference documents", String.valueOf(this.docFreqSource.getDocumentCount()));
    result.put("nbr of terms", String.valueOf(this.getDefaultScores().size()));
    return result;
  }

}