// Log in Help
// Print
// Home > releases > gate-7.1-build4485-ALL > plugins > TermRaider > src > gate > termraider > bank > AbstractTermbank.java
 
/*
 *  Copyright (c) 2008--2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  $Id: AbstractTermbank.java 16337 2012-11-27 14:42:40Z adamfunk $
 */
package gate.termraider.bank;

import gate.creole.*;
import gate.creole.metadata.*;
import gate.gui.ActionsPublisher;
import gate.util.*;
import gate.*;
import java.io.*;
import java.util.*;

import gate.termraider.output.*;
import gate.termraider.util.*;
import gate.termraider.gui.*;
import javax.swing.Action;



public abstract class AbstractTermbank extends AbstractBank 
    implements ActionsPublisher  {
  private static final long serialVersionUID = -2809051430169834059L;
  
  // CREOLE init parameters
  protected boolean debugMode;
  protected String inputASName;
  protected Set<String> inputAnnotationTypes;

  // transient to allow serialization
  protected transient List<Action> actionsList;
  
  protected Map<Term, Set<String>>  termDocuments;
  protected Map<Term, Double>       termScores;
  protected Map<Term, Double>       rawTermScores;
  protected List<Term> termsByDescendingScore, termsByDescendingFrequency,
    termsByDescendingDocFrequency;
  protected Map<Term, Integer>      termFrequencies, docFrequencies;

  public static final String freqProperty = "frequency";


  public Resource init() throws ResourceInstantiationException {
    prepare();
    resetScores();
    processCorpora();
    scanTypesLanguagesDocFreq();
    calculateScores();
    return this;
  }
  

  public void cleanup() {
    super.cleanup();
  }
  
  
  public List<Term> getTermsByDescendingScore() {
    return this.termsByDescendingScore;
  }
  
  public List<Term> getTermsByDescendingFrequency() {
    return this.termsByDescendingFrequency;
  }
  
  public List<Term> getTermsByDescendingDocFrequency() {
    return this.termsByDescendingDocFrequency;
  }
  
  public Map<Term, Double> getTermScores() {
    return this.termScores;
  }
  
  public Map<Term, Set<String>> getTermDocuments() {
    return this.termDocuments;
  }
  
  
  public Set<String> getDocumentsForTerm(Term term) {
    if (this.termDocuments.containsKey(term)) {
      return this.termDocuments.get(term);
    }
    
    // implied else: empty set
    return new HashSet<String>();
  }
  
  
  public Map<Term, Integer> getTermFrequencies() {
    return this.termFrequencies;
  }
  
  public Map<Term, Integer> getDocFrequencies() {
    return this.docFrequencies;
  }
  
  public String getFreqProperty() {
    return freqProperty;
  }

  public Double getMinScore() {
    if (this.termScores.isEmpty()) {
      return 1.0;
    }
    // implied else
    return Collections.min(this.termScores.values());
  }
  
  public Double getMaxScore() {
    if (this.termScores.isEmpty()) {
      return 1.0;
    }
    // implied else
    return Collections.max(this.termScores.values());
  }
  
  
  protected void prepare() throws ResourceInstantiationException {
    if ( (corpora == null) || (corpora.size() == 0) ) {
      throw new ResourceInstantiationException("No corpora given");
    }
  }
  
  protected void createActions() {
    actionsList = new ArrayList<Action>();
    actionsList.add(new ActionSaveCsv("Save as CSV...", this));
  }
  
  
  protected void processCorpora() {
    for (Corpus corpus : corpora) {
      processCorpus(corpus);
      if (debugMode) {
        System.out.println("Termbank: added corpus " + corpus.getName() + " with " + corpus.size() + " documents");
      }
    }
  }
  
  
  protected void processCorpus(Corpus corpus) {
    for (int i=0 ; i < corpus.size() ; i++) {
      boolean wasLoaded = corpus.isDocumentLoaded(i);
      Document document = (Document) corpus.get(i);
      
      addData(document);

      // datastore safety
      if (! wasLoaded) {
        corpus.unloadDocument(document);
        Factory.deleteResource(document);
      }
    }
  }
  
  
  private void scanTypesLanguagesDocFreq() {
    this.types = new TreeSet<String>();
    this.languages = new TreeSet<String>();
    for (Term term : this.termFrequencies.keySet()) {
      this.languages.add(term.getLanguageCode());
      this.types.add(term.getType());
      this.docFrequencies.put(term, termDocuments.get(term).size());
    }
  }
  

  /* BEHOLD THE GUBBINS to distinguish the various types of Termbanks*/

  /**
   * This method needs to call incrementTermFreq(...)!
   */
  protected abstract void addData(Document document);
  
  protected abstract void calculateScores(); 
  
  protected abstract void resetScores();
  

  
  
  protected int incrementTermFreq(Term term, int increment) {
    return incrementMap(termFrequencies, term, increment);
  }
  
  
  protected int incrementMap(Map<Term, Integer> map, Term key, int increment) {
    int count = 0;
    if (map.containsKey(key)) {
      count = map.get(key).intValue();
    }
    count += increment;
    map.put(key, Integer.valueOf(count));
    return count;
  }
  
  
  public Double getScore(Term term) {
    if (termScores.containsKey(term)) {
      return termScores.get(term).doubleValue();
    }
    
    // error code
    return null;
  }

  
  public Double getRawScore(Term term) {
    if (rawTermScores.containsKey(term)) {
      return rawTermScores.get(term).doubleValue();
    }
    
    // error code
    return null;
  }

  
  
  /* Methods for saving as CSV */
  
  public void saveAsCsv(double threshold, File outputFile) throws GateException {
    CsvGenerator generator = new CsvGenerator();
    generator.generateAndSaveCsv(this, threshold, outputFile);
  }

  /**
   * Convenience method to save everything in the termbank.
   * @param outputFile
   * @throws GateException
   */
  public void saveAsCsv(File outputFile) throws GateException {
    double threshold = this.getMinScore();
    CsvGenerator generator = new CsvGenerator();
    generator.generateAndSaveCsv(this, threshold, outputFile);
  }
  
  
  @Override
  public List<Action> getActions() {
    // lazy instantiation because actionsList is transient
    if (actionsList == null) {
      createActions();
    }
    
    return this.actionsList;
  }

  
  public int getTermFrequency(Term term) {
    if (termFrequencies.containsKey(term)) {
      return termFrequencies.get(term);
    }
    // implied else
    return 0;
  }
  
  
  /***** CREOLE PARAMETERS *****/

  @CreoleParameter(comment = "input AS name",
          defaultValue = "")
  public void setInputASName(String name) {
    this.inputASName = name;
  }
  public String getInputASName() {
    return this.inputASName;
  }
  
  
  @CreoleParameter(comment = "input annotation types",
          defaultValue = "SingleWord;MultiWord")
  public void setInputAnnotationTypes(Set<String> names) {
    this.inputAnnotationTypes = names;
  }
  
  public Set<String> getInputAnnotationTypes() {
    return this.inputAnnotationTypes;
  }
  
  
  @CreoleParameter(comment = "print debugging information during initialization",
          defaultValue = "false")
  public void setDebugMode(Boolean debug) {
    this.debugMode = debug;
  }

  public Boolean getDebugMode() {
    return this.debugMode;
  }
  

}