// (Web source-viewer residue: "Log in" / "Help" / "Print" page controls.)
// Home / releases / gate-8.0-build4825-ALL / plugins / TermRaider / src / gate / termraider / bank 〉 DocumentFrequencyBank.java
 
/*
 *  Copyright (c) 2008-2014, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  $Id: DocumentFrequencyBank.java 17968 2014-05-11 16:37:34Z ian_roberts $
 */
package gate.termraider.bank;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Corpus;
import gate.Document;
import gate.Resource;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.gui.ActionsPublisher;
import gate.termraider.gui.ActionSaveCsv;
import gate.termraider.util.ScoreType;
import gate.termraider.util.Term;
import gate.termraider.util.Utilities;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.swing.Action;
import org.apache.commons.lang.StringEscapeUtils;


@CreoleResource(name = "DocumentFrequencyBank",
icon = "termbank-lr.png",
comment = "Document frequency counter derived from corpora and other DFBs",
helpURL = "http://gate.ac.uk/userguide/sec:creole:termraider:docfrequency")
public class DocumentFrequencyBank extends AbstractTermbank
implements ActionsPublisher{
  
  private static final long serialVersionUID = 8486379203429800194L;

  // Note: corpora parameter inherited from AbstractBank
  private Set<DocumentFrequencyBank> inputBanks;
  
  private Map<String, Set<Term>> stringLookupTable;

  // transient to allow serialization
  protected transient List<Action> actionsList;


  public Resource init() throws ResourceInstantiationException {
    prepare();
    initializeScoreTypes();
    resetScores();
    processInputBanks();
    processCorpora();
    calculateScores();
    return this;
  }
  

  public void cleanup() {
    super.cleanup();
  }
  
  
  
  protected void prepare() throws ResourceInstantiationException {
    if (corpora == null) {
      corpora = new HashSet<Corpus>();
    }
    if (inputBanks == null) {
      inputBanks = new HashSet<DocumentFrequencyBank>();
    }
  }

  
  protected void resetScores() {
    scores = new HashMap<ScoreType, Map<Term,Number>>();
    for (ScoreType st : scoreTypes) {
      scores.put(st, new HashMap<Term, Number>());
    }
    
    documentCount = 0;
    languages = new HashSet<String>();
    types = new HashSet<String>();
    stringLookupTable = new HashMap<String, Set<Term>>();
    termDocuments = new HashMap<Term, Set<String>>();
  }

  
  protected void createActions() {
    actionsList = new ArrayList<Action>();
    actionsList.add(new ActionSaveCsv("Save as CSV...", this));
  }
  
  
  protected void processCorpora() {
    for (Corpus corpus : corpora) {
      processCorpus(corpus);
      if (debugMode) {
        System.out.println("Termbank: added corpus " + corpus.getName() + " with " + corpus.size() + " documents");
      }
    }
  }
  
  
  protected void processInputBanks() {
    for (DocumentFrequencyBank bank : inputBanks) {
      this.documentCount += bank.documentCount;
      for (Term term : bank.getTerms()) {
        Utilities.incrementMap(getDefaultScores(), term, bank.getFrequencyStrict(term));
      }
    }
  }
  
  
  protected void processDocument(Document document) {
    documentCount++;
    String documentSource = Utilities.sourceOrName(document);
    AnnotationSet candidates = document.getAnnotations(inputASName).get(inputAnnotationTypes);

    Set<Term> documentTerms = new HashSet<Term>();
    for (Annotation candidate : candidates) {
      documentTerms.add(makeTerm(candidate, document));
    }
    
    for (Term term : documentTerms) {
      Utilities.addToMapSet(termDocuments, term, documentSource);
    }
  }

  
  protected void calculateScores() {
    for (Term term : termDocuments.keySet()) {
      this.types.add(term.getType());
      this.languages.add(term.getLanguageCode());
      int df = termDocuments.get(term).size();
      Utilities.setScoreTermValue(scores, getDefaultScoreType(), term, df);
      storeStringLookup(term);
    }

    if (debugMode) {
      System.out.println("Termbank: nbr of terms = " + this.getTerms().size());
    }
  }
  
  
  public int getFrequencyStrict(Term term) {
    if (getDefaultScores().containsKey(term)) {
      return getDefaultScores().get(term).intValue();
    }
    
    return 0;
  }
  
  
  public int getFrequencyLax(Term term) {
    // Try for an exact match first
    if (getDefaultScores().containsKey(term)) {
      return getDefaultScores().get(term).intValue();
    }
    
    // Now see if there's one with a blank language code
    String termString = term.getTermString();
    if (stringLookupTable.containsKey(termString)) {
      for (Term testTerm : stringLookupTable.get(termString)) {
        if (testTerm.closeMatch(term)) {
          return getDefaultScores().get(testTerm).intValue();
        }
      }
    }
    
    return 0;
  }
  
  
  @CreoleParameter(comment = "Other DFBs to compile into the new one")
  public void setInputBanks(Set<DocumentFrequencyBank> inputBanks) {
    this.inputBanks = inputBanks;
  }
  
  public Set<DocumentFrequencyBank> getInputBanks() {
    return this.inputBanks;
  }


  @Override
  public List<Action> getActions() {
    // lazy instantiation because actionsList is transient
    if (actionsList == null) {
      createActions();
    }
    
    return this.actionsList;
  }


  private void storeStringLookup(Term term) {
    String termString = term.getTermString();
    Set<Term> terms;
    if (stringLookupTable.containsKey(termString)) {
      terms = stringLookupTable.get(termString);
    }
    else {
      terms = new HashSet<Term>();
    }
    terms.add(term);
    stringLookupTable.put(termString, terms);
  }
  

  protected void initializeScoreTypes() {
    // Whatever this is called, it must be the reference
    // document frequency, so we will only need
    // to use getDefaultScoreType() later
    this.scoreTypes = new ArrayList<ScoreType>();
    this.scoreTypes.add(new ScoreType(scoreProperty));
  }

  
  @CreoleParameter(comment = "name of main score",
          defaultValue = "documentFrequency")
  public void setScoreProperty(String name) {
    this.scoreProperty = name;
  }

  @Override
  public Map<String, String> getMiscDataForGui() {
    Map<String, String> result = new HashMap<String, String>();
    result.put("nbr of documents", String.valueOf(this.documentCount));
    result.put("nbr of terms", String.valueOf(this.getDefaultScores().size()));
    result.put("nbr of distinct term strings", String.valueOf(this.stringLookupTable.size()));
    return result;
  }


  public String getCsvSubheader() {
    StringBuilder sb = new StringBuilder();
    sb.append('\n');
    sb.append(',').append(StringEscapeUtils.escapeCsv("_TOTAL_DOCS_"));
    sb.append(',').append(StringEscapeUtils.escapeCsv(""));
    sb.append(',').append(StringEscapeUtils.escapeCsv(""));
    sb.append(',').append(StringEscapeUtils.escapeCsv(Integer.toString(this.getDocumentCount())));
    return sb.toString();
  }
}