// Source: GATE release gate-8.0-build4825-ALL, plugins/TermRaider/src/gate/termraider/bank/AnnotationTermbank.java
 
/*
 *  Copyright (c) 2008--2014, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  $Id: AnnotationTermbank.java 17968 2014-05-11 16:37:34Z ian_roberts $
 */
package gate.termraider.bank;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.FeatureMap;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.gui.ActionsPublisher;
import gate.termraider.modes.MergingMode;
import gate.termraider.modes.Normalization;
import gate.termraider.util.ScoreType;
import gate.termraider.util.Term;
import gate.termraider.util.Utilities;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;


/**
 * Termbank whose scores are derived from scored annotations in documents.
 * <p>
 * While documents are processed, the value of the configured input score
 * feature is collected for every candidate annotation and grouped by term.
 * {@link #calculateScores()} then merges the collected values of each term
 * with the configured {@link MergingMode}, normalizes the merged value with
 * the configured {@link Normalization}, and stores the raw score, the
 * normalized score and the local document frequency.
 */
@CreoleResource(name = "AnnotationTermbank",
    icon = "termbank-lr.png",
    comment = "TermRaider Termbank derived from document annotations",
    helpURL = "http://gate.ac.uk/userguide/sec:creole:termraider:annotation")
public class AnnotationTermbank extends AbstractTermbank
    implements ActionsPublisher  {

  private static final long serialVersionUID = -775090785732060049L;
  
  /* EXTRA CREOLE PARAMETERS */

  /** Name of the annotation feature from which the score is read. */
  protected String inputScoreFeature;
  /** How the individual scores of one term are merged into a raw score. */
  private MergingMode mergingMode;
  /** How the merged raw score is normalized into the default score. */
  private Normalization normalization;

  /* EXTRA DATA FOR ANALYSIS */

  /** Every individual score collected for a term, in order of collection. */
  private Map<Term, List<Double>>  termIndividualScores;
  /** Score types registered by {@link #initializeScoreTypes()}. */
  private ScoreType rawScoreST, termFrequencyST, localDocFrequencyST;
  
  
  /**
   * Collects the scores carried by the candidate annotations of one document.
   * <p>
   * Annotations without the input score feature are ignored. Annotations
   * whose feature value is {@code null} or not a {@link Number} are ignored
   * as well, so that a single malformed annotation neither aborts processing
   * nor leaves a term frequency counted without a matching score.
   *
   * @param document the document whose annotations are read
   */
  protected void processDocument(Document document) {
    documentCount++;
    String documentSource = Utilities.sourceOrName(document);
    AnnotationSet candidates = document.getAnnotations(inputASName).get(inputAnnotationTypes);

    for (Annotation candidate : candidates) {
      Term term = makeTerm(candidate, document);
      FeatureMap fm = candidate.getFeatures();
      if (! fm.containsKey(inputScoreFeature)) {
        continue;
      }

      // Validate before touching any shared state: feature values are
      // arbitrary objects, so the value may be null or non-numeric.
      Object rawValue = fm.get(inputScoreFeature);
      if (! (rawValue instanceof Number)) {
        if (debugMode) {
          System.out.println("Termbank: ignoring non-numeric value '" + rawValue
              + "' of feature " + inputScoreFeature + " in " + documentSource);
        }
        continue;
      }
      double score = ((Number) rawValue).doubleValue();

      Utilities.incrementScoreTermValue(scores, termFrequencyST, term, 1);
      Utilities.addToMapSet(termDocuments, term, documentSource);

      // Single lookup; the list is created the first time a term is seen.
      List<Double> scoreList = termIndividualScores.get(term);
      if (scoreList == null) {
        scoreList = new ArrayList<Double>();
        termIndividualScores.put(term, scoreList);
      }
      scoreList.add(score);
    }
  }


  /**
   * Computes the final scores for every term recorded in the term-to-documents
   * map: the merged raw score, the local document frequency (number of
   * distinct document sources recorded for the term) and the normalized score,
   * which is stored under the default score type. The language code and type
   * of each term are also recorded.
   */
  public void calculateScores() {
    for (Term term : termDocuments.keySet()) {
      languages.add(term.getLanguageCode());
      types.add(term.getType());
      
      // Terms are added to termDocuments and termIndividualScores together in
      // processDocument, so every term iterated here has a score list.
      Double rawScore = MergingMode.calculate(mergingMode, termIndividualScores.get(term));
      Utilities.setScoreTermValue(scores, rawScoreST, term, rawScore);
      int localDF = termDocuments.get(term).size();
      Utilities.setScoreTermValue(scores, localDocFrequencyST, term, localDF);
      double normalized = Normalization.calculate(normalization, rawScore);
      Utilities.setScoreTermValue(scores, getDefaultScoreType(), term, normalized);
    }
    
    if (debugMode) {
      System.out.println("Termbank: nbr of terms = " + termDocuments.size());
    }
  }

  
  /**
   * Discards all collected data: creates an empty score map for every
   * registered score type and empties the per-term score lists, the
   * term-to-documents map, and the sets of languages and types.
   */
  protected void resetScores() {
    scores = new HashMap<ScoreType, Map<Term,Number>>();
    for (ScoreType st : scoreTypes) {
      scores.put(st, new HashMap<Term, Number>());
    }
    termIndividualScores = new HashMap<Term, List<Double>>();
    termDocuments        = new HashMap<Term, Set<String>>();
    languages = new HashSet<String>();
    types = new HashSet<String>();
  }

  
  /**
   * Registers the score types of this termbank, in this order: the score
   * named by the score property, its raw variant (score property plus
   * {@link AbstractTermbank#RAW_SUFFIX}), "termFrequency" and
   * "localDocFrequency".
   */
  protected void initializeScoreTypes() {
    this.scoreTypes = new ArrayList<ScoreType>();
    this.scoreTypes.add(new ScoreType(scoreProperty));
    this.rawScoreST = new ScoreType(scoreProperty + AbstractTermbank.RAW_SUFFIX);
    this.scoreTypes.add(rawScoreST);
    this.termFrequencyST = new ScoreType("termFrequency");
    this.scoreTypes.add(termFrequencyST);
    this.localDocFrequencyST = new ScoreType("localDocFrequency");
    this.scoreTypes.add(localDocFrequencyST);
  }

  
  /***** CREOLE PARAMETERS *****/

  /**
   * Sets the name of the annotation feature that holds the score to index.
   *
   * @param annScoreFeature the feature name
   */
  @CreoleParameter(comment = "annotation feature containing the score to index",
          defaultValue = "localAugTfIdf")
  public void setInputScoreFeature(String annScoreFeature) {
    this.inputScoreFeature = annScoreFeature;    
  }
  
  /** @return the name of the annotation feature that holds the score */
  public String getInputScoreFeature() {
    return this.inputScoreFeature;
  }
  
  /**
   * Sets the method used to aggregate the individual scores of a term.
   *
   * @param mode the merging mode
   */
  @CreoleParameter(comment = "method for aggregating local scores",
          defaultValue = "MAXIMUM")
  public void setMergingMode(MergingMode mode) {
    this.mergingMode = mode;
  }
  
  /** @return the method used to aggregate the individual scores of a term */
  public MergingMode getMergingMode() {
    return this.mergingMode;
  }
  
  /**
   * Sets the normalization applied to the merged raw score.
   *
   * @param mode the normalization
   */
  @CreoleParameter(comment = "score normalization",
          defaultValue = "Sigmoid")
  public void setNormalization(Normalization mode) {
    this.normalization = mode;
  }
  
  /** @return the normalization applied to the merged raw score */
  public Normalization getNormalization() {
    return this.normalization;
  }
  
  /* override default value from AbstractTermbank   */
  @Override
  @CreoleParameter(defaultValue = "tfIdfAug")
  public void setScoreProperty(String name) {
    super.setScoreProperty(name);
  }


  /**
   * Supplies summary figures for display: the number of documents processed
   * and the number of terms that have a default score.
   *
   * @return a new map from label to value, both as strings
   */
  @Override
  public Map<String, String> getMiscDataForGui() {
    Map<String, String> result = new HashMap<String, String>();
    result.put("nbr of local documents", String.valueOf(this.documentCount));
    result.put("nbr of terms", String.valueOf(this.getDefaultScores().size()));
    return result;
  }

}