GATE.ac.uk - releases/gate-8.4-build5748-ALL/plugins/TermRaider/src/gate/termraider/bank/PMIBank.java

/*
 *  Copyright (c) 2008--2014, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  $Id: PMIBank.java 18970 2015-10-26 14:46:14Z adamfunk $
 */
package gate.termraider.bank;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.gui.ActionsPublisher;
import gate.termraider.util.Term;
import gate.termraider.util.UnorderedTermPair;
import gate.termraider.util.Utilities;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;



@CreoleResource(name = "PMI Bank",
        icon = "termbank-lr.png",
        comment = "Pointwise Mutual Information from corpora",
        helpURL = "http://gate.ac.uk/userguide/sec:creole:termraider:pmi")

public class PMIBank extends AbstractPairbank
    implements ActionsPublisher  {

  private static final long serialVersionUID = -5727540527631075420L;

  /* EXTRA CREOLE PARAMETERS */
  protected String outerAnnotationType;
  protected Set<String> innerAnnotationTypes;
  protected int outerAnnotationWindow;
  protected boolean requireTypeDifference;
  protected boolean allowOverlaps;

  
  private int totalCount, totalPairCount;
  private Set<Set<Integer>> seenCombinations;
  private Set<Integer> seen;
  private Map<Term, Set<String>> termDocuments;
  protected Map<Term, Integer> termCount;

  
  
  protected void addData(Document document, int index) {
    // TODO: add support for the doc ID feature
    String documentSource = Utilities.docIdentifier(document, null, index);
    /** Collocations that have already been processed in this document
     * (each collocation is a pair of IDs for a Token annotation), to avoid counting
     * them again.     */
    seenCombinations = new HashSet<Set<Integer>>();
    
    /** IDs of annotations (Tokens) that have already been processed in this document, to avoid
     * counting them again.     */
    seen = new HashSet<Integer>();
    
    AnnotationSet inputAS = document.getAnnotations(inputASName);
    AnnotationSet innerAnnotations = inputAS.get(innerAnnotationTypes);
    List<Annotation> sentences = gate.Utils.inDocumentOrder(inputAS.get(outerAnnotationType));
    
    if (sentences.isEmpty()) {
      return;
    }
    
    if (sentences.size() <= outerAnnotationWindow) {
      processWindow(sentences, innerAnnotations, document, documentSource);
      return;
    }
    
    List<Annotation> window = new ArrayList<Annotation>();
    while (window.size() < outerAnnotationWindow) {
      window.add(sentences.remove(0));
    }
    
    while (! sentences.isEmpty()) {
      processWindow(window, innerAnnotations, document, documentSource);
      window.remove(0);
      window.add(sentences.remove(0));
    }
    processWindow(window, innerAnnotations, document, documentSource);
  }
  
  
  /** Scan a window of sentences and process each pair of terms in the window.
   * Skip any pair of instances that has already been processed (e.g., both terms are in
   * the same sentence and were processed already when that sentence was added
   * to the end of the window.  
   * @param window
   * @param inners (already restricted by types)
   * @param document
   * @param source
   */
  private void processWindow(List<Annotation> window, AnnotationSet inners, Document document, String source) {
    Long start = window.get(0).getStartNode().getOffset();
    Long end = window.get(window.size() - 1).getEndNode().getOffset();
    List<Annotation> terms = gate.Utils.inDocumentOrder(inners.getContained(start, end));
    for (int i = 0 ; i < (terms.size() - 1) ; i++)  {
      Annotation termI = terms.get(i);
      for (int j = i+1 ; j < terms.size() ; j++)  {
        Annotation termJ = terms.get(j);
        // compatibleTerms checks for overlaps too
        if ( combinationUnseen(termI, termJ) && compatibleTerms(termI, termJ) ) {
          processTerms(termI, termJ, document, source);
        }
      }
    }
  }

  
  /**
   * For a pair of annotations (terms), increment the term count if this instance (annotation)
   * of the term hasn't been counted already.  This method is called only if this
   * pair-instance hasn't been counted already.
   * @param ann0
   * @param ann1
   * @param document
   * @param source
   */
  private void processTerms(Annotation ann0, Annotation ann1, Document document, String source) {
    Term term0 = makeTerm(ann0, document);
    Term term1 = makeTerm(ann1, document);
    UnorderedTermPair pair = new UnorderedTermPair(term0, term1);
    
    if (unseen(ann0)) {
      incrementTermCount(term0, source);
      totalCount++;
    }
    
    if (unseen(ann1)) {
      incrementTermCount(term1, source);
      totalCount++;
    }
    
    incrementPairCount(pair, source);
    totalPairCount++;
  }
  
  
  private void incrementTermCount(Term term, String source) {
    int count;
    Set<String> sources;
    if (termCount.containsKey(term)) {
      count = termCount.get(term);
      sources = termDocuments.get(term);
    }
    else {
      count = 0;
      sources = new HashSet<String>();
    }
    count++;
    sources.add(source);
    termCount.put(term, count);
    termDocuments.put(term, sources);
  }

  
  private void incrementPairCount(UnorderedTermPair pair, String source) {
    int count;
    Set<String> sources;
    if (pairCount.containsKey(pair)) {
      count = pairCount.get(pair);
      sources = documents.get(pair);
    }
    else {
      count = 0;
      sources = new HashSet<String>();
    }
    count++;
    sources.add(source);
    pairCount.put(pair, count);
    documents.put(pair, sources);
  }
  
  
  private boolean combinationUnseen(Annotation a0, Annotation a1) {
    Set<Integer> combo = new HashSet<Integer>();
    combo.add(a0.getId());
    combo.add(a1.getId());
    if (seenCombinations.contains(combo)) {
      return false;
    }
    // implied else
    seenCombinations.add(combo);
    return true;
  }
  
  
  /** Check whether two annotations are compatible as collocations,
   * depending on requireTypeDifference and their types
   * and on allowOverlaps and their spans.
   * @param a0
   * @param a1
   * @return
   */
  
  private boolean compatibleTerms(Annotation a0, Annotation a1) {
    if (this.requireTypeDifference && a0.getType().equals(a1.getType())) {
      return false;
    }
    
    // implied else: types are compatible
    if (this.allowOverlaps) {
      return true;
    }

    // implied else: types are compatible but must check overlaps
    return ! a0.overlaps(a1);
  }
  
  
  
  private boolean unseen(Annotation anno) {
    if (seen.contains(anno.getId())) {
      return false;
    }
    // implied else
    seen.add(anno.getId());
    return true;
  }
  
  
  public int getTotalCount() {
    return this.totalCount;
  }
  
  public int getTotalPairCount() {
    return this.totalPairCount;
  }
  
  public int getNbrDistinctTerms() {
    return this.termCount.size();
  }
  
  public int getNbrDistinctPairs() {
    return this.pairCount.size();
  }
  
  public Set<Term> getTerms() {
    return this.termCount.keySet();
  }
  
  public int getDocumentCount(Term term) {
    if (this.termDocuments.containsKey(term)) {
      return this.termDocuments.get(term).size();
    }
    
    return 0;
  }

  public int getTermCount(Term term) {
    if (this.termCount.containsKey(term)) {
      return this.termCount.get(term);
    }
    // implied else
    return 0;
  }
  


  
  
  protected void resetImplScores() {
    termCount = new HashMap<Term, Integer>();
    termDocuments = new HashMap<Term, Set<String>>();
    totalCount = 0;
    totalPairCount = 0;
  }



  public void calculateScores() {
    double totalCountF = (double) totalCount;
    double totalPairCountF = (double) totalPairCount;
    Map<Term, Double> termProb = new HashMap<Term, Double>();
    for (Term term : termCount.keySet()) {
      double prob = ((double) termCount.get(term)) / totalCountF;
      termProb.put(term, prob);
    }
    
    for (UnorderedTermPair pair : this.pairCount.keySet()) {
      double px = termProb.get(pair.getTerm0());
      double py = termProb.get(pair.getTerm1());
      double pxy = ((double) pairCount.get(pair)) / totalPairCountF;
      
      /*  Notes
       * 
       * PMI not normalized (source?):
       * 
       * pmi(x,y) = log_2 ( P(x,y) / (P(x) * P(y)) )
       * 
       * normalized: -1 = never; 0 = independent; +1 = always coöccurring
       * 
       * npmi(x,y) = pmi(x,y) / (- log(P(x,y)))
       * 
       * npmi(x,y) = log(P(x)P(y)) / log(P(x,y)) - 1 
       * 
       */
      
      double npmi = Utilities.log2(px * py) / Utilities.log2(pxy) - 1;
      this.scores.put(pair, npmi * 100.0);
    }

    if (debugMode) {
      System.out.println("Pairbank: nbr of terms = " + termCount.keySet().size());
      System.out.println("Pairbank: nbr of pairs = " + pairCount.keySet().size());
    }
  }

  
  /***** CREOLE PARAMETERS *****/

  @CreoleParameter(comment = "annotation types to evaluate as terms",
          defaultValue = "Entity")
  public void setInnerAnnotationTypes(Set<String> types) {
    this.innerAnnotationTypes = types;
  }
  
  public Set<String> getInnerAnnotationTypes() {
    return this.innerAnnotationTypes;
  }


  @CreoleParameter(comment = "annotation type for scanning window",
          defaultValue = "Sentence")
  public void setOuterAnnotationType(String type) {
    this.outerAnnotationType = type;
  }
  
  public String getOuterAnnotationType() {
    return this.outerAnnotationType;
  }

  
  @CreoleParameter(comment = "window size in outer annotations",
          defaultValue = "2")
  public void setOuterAnnotationWindow(Integer w) {
    this.outerAnnotationWindow = w;
  }
  
  public Integer getOuterAnnotationWindow() {
    return this.outerAnnotationWindow;
  }

  
  @CreoleParameter(comment = "require each collocation pair to consist of different types",
          defaultValue = "false")
  public void setRequireTypeDifference(Boolean rtd) {
    this.requireTypeDifference = rtd;
  }
  
  public Boolean getRequireTypeDifference() {
    return this.requireTypeDifference;
  }

  
  @CreoleParameter(comment = "allow a collocation pair to consist of overlapping annotations",
          defaultValue = "false")
  public void setAllowOverlapCollocations(Boolean aoc) {
    this.allowOverlaps = aoc;
  }
  
  public Boolean getAllowOverlapCollocations() {
    return this.allowOverlaps;
  }
  
  
  /* override default value from AbstractPairbank   */
  @CreoleParameter(defaultValue = "pmiScore")
  public void setScoreProperty(String name) {
    super.setScoreProperty(name);
  }


}