// Home > gate > src > gate > util > OntologyMeasures.java
 
/**
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 * Thomas Heitz - 09/06/2010
 *
 *  $Id$
 */

package gate.util;

import gate.Annotation;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.text.NumberFormat;
import java.util.*;

/**
 * Modified version of Precision and Recall called BDM that takes into
 * account the distance of two concepts in an ontology.
 * <p>
 * The BDM scores of concept pairs are read from the file given to
 * {@link #setBdmFile(URL)}. {@link #calculateBdm(Collection)} then pairs
 * missing and spurious annotations that have the same offsets and sums their
 * BDM scores by annotation type. The <code>get...Bdm</code> methods use these
 * sums to compute precision, recall and F-measure values.
 */
public class OntologyMeasures {

  public OntologyMeasures() {
    // empty constructor
  }

  /**
   * Constructor to be used when you have a collection of OntologyMeasures
   * and want to consider it as only one OntologyMeasures.
   * Then you can only use the methods getPrecision/Recall/FMeasure...().
   * <p>
   * For each annotation type the BDM sums of the collection are added
   * together and the annotation differs are combined in one differ.
   * @param measures collection to be regrouped in one OntologyMeasures
   */
  public OntologyMeasures(Collection<OntologyMeasures> measures) {
    Map<String, List<AnnotationDiffer>> differsByTypeMap =
      new HashMap<String, List<AnnotationDiffer>>();
    for (OntologyMeasures measure : measures) {
      // set the bdmByTypeMap to be the sum of those in the collection
      for (Map.Entry<String, Float> entry : measure.bdmByTypeMap.entrySet()) {
        bdmByTypeMap.put(entry.getKey(),
          bdmOf(entry.getKey()) + entry.getValue());
      }
      for (Map.Entry<String, AnnotationDiffer> entry :
             measure.differByTypeMap.entrySet()) {
        addDiffer(differsByTypeMap, entry.getKey(), entry.getValue());
      }
    }
    combineDiffers(differsByTypeMap);
  }

  /**
   * For a document get the annotation differs that contain the type to compare
   * and the annotation differs that may have miscategorized annotations
   * for this type. Then we try to find miscategorized types that are close
   * enough from the main type and use their BDM value to get an augmented
   * precision, recall and fscore.
   * <p>
   * The BDM scores file is read on the first call after
   * {@link #setBdmFile(URL)}. Previous results held by this object are
   * replaced.
   *
   * @param differs annotation differ for the type and for possible
   * miscategorized types.
   */
  public void calculateBdm(Collection<AnnotationDiffer> differs) {

    if (bdmByConceptsMap == null) {
      // load BDM file with scores for each concept/annotation type pair
      bdmByConceptsMap = read(bdmFileUrl);
    }

    // the spurious annotations of ALL the differs form the second set to
    // compare, so a missing annotation of one type can be paired with a
    // spurious annotation of another type
    Set<Annotation> unpairedResponseAnnotations = new HashSet<Annotation>();
    for (AnnotationDiffer differ : differs) {
      unpairedResponseAnnotations.addAll(
        differ.getAnnotationsOfType(AnnotationDiffer.SPURIOUS_TYPE));
    }

    bdmByTypeMap.clear();

    for (AnnotationDiffer differ : differs) {
      // the missing annotations of this differ form the first set to compare
      Set<Annotation> unpairedKeyAnnotations =
        differ.getAnnotationsOfType(AnnotationDiffer.MISSING_TYPE);
      if (!bdmByTypeMap.containsKey(differ.getAnnotationType())) {
        bdmByTypeMap.put(differ.getAnnotationType(), 0f);
      }

      for (Annotation unpairedKeyAnnotation : unpairedKeyAnnotations) {
        String type = unpairedKeyAnnotation.getType();
        Iterator<Annotation> iterator = unpairedResponseAnnotations.iterator();

        // NOTE(review): the loop goes on after a match, so one key annotation
        // can be paired with several coextensive responses - confirm that
        // this is intended.
        while (iterator.hasNext()) {
          Annotation unpairedResponseAnnotation = iterator.next();

          // annotations must have the same start and end offsets
          if (!unpairedKeyAnnotation.coextensive(unpairedResponseAnnotation)) {
            continue;
          }

          float bdm;
          if (differ.getSignificantFeaturesSet() != null) {
            // compare both features values with BDM pairs
            bdm = featuresBdm(differ.getSignificantFeaturesSet(),
              unpairedKeyAnnotation, unpairedResponseAnnotation);
          } else {
            // compare both types with BDM pairs
            bdm = conceptsBdm(type, unpairedResponseAnnotation.getType());
          }

          if (bdm > 0) {
            // bdmOf() gives 0 when the annotation type has no sum yet, which
            // happens when it differs from the type reported by its differ
            bdmByTypeMap.put(type, bdmOf(type) + bdm);
            // a paired response must not be used for another key annotation
            iterator.remove();
          }
        }
      }
    }

    differByTypeMap.clear();
    Map<String, List<AnnotationDiffer>> differsByTypeMap =
      new HashMap<String, List<AnnotationDiffer>>();
    for (AnnotationDiffer differ : differs) {
      // we consider that all annotations in AnnotationDiffer are the same type
      addDiffer(differsByTypeMap, differ.getAnnotationType(), differ);
    }
    combineDiffers(differsByTypeMap);
  }

  /**
   * AP = (sum of BDMs for BDM-matching pair spurious/missing + Correct)
   *    / (Correct + Spurious)
   * @param type annotation type
   * @return strict precision with BDM correction, 1.0 when there is no
   * correct nor spurious annotation
   * @throws NullPointerException if there is no annotation differ for the type
   */
  public double getPrecisionStrictBdm(String type) {
    AnnotationDiffer differ = differByTypeMap.get(type);
    int responses = differ.getCorrectMatches() + differ.getSpurious();
    if (responses == 0) {
      return 1.0;
    }
    return (bdmOf(type) + differ.getCorrectMatches()) / responses;
  }

  /**
   * Gets the mean over all the annotation types of the strict precision
   * with BDM correction.
   * @return a <tt>double</tt> value, NaN when there is no annotation type.
   */
  public double getPrecisionStrictBdm() {
    double result = 0;
    for (String type : differByTypeMap.keySet()) {
      result += getPrecisionStrictBdm(type);
    }
    return result / differByTypeMap.size();
  }

  /**
   * AR = (sum of BDMs for BDM-matching pair spurious/missing + Correct)
   *    / (Correct + Missing)
   * @param type annotation type
   * @return strict recall with BDM correction, 1.0 when there is no
   * correct nor missing annotation
   * @throws NullPointerException if there is no annotation differ for the type
   */
  public double getRecallStrictBdm(String type) {
    AnnotationDiffer differ = differByTypeMap.get(type);
    int keys = differ.getCorrectMatches() + differ.getMissing();
    if (keys == 0) {
      return 1.0;
    }
    return (bdmOf(type) + differ.getCorrectMatches()) / keys;
  }

  /**
   * Gets the mean over all the annotation types of the strict recall
   * with BDM correction.
   * @return a <tt>double</tt> value, NaN when there is no annotation type.
   */
  public double getRecallStrictBdm() {
    double result = 0;
    for (String type : differByTypeMap.keySet()) {
      result += getRecallStrictBdm(type);
    }
    return result / differByTypeMap.size();
  }

  /**
   * Gets the F-Measure of the strict precision and recall with BDM correction.
   * @param type annotation type
   * @param beta The relative weight of precision and recall. A value of 1
   * gives equal weights to precision and recall.
   * @return a <tt>double</tt> value, 0.0 when the formula is undefined.
   */
  public double getFMeasureStrictBdm(String type, double beta) {
    return fMeasure(
      getPrecisionStrictBdm(type), getRecallStrictBdm(type), beta);
  }

  /**
   * Gets the mean over all the annotation types of the strict F-Measure
   * with BDM correction.
   * @param beta The relative weight of precision and recall.
   * @return a <tt>double</tt> value, NaN when there is no annotation type.
   */
  public double getFMeasureStrictBdm(double beta) {
    double result = 0;
    for (String type : differByTypeMap.keySet()) {
      result += getFMeasureStrictBdm(type, beta);
    }
    return result / differByTypeMap.size();
  }

  /**
   * (sum of BDMs + Correct + Partially correct) / (Correct + Spurious)
   * <p>
   * NOTE(review): the denominator does not add the partially correct matches
   * so the value may be greater than 1 unless getSpurious() already counts
   * them - confirm against AnnotationDiffer.
   * @param type annotation type
   * @return lenient precision with BDM correction, 1.0 when there is no
   * correct nor spurious annotation
   * @throws NullPointerException if there is no annotation differ for the type
   */
  public double getPrecisionLenientBdm(String type) {
    AnnotationDiffer differ = differByTypeMap.get(type);
    int responses = differ.getCorrectMatches() + differ.getSpurious();
    if (responses == 0) {
      return 1.0;
    }
    return (bdmOf(type) + differ.getCorrectMatches()
          + differ.getPartiallyCorrectMatches())
         / responses;
  }

  /**
   * Gets the mean over all the annotation types of the lenient precision
   * with BDM correction.
   * @return a <tt>double</tt> value, NaN when there is no annotation type.
   */
  public double getPrecisionLenientBdm() {
    double result = 0;
    for (String type : differByTypeMap.keySet()) {
      result += getPrecisionLenientBdm(type);
    }
    return result / differByTypeMap.size();
  }

  /**
   * (sum of BDMs + Correct + Partially correct) / (Correct + Missing)
   * <p>
   * NOTE(review): the denominator does not add the partially correct matches
   * so the value may be greater than 1 unless getMissing() already counts
   * them - confirm against AnnotationDiffer.
   * @param type annotation type
   * @return lenient recall with BDM correction, 1.0 when there is no
   * correct nor missing annotation
   * @throws NullPointerException if there is no annotation differ for the type
   */
  public double getRecallLenientBdm(String type) {
    AnnotationDiffer differ = differByTypeMap.get(type);
    int keys = differ.getCorrectMatches() + differ.getMissing();
    if (keys == 0) {
      return 1.0;
    }
    return (bdmOf(type) + differ.getCorrectMatches()
          + differ.getPartiallyCorrectMatches())
         / keys;
  }

  /**
   * Gets the mean over all the annotation types of the lenient recall
   * with BDM correction.
   * @return a <tt>double</tt> value, NaN when there is no annotation type.
   */
  public double getRecallLenientBdm() {
    double result = 0;
    for (String type : differByTypeMap.keySet()) {
      result += getRecallLenientBdm(type);
    }
    return result / differByTypeMap.size();
  }

  /**
   * Gets the F-Measure of the lenient precision and recall with BDM
   * correction.
   * @param type annotation type
   * @param beta The relative weight of precision and recall. A value of 1
   * gives equal weights to precision and recall.
   * @return a <tt>double</tt> value, 0.0 when the formula is undefined.
   */
  public double getFMeasureLenientBdm(String type, double beta) {
    return fMeasure(
      getPrecisionLenientBdm(type), getRecallLenientBdm(type), beta);
  }

  /**
   * Gets the mean over all the annotation types of the lenient F-Measure
   * with BDM correction.
   * @param beta The relative weight of precision and recall.
   * @return a <tt>double</tt> value, NaN when there is no annotation type.
   */
  public double getFMeasureLenientBdm(double beta) {
    double result = 0;
    for (String type : differByTypeMap.keySet()) {
      result += getFMeasureLenientBdm(type, beta);
    }
    return result / differByTypeMap.size();
  }

  /**
   * Gets the average of the strict and lenient precision values for a type.
   * @param type annotation type
   * @return a <tt>double</tt> value.
   */
  public double getPrecisionAverageBdm(String type) {
    return (getPrecisionLenientBdm(type) + getPrecisionStrictBdm(type)) / 2.0;
  }

  /**
   * Gets the average of the strict and lenient precision values.
   * @return a <tt>double</tt> value.
   */
  public double getPrecisionAverageBdm() {
    return (getPrecisionLenientBdm() + getPrecisionStrictBdm()) / 2.0;
  }

  /**
   * Gets the average of the strict and lenient recall values for a type.
   * @param type annotation type
   * @return a <tt>double</tt> value.
   */
  public double getRecallAverageBdm(String type) {
    return (getRecallLenientBdm(type) + getRecallStrictBdm(type)) / 2.0;
  }

  /**
   * Gets the average of the strict and lenient recall values.
   * @return a <tt>double</tt> value.
   */
  public double getRecallAverageBdm() {
    return (getRecallLenientBdm() + getRecallStrictBdm()) / 2.0;
  }

  /**
   * Gets the average of strict and lenient F-Measure values for a type.
   * @param type annotation type
   * @param beta The relative weight of precision and recall.
   * @return a <tt>double</tt> value.
   */
  public double getFMeasureAverageBdm(String type, double beta) {
    return (getFMeasureLenientBdm(type, beta)
          + getFMeasureStrictBdm(type, beta))
          / 2.0;
  }

  /**
   * Gets the average of strict and lenient F-Measure values.
   * @param beta The relative weight of precision and recall. A value of 1
   * gives equal weights to precision and recall. A value of 0 takes the recall
   * value completely out of the equation.
   * @return a <tt>double</tt>value.
   */
  public double getFMeasureAverageBdm(double beta) {
    return (getFMeasureLenientBdm(beta) + getFMeasureStrictBdm(beta)) / 2.0;
  }

  /**
   * Sets the file to read the BDM scores from. The scores already loaded are
   * discarded so the file is read by the next call to
   * {@link #calculateBdm(Collection)}.
   * @param url URL of the BDM file
   */
  public void setBdmFile(URL url) {
    bdmFileUrl = url;
    bdmByConceptsMap = null;
  }

  /**
   * Read the BDM scores from a file.
   * <p>
   * The first line is skipped as a header. The other lines are split on
   * <code>", "</code> and a fixed-length prefix is removed from each of the
   * first three fields. Problems are printed, not thrown: the scores read so
   * far are returned.
   * @param bdmFile URL of the BDM file
   * @return map from a pair of concepts, joined with <code>", "</code>,
   * to their BDM score; empty if <code>bdmFile</code> is null
   */
  public Map<String, Float> read(URL bdmFile) {
    Map<String, Float> scores = new HashMap<String, Float>();
    if (bdmFile == null) {
      Out.prln("There is no BDM file specified.");
      return scores;
    }
    BufferedReader bdmResultsReader = null;
    try {
      bdmResultsReader = new BomStrippingInputStreamReader(
        new FileInputStream(Files.fileFromURL(bdmFile)), "UTF-8");
      bdmResultsReader.readLine(); // skip the first line as the header
      for (String line = bdmResultsReader.readLine(); line != null;
           line = bdmResultsReader.readLine()) {
        String[] terms = line.split(", ");
        if (terms.length > 3) {
          // presumably the offsets skip the prefixes "key=", "response=" and
          // "bdm=" - TODO confirm against the BDM file format
          String oneCon = terms[0].substring(4);
          String anoCon = terms[1].substring(9);
          String bdmS = terms[2].substring(4);
          scores.put(oneCon + CONCEPT_SEPARATOR + anoCon,
            Float.valueOf(bdmS));
        } else {
          Out.prln("File " + bdmFile.toString() + " has incorrect format " +
            "for the line [" + line + "].");
        }
      }

    } catch(Exception e) {
      // best effort: keep the scores read before the problem
      Out.prln("There is something wrong with the BDM file.");
      e.printStackTrace();

    } finally {
      if (bdmResultsReader != null) {
        try {
          bdmResultsReader.close();
        } catch (IOException e) {
          e.printStackTrace();
        }
      }
    }
    return scores;
  }

  /**
   * Gets a row of formatted values for all the annotation types together:
   * the title, the number of correct, missing, spurious and partially correct
   * annotations then a precision, a recall and a F-Measure for each
   * recognised measure. The values have two fraction digits.
   * @param measures measure names as String; the text between the first
   * character and the first '-' is parsed as the beta of the F-Measure and
   * the end of the name selects the values: "strict", "lenient" or "average",
   * optionally followed by " BDM". Other names add nothing to the row.
   * @param title first element of the row
   * @return the row
   */
  public List<String> getMeasuresRow(Object[] measures, String title) {
    // one differ for all the types, used for the counts and non BDM measures
    List<AnnotationDiffer> differs = new ArrayList<AnnotationDiffer>(
      getDifferByTypeMap().values());
    AnnotationDiffer differ = new AnnotationDiffer(differs);
    NumberFormat f = NumberFormat.getInstance(Locale.ENGLISH);
    f.setMaximumFractionDigits(2);
    f.setMinimumFractionDigits(2);
    List<String> row = new ArrayList<String>();
    row.add(title);
    row.add(Integer.toString(differ.getCorrectMatches()));
    row.add(Integer.toString(differ.getMissing()));
    row.add(Integer.toString(differ.getSpurious()));
    row.add(Integer.toString(differ.getPartiallyCorrectMatches()));
    for (Object object : measures) {
      String measure = (String) object;
      double beta = Double.parseDouble(
        measure.substring(1, measure.indexOf('-')));
      if (measure.endsWith("strict")) {
        row.add(f.format(differ.getPrecisionStrict()));
        row.add(f.format(differ.getRecallStrict()));
        row.add(f.format(differ.getFMeasureStrict(beta)));
      } else if (measure.endsWith("strict BDM")) {
        row.add(f.format(getPrecisionStrictBdm()));
        row.add(f.format(getRecallStrictBdm()));
        row.add(f.format(getFMeasureStrictBdm(beta)));
      } else if (measure.endsWith("lenient")) {
        row.add(f.format(differ.getPrecisionLenient()));
        row.add(f.format(differ.getRecallLenient()));
        row.add(f.format(differ.getFMeasureLenient(beta)));
      } else if (measure.endsWith("lenient BDM")) {
        row.add(f.format(getPrecisionLenientBdm()));
        row.add(f.format(getRecallLenientBdm()));
        row.add(f.format(getFMeasureLenientBdm(beta)));
      } else if (measure.endsWith("average")) {
        row.add(f.format(differ.getPrecisionAverage()));
        row.add(f.format(differ.getRecallAverage()));
        row.add(f.format(differ.getFMeasureAverage(beta)));
      } else if (measure.endsWith("average BDM")) {
        row.add(f.format(getPrecisionAverageBdm()));
        row.add(f.format(getRecallAverageBdm()));
        row.add(f.format(getFMeasureAverageBdm(beta)));
      }
    }
    return row;
  }

  /**
   * Be careful, don't modify it.
   * That's not a copy because it would take too much memory.
   * @return differ by type map
   */
  public Map<String, AnnotationDiffer> getDifferByTypeMap() {
    return differByTypeMap;
  }

  /**
   * Gets the sum of BDM scores stored for an annotation type.
   * @return the sum or 0 if nothing is stored for this type
   */
  private float bdmOf(String type) {
    Float bdm = bdmByTypeMap.get(type);
    return (bdm == null) ? 0f : bdm.floatValue();
  }

  /**
   * Gets the BDM score of a pair of concepts, looked up in both orders
   * with the key format written by {@link #read(URL)}.
   * @return the score or 0 if the pair is unknown
   */
  private float conceptsBdm(String oneConcept, String anotherConcept) {
    Float score = bdmByConceptsMap.get(
      oneConcept + CONCEPT_SEPARATOR + anotherConcept);
    if (score == null) {
      score = bdmByConceptsMap.get(
        anotherConcept + CONCEPT_SEPARATOR + oneConcept);
    }
    return (score == null) ? 0f : score.floatValue();
  }

  /**
   * Gets the mean over the significant features of the BDM scores between
   * the feature values of two annotations.
   * @return the mean, 0 when the annotations have different types or one
   * has no features; a value that is not greater than 0 means no match
   */
  private float featuresBdm(Collection<?> significantFeatures,
                            Annotation keyAnnotation,
                            Annotation responseAnnotation) {
    if (!keyAnnotation.getType().equals(responseAnnotation.getType())) {
      return 0f; // types must be the same
    }
    if (keyAnnotation.getFeatures() == null
     || responseAnnotation.getFeatures() == null) {
      return 0f;
    }
    float sum = 0;
    for (Object feature : significantFeatures) {
      String keyLabel = (String)
        keyAnnotation.getFeatures().get(feature);
      String responseLabel = (String)
        responseAnnotation.getFeatures().get(feature);
      if (keyLabel != null && responseLabel != null) {
        sum += conceptsBdm(keyLabel, responseLabel);
      }
    }
    // NaN for an empty set of features, the caller only tests "> 0"
    return sum / significantFeatures.size();
  }

  /** Adds a differ to the list of its type, creating the list if needed. */
  private static void addDiffer(
      Map<String, List<AnnotationDiffer>> differsByTypeMap,
      String type, AnnotationDiffer differ) {
    List<AnnotationDiffer> differsType = differsByTypeMap.get(type);
    if (differsType == null) {
      differsType = new ArrayList<AnnotationDiffer>();
      differsByTypeMap.put(type, differsType);
    }
    differsType.add(differ);
  }

  /** Combines the list of AnnotationDiffer of each type in differByTypeMap. */
  private void combineDiffers(
      Map<String, List<AnnotationDiffer>> differsByTypeMap) {
    for (Map.Entry<String, List<AnnotationDiffer>> entry :
           differsByTypeMap.entrySet()) {
      differByTypeMap.put(entry.getKey(),
        new AnnotationDiffer(entry.getValue()));
    }
  }

  /**
   * F = ((beta^2 + 1) * P * R) / (beta^2 * P + R)
   * @return the F-Measure or 0.0 when the formula gives NaN
   */
  private static double fMeasure(double precision, double recall,
                                 double beta) {
    double betaSq = beta * beta;
    double answer = ((betaSq + 1) * precision * recall)
                  / (betaSq * precision + recall);
    if (Double.isNaN(answer)) {
      answer = 0.0;
    }
    return answer;
  }

  /** Separator of the two concepts in the keys of bdmByConceptsMap. */
  private static final String CONCEPT_SEPARATOR = ", ";

  /** Sum of the BDM scores of the paired annotations by annotation type. */
  protected Map<String, Float> bdmByTypeMap = new HashMap<String, Float>();
  /** File to read the BDM scores from, may be null. */
  protected URL bdmFileUrl;
  /** Combined annotation differ by annotation type. */
  protected Map<String, AnnotationDiffer> differByTypeMap =
    new HashMap<String, AnnotationDiffer>();
  /** BDM score by pair of concepts, null until the BDM file is read. */
  protected Map<String, Float> bdmByConceptsMap;
}