// Log in | Help | Print
// Home / releases / gate-8.0-build4825-ALL / plugins / Crowd_Sourcing / src / gate / crowdsource / rest / CrowdFlowerClient.java
 
/*
 *  CrowdFlowerClient.java
 *
 *  Copyright (c) 1995-2014, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 3, June 2007 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *  
 *  $Id: CrowdFlowerClient.java 17796 2014-04-10 12:28:34Z ian_roberts $
 */
package gate.crowdsource.rest;

import static gate.crowdsource.CrowdFlowerConstants.*;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.Utils;
import gate.util.GateRuntimeException;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.log4j.Logger;

import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;

/**
 * Thin client for the parts of the CrowdFlower REST API used to create
 * classification and annotation jobs, add units to them and fetch the
 * judgments for a unit. Every request is sent to a URL built from
 * {@link #CF_ENDPOINT} and carries the API key as the <code>key</code>
 * query parameter.
 */
public class CrowdFlowerClient {

  private static final Logger log = Logger.getLogger(CrowdFlowerClient.class);

  /** Base URL to which every request URI is appended. */
  public static final String CF_ENDPOINT = "https://api.crowdflower.com/v1";

  /** Classpath resource (relative to this class) sent as the job CSS. */
  private static final String CSS_RESOURCE = "gate-crowdflower.css";

  /** Classpath resource (relative to this class) sent as the job JS. */
  private static final String JS_RESOURCE = "gate-crowdflower.js";

  /** API key appended to every request URL. */
  private final String apiKey;

  /**
   * Create a client that authenticates with the given API key.
   * 
   * @param apiKey the CrowdFlower API key
   */
  public CrowdFlowerClient(String apiKey) {
    this.apiKey = apiKey;
  }

  /**
   * Create a new entity classification job on CrowdFlower.
   * 
   * @param title the job title
   * @param instructions the instructions
   * @param caption a caption for the answer form, which can include an
   *          {{entity}} placeholder which will be replaced by the text
   *          of the entity being classified (e.g.
   *          "choose the most appropriate description for {{entity}}")
   * @param commonOptions common options that should be presented for
   *          all tasks, in addition to the specific options for that
   *          task extracted from the entity annotation. This is
   *          expressed as a List of [value, description] two-element
   *          Lists, one for each option. May be null, in which case no
   *          common options are added.
   * @return the newly created job ID.
   * @throws IOException if the CSS resource cannot be loaded or the
   *           HTTP request fails
   * @throws GateRuntimeException if the response does not contain a
   *           job ID
   */
  public long createClassificationJob(String title, String instructions,
          String caption, List<List<String>> commonOptions) throws IOException {
    log.debug("Creating classification job");
    log.debug("title: " + title);
    log.debug("instructions: " + instructions);
    log.debug("caption: " + caption);
    log.debug("options: " + commonOptions);
    // load the custom CSS that makes entity highlighting work
    String css = loadResource(CSS_RESOURCE);
    StringWriter cml = new StringWriter();

    // construct the CML with the specified caption and common radio
    // options
    cml.append("<h2 id=\"unit_text\">{{text}}</h2>\n\n" + "{% if detail %}\n"
            + "  <div class=\"well\">{{detail}}</div>\n"
            + "{% endif %}\n" + "<cml:radios validates=\"required\" label=\"");
    StringEscapeUtils.escapeXml(cml, caption);
    cml.append("\" name=\"answer\">\n" + "  {% for opt in options %}\n"
            + "    {% if opt.description %}\n"
            + "      {% assign desc = opt.description %}\n"
            + "    {% else %}\n" + "      {% assign desc = opt.value %}\n"
            + "    {% endif %}\n"
            + "    <cml:radio value=\"{{opt.value}}\" label=\"{{desc}}\" />\n"
            + "  {% endfor %}\n");
    if(commonOptions != null) {
      for(List<String> opt : commonOptions) {
        cml.append("  <cml:radio value=\"");
        StringEscapeUtils.escapeXml(cml, opt.get(0));
        cml.append("\" label=\"");
        StringEscapeUtils.escapeXml(cml, opt.get(1));
        cml.append("\" />\n");
      }
    }
    cml.append("</cml:radios>\n");

    log.debug("cml: " + cml.toString());

    log.debug("POSTing to CrowdFlower");
    JsonElement json =
            post("/jobs", "job[title]", title, "job[instructions]",
                    instructions, "job[cml]", cml.toString(), "job[css]", css);
    log.debug("CrowdFlower returned " + json);
    return extractId(json, "Failed to create CF job");
  }

  /**
   * <p>
   * Create a single unit in a classification job for the given
   * annotation. If the unit is created successfully, its ID will be
   * stored as a {@link Long} valued feature named <code>cf_unit</code>
   * on the target annotation.
   * </p>
   * 
   * <p>
   * The target annotation must have a feature named "options" whose
   * value is either a <code>Collection</code> of valid choices or a
   * <code>Map</code> where each key represents one option value and the
   * corresponding value is the human-readable description that will be
   * displayed to the annotator (e.g. the key could be an ontology
   * instance URI and the value its rdfs:label). The options will be
   * added to the unit in the order they are delivered by the iterator,
   * so if order is important you should use a collection or map with
   * predictable iteration order, e.g. {@link List} or
   * {@link LinkedHashMap}.
   * </p>
   * 
   * <p>
   * If the target annotation has a feature named "correct" then it will
   * be treated as a gold-standard unit. The "correct" feature must
   * match one of the "options" (i.e. one of the <i>keys</i> if options
   * is a Map) or one of the common options defined when the job was
   * created - typically things like "none" (none of the available
   * options is correct) or "nae" (the target is not an entity). A
   * "reason" feature, if present on a gold unit, is sent as the
   * explanation of the correct answer. Both features are converted to
   * strings with <code>toString()</code>.
   * </p>
   * 
   * @param jobId the CrowdFlower job ID
   * @param doc the document containing the annotation
   * @param asName the annotation set containing the target annotation
   * @param context the "context" annotation, i.e. an annotation
   *          covering the complete snippet of text (e.g. the sentence)
   *          within which the target entity will be highlighted in the
   *          unit. This need not live in the same annotation set as the
   *          <code>target</code> but must cover the target's span.
   * @param target the annotation representing the entity to be
   *          classified.
   * @return the ID of the generated unit.
   * @throws IllegalArgumentException if the target's span does not lie
   *           within the context's span
   * @throws GateRuntimeException if the request fails or the response
   *           does not contain a unit ID
   */
  public long createClassificationUnit(long jobId, Document doc, String asName,
          Annotation context, Annotation target) {
    String text = Utils.stringFor(doc, context);
    String entity = Utils.stringFor(doc, target);
    String documentId = String.valueOf(doc.getLRPersistenceId());

    // insert highlight span into text
    int entityStart = (int)(Utils.start(target) - Utils.start(context));
    int entityEnd = entityStart + entity.length();
    if(entityStart < 0 || entityEnd > text.length()) {
      // fail with a meaningful message rather than an index error from
      // substring
      throw new IllegalArgumentException("Context annotation " + context
              + " does not cover target annotation " + target);
    }
    text =
            text.substring(0, entityStart) + "<span class=\"gate-entity\">"
                    + entity + "</span>" + text.substring(entityEnd);

    Object options = target.getFeatures().get("options");
    Object detail = target.getFeatures().get("detail");
    Object correctFeature = target.getFeatures().get("correct");
    Object reasonFeature = target.getFeatures().get("reason");
    String correctAnswer =
            (correctFeature == null) ? null : correctFeature.toString();
    String reason = (reasonFeature == null) ? null : reasonFeature.toString();

    // alternating field name / field value, in the order they are sent
    List<String> formData = new ArrayList<String>();
    addField(formData, "unit[data][text]", text);
    addField(formData, "unit[data][entity]", entity);
    addField(formData, "unit[data][documentId]", documentId);
    addField(formData, "unit[data][asName]", (asName == null ? "" : asName));
    addField(formData, "unit[data][annId]", String.valueOf(target.getId()));
    if(options instanceof Map) {
      for(Map.Entry<?, ?> opt : ((Map<?, ?>)options).entrySet()) {
        addField(formData, "unit[data][options][][value]",
                String.valueOf(opt.getKey()));
        addField(formData, "unit[data][options][][description]",
                String.valueOf(opt.getValue()));
      }
    } else if(options instanceof Collection) {
      for(Object opt : (Collection<?>)options) {
        addField(formData, "unit[data][options][][value]",
                String.valueOf(opt));
      }
    }

    if(detail != null) {
      addField(formData, "unit[data][detail]", detail.toString());
    }

    if(correctAnswer != null) {
      addField(formData, "unit[golden]", "true");
      addField(formData, "unit[data][answer_gold]", correctAnswer);
      if(reason != null) {
        addField(formData, "unit[data][answer_gold_reason]", reason);
      }
    }

    try {
      JsonElement json =
              post("/jobs/" + jobId + "/units",
                      formData.toArray(new String[formData.size()]));
      long unitId =
              extractId(json,
                      "CrowdFlower did not return a unit ID for annotation "
                              + target);
      // store the unit ID on the annotation
      target.getFeatures().put(UNIT_ID_FEATURE_NAME, Long.valueOf(unitId));
      return unitId;
    } catch(IOException e) {
      throw new GateRuntimeException("Could not add unit for annotation "
              + target, e);
    }
  }

  /**
   * Get the list of judgments for the given unit. If there are no
   * judgments, null is returned.
   * 
   * @param jobId the CrowdFlower job identifier
   * @param unitId the unit identifier
   * @return the "judgments" array for this unit, or null if none found.
   * @throws GateRuntimeException if the request fails or the response
   *           is not a JSON object
   */
  public JsonArray getJudgments(long jobId, long unitId) {
    try {
      String uri = "/jobs/" + jobId + "/units/" + unitId;
      JsonElement unitResponse = get(uri);
      if(!unitResponse.isJsonObject()) {
        throw new GateRuntimeException("Response from " + uri
                + " was not a JSON object");
      }
      // JsonObject.get returns null for a missing member, so both
      // lookups below must tolerate null as well as the wrong JSON type
      JsonElement results = unitResponse.getAsJsonObject().get("results");
      if(results == null || !results.isJsonObject()) {
        log.info("No results found for job " + jobId + " unit " + unitId);
        return null;
      }
      JsonElement judgments = results.getAsJsonObject().get("judgments");
      if(judgments != null && judgments.isJsonArray()) {
        return judgments.getAsJsonArray();
      } else {
        return null;
      }
    } catch(IOException e) {
      throw new GateRuntimeException("Could not retrieve unit details", e);
    }
  }

  /**
   * Create a named entity annotation job on CrowdFlower.
   * 
   * @param title the job title
   * @param instructions the instructions
   * @param caption a caption for the answer form, which should include
   *          the entity type to be annotated.
   * @param noEntitiesCaption a caption for the "there are no entities"
   *          checkbox.
   * @return the newly created job ID.
   * @throws IOException if the CSS or JavaScript resource cannot be
   *           loaded or the HTTP request fails
   * @throws GateRuntimeException if the response does not contain a
   *           job ID
   */
  public long createAnnotationJob(String title, String instructions,
          String caption, String noEntitiesCaption) throws IOException {
    log.debug("Creating annotation job");
    log.debug("title: " + title);
    log.debug("instructions: " + instructions);
    log.debug("caption: " + caption);
    log.debug("noEntitiesCaption: " + noEntitiesCaption);

    // load the CSS that makes highlighting work
    String css = loadResource(CSS_RESOURCE);

    // load the JavaScript that toggles the colour of tokens when
    // clicked
    String js = loadResource(JS_RESOURCE);

    // construct the CML
    StringWriter cml = new StringWriter();
    cml.append("<div class=\"gate-snippet\">\n"
            + "  <cml:checkboxes validates=\"required\" label=\"");
    StringEscapeUtils.escapeXml(cml, caption);
    cml.append("\" name=\"answer\">\n"
            + "    {% for tok in tokens %}\n"
            + "      <cml:checkbox label=\"{{ tok }}\" value=\"{{ forloop.index0 }}\" />\n"
            + "    {% endfor %}\n" + "  </cml:checkboxes>\n" + "</div>\n"
            + "{% if detail %}\n"
            + "  <div class=\"well\">{{detail}}</div>\n"
            + "{% endif %}\n" + "<div class=\"gate-no-entities\">\n"
            // TODO work out how to customize the validation error
            // message
            + "  <cml:checkbox name=\"noentities\" label=\"");
    StringEscapeUtils.escapeXml(cml, noEntitiesCaption);
    cml.append("\" value=\"1\"\n"
            + "      only-if=\"!answer:required\" validates=\"required\"/>\n"
            + "</div>\n");
    log.debug("cml: " + cml.toString());

    log.debug("POSTing to CrowdFlower");
    JsonElement json =
            post("/jobs", "job[title]", title, "job[instructions]",
                    instructions, "job[cml]", cml.toString(), "job[css]", css,
                    "job[js]", js);
    log.debug("CrowdFlower returned " + json);
    return extractId(json, "Failed to create CF job");
  }

  /**
   * Create a single unit for an entity annotation job.
   * 
   * @param jobId the CrowdFlower job ID
   * @param doc the document containing the annotation
   * @param asName the annotation set containing the snippet annotation
   * @param snippet an annotation covering the snippet of text that will
   *          be presented for annotation, typically a Sentence or Tweet
   * @param detail additional details to be shown to the annotator below
   *          the snippet, e.g. a list of URL links that they might want
   *          to follow for more information.  May be null, in which case
   *          no detail section will be added.
   * @param tokens annotations representing the individual substrings of
   *          the snippet that will be the atomic units of annotation.
   *          Typically these will be Token annotations. The supplied
   *          "tokens" should completely cover the non-whitespace
   *          content of the snippet, but need not cover all the
   *          intervening spaces.
   * @param correctAnnotations annotations representing the "correct"
   *          answer - if this parameter is not <code>null</code> then
   *          the unit will be considered as gold-standard data. This
   *          includes the case where an empty annotation set is
   *          provided, as this represents a gold snippet where the
   *          correct answer is that this snippet contains no entities.
   * @param goldReason for a gold-standard unit, the <em>reason</em> why
   *          the annotations should be considered correct. This will be
   *          shown to users as feedback if they get the gold unit
   *          wrong. Ignored for non-gold units.
   * @return the ID of the newly-created unit.
   * @throws GateRuntimeException if the request fails or the response
   *           does not contain a unit ID
   */
  public long createAnnotationUnit(long jobId, Document doc, String asName,
          Annotation snippet, String detail, AnnotationSet tokens,
          AnnotationSet correctAnnotations, String goldReason) {
    String documentId = String.valueOf(doc.getLRPersistenceId());
    List<Annotation> tokensList = Utils.inDocumentOrder(tokens);

    // for a gold unit, collect the indices (in document order) of all
    // tokens that lie entirely within one of the correct annotations
    Set<Integer> answerGold = null;
    if(correctAnnotations != null) {
      answerGold = new HashSet<Integer>();
      for(Annotation a : correctAnnotations) {
        for(int i = 0; i < tokensList.size(); i++) {
          Annotation tokenI = tokensList.get(i);
          if(Utils.start(tokenI) >= Utils.start(a)
                  && Utils.end(tokenI) <= Utils.end(a)) {
            answerGold.add(i);
          }
        }
      }
    }

    // alternating field name / field value, in the order they are sent
    List<String> formData = new ArrayList<String>();
    addField(formData, "unit[data][documentId]", documentId);
    addField(formData, "unit[data][asName]", (asName == null ? "" : asName));
    addField(formData, "unit[data][annId]", String.valueOf(snippet.getId()));
    for(Annotation tok : tokensList) {
      addField(formData, "unit[data][tokens][]", Utils.stringFor(doc, tok));
    }
    if(detail != null) {
      addField(formData, "unit[data][detail]", detail);
    }
    if(answerGold != null) {
      addField(formData, "unit[golden]", "true");
      if(answerGold.isEmpty()) {
        // gold answer is "this snippet contains no entities"
        addField(formData, "unit[data][noentities_gold]", "1");
        if(goldReason != null) {
          addField(formData, "unit[data][noentities_gold_reason]", goldReason);
        }
      } else {
        // send the gold token indices in ascending order
        Integer[] goldArray =
                answerGold.toArray(new Integer[answerGold.size()]);
        Arrays.sort(goldArray);
        for(Integer tokIndex : goldArray) {
          addField(formData, "unit[data][answer_gold][]",
                  String.valueOf(tokIndex));
        }
        if(goldReason != null) {
          addField(formData, "unit[data][answer_gold_reason]", goldReason);
        }
      }
    }

    try {
      JsonElement json =
              post("/jobs/" + jobId + "/units",
                      formData.toArray(new String[formData.size()]));
      return extractId(json,
              "CrowdFlower did not return a unit ID for annotation "
                      + snippet);
    } catch(IOException e) {
      throw new GateRuntimeException("Could not add unit for annotation "
              + snippet, e);
    }
  }

  /**
   * Send a POST request with URL-encoded form data.
   * 
   * @param uri the path to append to {@link #CF_ENDPOINT}
   * @param formData alternating field names and field values
   * @return the parsed JSON response
   * @throws IOException if the request fails
   */
  protected JsonElement post(String uri, String... formData) throws IOException {
    return request("POST", uri, formData);
  }

  /**
   * Send a GET request.
   * 
   * @param uri the path to append to {@link #CF_ENDPOINT}
   * @return the parsed JSON response
   * @throws IOException if the request fails
   */
  protected JsonElement get(String uri) throws IOException {
    return request("GET", uri);
  }

  /**
   * Send a request to CrowdFlower and parse the response body as JSON.
   * 
   * @param method the HTTP method
   * @param uri the path to append to {@link #CF_ENDPOINT}
   * @param formData alternating field names and field values to send
   *          as a URL-encoded request body; if null or empty no body is
   *          sent
   * @return the parsed JSON response
   * @throws IllegalArgumentException if <code>formData</code> has an
   *           odd number of elements or contains a null
   * @throws IOException if the connection fails or the server answers
   *           with an HTTP status of 400 or above
   */
  protected JsonElement request(String method, String uri, String... formData)
          throws IOException {
    if(log.isDebugEnabled()) {
      log.debug("URI: " + uri + ", formData: " + Arrays.toString(formData));
    }

    // annoyingly CrowdFlower doesn't support chunked streaming of
    // POSTs so we have to accumulate the content in a buffer, work
    // out its size and then POST with a Content-Length header.
    // Encoding first also means bad arguments are rejected before a
    // connection is opened.
    byte[] body = null;
    if(formData != null && formData.length > 0) {
      body = encodeFormData(formData);
    }

    // String.valueOf keeps the "key=null" behaviour for a null key
    URL cfUrl =
            new URL(CF_ENDPOINT + uri + "?key="
                    + URLEncoder.encode(String.valueOf(apiKey), "UTF-8"));
    HttpURLConnection connection = (HttpURLConnection)cfUrl.openConnection();
    connection.setRequestMethod(method);
    connection.setRequestProperty("Accept", "application/json");
    if(body != null) {
      // send the form data, URL encoded
      connection.setDoOutput(true);
      connection.setRequestProperty("Content-Type",
              "application/x-www-form-urlencoded");
      connection.setFixedLengthStreamingMode(body.length);
      OutputStream connectionStream = connection.getOutputStream();
      try {
        connectionStream.write(body);
      } finally {
        connectionStream.close();
      }
    }

    // Report HTTP errors ourselves: the exception HttpURLConnection
    // would throw from getInputStream() names the full request URL,
    // which includes the API key.
    int status = connection.getResponseCode();
    if(status >= HttpURLConnection.HTTP_BAD_REQUEST) {
      String errorBody = "";
      InputStream errorStream = connection.getErrorStream();
      if(errorStream != null) {
        try {
          errorBody = IOUtils.toString(errorStream, "UTF-8");
        } finally {
          errorStream.close();
        }
      }
      throw new IOException("CrowdFlower returned HTTP " + status + " for "
              + method + " " + uri
              + (errorBody.length() == 0 ? "" : ": " + errorBody));
    }

    // parse the response as JSON
    JsonParser parser = new JsonParser();
    Reader responseReader =
            new InputStreamReader(connection.getInputStream(), "UTF-8");
    try {
      return parser.parse(responseReader);
    } finally {
      responseReader.close();
    }
  }

  /**
   * Encode alternating field names and values as an
   * <code>application/x-www-form-urlencoded</code> request body.
   * 
   * @param formData alternating field names and field values
   * @return the encoded body as UTF-8 bytes
   * @throws IllegalArgumentException if <code>formData</code> has an
   *           odd number of elements or contains a null
   * @throws IOException if the encoding is not supported
   */
  private static byte[] encodeFormData(String... formData) throws IOException {
    if(formData.length % 2 != 0) {
      throw new IllegalArgumentException(
              "formData must contain name/value pairs but has "
                      + formData.length + " elements");
    }
    ByteArrayOutputStream buffer = new ByteArrayOutputStream(4096);
    PrintWriter writer =
            new PrintWriter(new OutputStreamWriter(buffer, "UTF-8"));
    try {
      for(int i = 0; i < formData.length; i += 2) {
        String fieldName = formData[i];
        String fieldValue = formData[i + 1];
        if(fieldName == null || fieldValue == null) {
          throw new IllegalArgumentException("Form field " + fieldName
                  + " has a null name or value");
        }
        // separator goes between pairs only, never before the first
        if(i > 0) {
          writer.write("&");
        }
        writer.write(URLEncoder.encode(fieldName, "UTF-8"));
        writer.write("=");
        writer.write(URLEncoder.encode(fieldValue, "UTF-8"));
      }
    } finally {
      writer.close();
    }
    return buffer.toByteArray();
  }

  /**
   * Append one name/value pair to a form data list.
   * 
   * @param formData the list to append to
   * @param name the field name
   * @param value the field value
   */
  private static void addField(List<String> formData, String name,
          String value) {
    formData.add(name);
    formData.add(value);
  }

  /**
   * Read the "id" member of a JSON object response.
   * 
   * @param json the response returned by CrowdFlower
   * @param failureMessage message for the exception thrown when no ID
   *          can be read
   * @return the value of the "id" member
   * @throws GateRuntimeException if the response is not a JSON object
   *           with a numeric "id" member; the underlying failure is
   *           attached as the cause
   */
  private static long extractId(JsonElement json, String failureMessage) {
    try {
      return json.getAsJsonObject().get("id").getAsLong();
    } catch(RuntimeException e) {
      throw new GateRuntimeException(failureMessage, e);
    }
  }

  /**
   * Load a classpath resource, resolved relative to this class, as a
   * UTF-8 string.
   * 
   * @param name the resource name
   * @return the resource content
   * @throws IOException if the resource does not exist or cannot be
   *           read
   */
  private static String loadResource(String name) throws IOException {
    InputStream stream = CrowdFlowerClient.class.getResourceAsStream(name);
    if(stream == null) {
      throw new IOException("Resource " + name
              + " not found on the classpath relative to "
              + CrowdFlowerClient.class.getName());
    }
    try {
      return IOUtils.toString(stream, "UTF-8");
    } finally {
      stream.close();
    }
  }
}