GATE.ac.uk - releases/gate-8.4-build5748-ALL/plugins/Crowd_Sourcing/src/gate/crowdsource/ne/EntityAnnotationResultsImporter.java

/*
 *  EntityAnnotationResultsImporter.java
 *
 *  Copyright (c) 1995-2014, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 3, June 2007 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *  
 *  $Id: EntityAnnotationResultsImporter.java 18461 2014-11-17 17:51:09Z ian_roberts $
 */
package gate.crowdsource.ne;

import static gate.crowdsource.CrowdFlowerConstants.*;

import java.util.List;

import org.apache.log4j.Logger;

import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;

import gate.Annotation;
import gate.AnnotationSet;
import gate.FeatureMap;
import gate.Resource;
import gate.Utils;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ExecutionInterruptedException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.crowdsource.rest.CrowdFlowerClient;
import gate.util.InvalidOffsetException;

@CreoleResource(name = "Entity Annotation Results Importer",
comment = "Import judgments from a CrowdFlower job created by "
        + "the Entity Annotation Job Builder as GATE annotations.",
helpURL = "http://gate.ac.uk/userguide/sec:crowd:annotation:import")
public class EntityAnnotationResultsImporter
                                                extends
                                                  AbstractLanguageAnalyser {

  private static final long serialVersionUID = 3424823295729835240L;

  private static final Logger log = Logger
          .getLogger(EntityAnnotationResultsImporter.class);

  private String apiKey;

  private Long jobId;

  private String resultAnnotationType;

  private String resultASName;
  
  private String snippetAnnotationType;
  
  private String snippetASName;

  private String tokenAnnotationType;
  
  private String tokenASName;
  
  private Boolean annotateSpans;
  
  protected CrowdFlowerClient crowdFlowerClient;

  public String getApiKey() {
    return apiKey;
  }

  @CreoleParameter(comment = "CrowdFlower API key")
  public void setApiKey(String apiKey) {
    this.apiKey = apiKey;
  }

  public Long getJobId() {
    return jobId;
  }

  @RunTime
  @CreoleParameter
  public void setJobId(Long jobId) {
    this.jobId = jobId;
  }

  public String getResultAnnotationType() {
    return resultAnnotationType;
  }

  @RunTime
  @CreoleParameter
  public void setResultAnnotationType(String resultAnnotationType) {
    this.resultAnnotationType = resultAnnotationType;
  }

  public String getResultASName() {
    return resultASName;
  }

  @Optional
  @RunTime
  @CreoleParameter(defaultValue = "crowdResults")
  public void setResultASName(String resultASName) {
    this.resultASName = resultASName;
  }

  public String getSnippetAnnotationType() {
    return snippetAnnotationType;
  }

  @RunTime
  @CreoleParameter(defaultValue = "Sentence", comment = "Annotation type " +
  		"representing the snippets (one snippet = one unit)")
  public void setSnippetAnnotationType(String snippetAnnotationType) {
    this.snippetAnnotationType = snippetAnnotationType;
  }

  public String getSnippetASName() {
    return snippetASName;
  }

  @Optional
  @RunTime
  @CreoleParameter(comment = "Annotation set where the snippets can be found")
  public void setSnippetASName(String snippetASName) {
    this.snippetASName = snippetASName;
  }

  public String getTokenAnnotationType() {
    return tokenAnnotationType;
  }

  @RunTime
  @CreoleParameter(defaultValue = "Token",
          comment = "Annotation type representing the \"tokens\" - the atomic " +
              "units that workers have selected to mark entity annotations.")
  public void setTokenAnnotationType(String tokenAnnotationType) {
    this.tokenAnnotationType = tokenAnnotationType;
  }

  public String getTokenASName() {
    return tokenASName;
  }

  @Optional
  @RunTime
  @CreoleParameter(comment = "Annotation set where tokens can be found")
  public void setTokenASName(String tokenASName) {
    this.tokenASName = tokenASName;
  }

  public Boolean getAnnotateSpans() {
    return annotateSpans;
  }

  @RunTime
  @CreoleParameter(comment = "If true (the default), create one annotation for " +
  		"each contiguous run of marked tokens, if false, annotate each token " +
  		"separately.", defaultValue = "true")
  public void setAnnotateSpans(Boolean annotateSpans) {
    this.annotateSpans = annotateSpans;
  }

  @Override
  public Resource init() throws ResourceInstantiationException {
    if(apiKey == null || "".equals(apiKey)) {
      throw new ResourceInstantiationException("API Key must be set");
    }
    crowdFlowerClient = new CrowdFlowerClient(apiKey);
    return this;
  }

  @Override
  public void execute() throws ExecutionException {
    if(isInterrupted()) throw new ExecutionInterruptedException();
    interrupted = false;
    try {
      if(jobId == null || jobId.longValue() <= 0) {
        throw new ExecutionException("Job ID must be provided");
      }

      AnnotationSet tokens = getDocument().getAnnotations(tokenASName).get(tokenAnnotationType);
      AnnotationSet snippetAnnotations = getDocument().getAnnotations(snippetASName)
              .get(snippetAnnotationType);
      AnnotationSet resultAS = getDocument().getAnnotations(resultASName);
      List<Annotation> allSnippets = Utils.inDocumentOrder(snippetAnnotations);
      
      for(Annotation snippet : allSnippets) {
        if(isInterrupted()) throw new ExecutionInterruptedException();
        Object unitId = snippet.getFeatures().get(resultAnnotationType + "_unit_id");
        if(unitId != null) {
          if(!(unitId instanceof Long)) {
            unitId = Long.valueOf(unitId.toString());
          }
          // find any existing result annotations within the span of this snippet
          // so we can avoid creating another annotation from this judgment if
          // one already exists
          AnnotationSet existingResults =
                  Utils.getContainedAnnotations(resultAS, snippet,
                          resultAnnotationType);
          // tokens under this snippet
          List<Annotation> snippetTokens = Utils.inDocumentOrder(
                  Utils.getContainedAnnotations(tokens, snippet));
          
          JsonArray judgments =
                  crowdFlowerClient.getJudgments(jobId,
                          ((Long)unitId).longValue());
          
          if(judgments != null) {
            for(JsonElement judgmentElt : judgments) {
              JsonObject judgment = judgmentElt.getAsJsonObject();
              JsonObject data = judgment.getAsJsonObject("data");
              JsonArray answer = data.getAsJsonArray("answer");
              Long judgmentId = judgment.get("id").getAsLong();
              Double trust = judgment.get("trust").getAsDouble();
              Long workerId = judgment.get("worker_id").getAsLong();
              String comment = null;
              if(data.get("comment") != null) {
                if(data.get("comment").isJsonPrimitive()) {
                  comment = data.get("comment").getAsString().trim();
                  if("".equals(comment)) {
                    comment = null;
                  }
                }
              }
              if(answer != null && answer.size() > 0) {
                // judgment says there are some entities to annotate.  Look for
                // sequences of consecutive token indices and create one result
                // annotation for each such sequence
                int startTok = 0;
                int curTok = startTok;
                while(curTok < answer.size()) {
                  // we've reached the end of an annotation if either
                  // (a) we want to annotate each token individually anyway or 
                  // (b) we're on the last element of answer or
                  // (c) the next element is not this+1
                  if(!annotateSpans || curTok == answer.size() - 1
                          || (answer.get(curTok).getAsInt() + 1) != answer.get(curTok + 1).getAsInt()) {
                    Long startOffset = snippetTokens.get(answer.get(startTok).getAsInt()).getStartNode().getOffset();
                    Long endOffset = snippetTokens.get(answer.get(curTok).getAsInt()).getEndNode().getOffset();
                    startTok = curTok + 1;
                    // check whether there's already an annotation at this location for this judgment
                    AnnotationSet existingEntities = existingResults.getContained(startOffset, endOffset);
                    boolean found = false;
                    for(Annotation a : existingEntities) {
                      if(judgmentId.equals(a.getFeatures().get(JUDGMENT_ID_FEATURE_NAME))) {
                        found = true;
                        break;
                      }
                    }
                    if(!found) {
                      // no existing annotation found, create one
                      try {
                        FeatureMap features = Utils.featureMap(
                                JUDGMENT_ID_FEATURE_NAME, judgmentId,
                                "trust", trust,
                                "worker_id", workerId);
                        if(comment != null) {
                          features.put("comment", comment);
                        }
                        resultAS.add(startOffset, endOffset, resultAnnotationType, features);
                      } catch(InvalidOffsetException e) {
                        throw new ExecutionException("Invalid offset obtained from existing annotation!", e);
                      }
                    }
                  }
                  curTok++;
                }
              }
            }
          } else {
            log.warn("Unit " + unitId + " has no judgments");
          }

        } else {
          log.warn("Found " + snippetAnnotationType + " annotation with no "
                  + UNIT_ID_FEATURE_NAME + " feature, ignoring");
        }
      }

    } finally {
      interrupted = false;
    }
  }

}