GATE.ac.uk - releases/gate-8.4-build5748-ALL/plugins/AlchemyAPI/src/gate/alchemyAPI/KeywordExtraction.java

/*
 * Copyright (c) 2009-2013, The University of Sheffield.
 * 
 * This file is part of GATE (see http://gate.ac.uk/), and is free software,
 * Licensed under the GNU Library General Public License, Version 3, June 2007
 * (in the distribution as file licence.html, and also available at
 * http://gate.ac.uk/gate/licence.html).
 */

package gate.alchemyAPI;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.Utils;
import gate.creole.ExecutionException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.RunTime;
import gate.util.InvalidOffsetException;

import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import com.alchemyapi.api.AlchemyAPI_KeywordParams;

/**
 * @author Mark A. Greenwood
 */
@CreoleResource(name = "AlchemyAPI: Keyword Extraction", comment = "Runs the AlchemyAPI Keyword Extraction service on a GATE document", icon = "Alchemy")
public class KeywordExtraction extends AbstractAlchemyPR {

  private static final long serialVersionUID = -8443994795704361590L;

  /**
   * Number of sentences to be sent to Alchemy API to process in one batch
   * Default is 10 sentences.
   */
  private Integer numberOfSentencesInBatch = 10;

  /**
   * This is the number of sentences that are used as sentences in context both
   * on left and right side and not annotated when sent as part of a batch
   * unless there's no more sentence to be considered as part of the context.
   */
  private Integer numberOfSentencesInContext = 5;

  protected String annotationType;

  /** debug */
  private boolean DEBUG = false;

  /**
   * Should be called to execute this PR on a document.
   */
  public void execute() throws ExecutionException {
    fireStatusChanged("Checking runtime parameters");
    progressChanged(0);
    // if no document provided
    if(document == null) { throw new ExecutionException("Document is null!"); }

    // obtain the content
    String documentContent = document.getContent().toString();

    if(documentContent.trim().length() == 0) return;

    // annotation set to use
    AnnotationSet set =
      outputASName == null || outputASName.trim().length() == 0 ? document
        .getAnnotations() : document.getAnnotations(outputASName);
    AnnotationSet inputAS =
      inputASName == null || inputASName.trim().length() == 0 ? document
        .getAnnotations() : document.getAnnotations(inputASName);
    // start time
    long startTime = System.currentTimeMillis();
    // all sentences
    List<Annotation> allSents = Utils.inDocumentOrder(inputAS.get("Sentence"));
    // if no sentence annotations report that
    if(allSents.size() == 0) { throw new ExecutionException(
      "Atleast one sentence must be provided"); }
    // Tokens
    List<Annotation> allTokens = Utils.inDocumentOrder(inputAS.get("Token"));

    // if no token annotations report that
    if(allTokens.size() == 0) { throw new ExecutionException(
      "Document must have Tokens"); }

    for(int i = 0; i < allSents.size(); i += numberOfSentencesInBatch) {
      if(interrupted) return;
      int endIndex = i + numberOfSentencesInBatch - 1;
      if(endIndex >= allSents.size()) endIndex = allSents.size() - 1;

      // we add numberOfSentencesInContext in left and right context if
      // they are available
      int contextStartIndex = i - numberOfSentencesInContext;

      if(contextStartIndex < 0) contextStartIndex = 0;
      int contextEndIndex = endIndex + numberOfSentencesInContext;
      if(contextEndIndex >= allSents.size()) {
        contextEndIndex = allSents.size() - 1;
      }

      // obtain the string to be annotated
      String sentString =
        Utils.stringFor(document, Utils.start(allSents.get(contextStartIndex)),
          Utils.end(allSents.get(contextEndIndex)));

      // the actual content
      String contentString =
        Utils.stringFor(document, Utils.start(allSents.get(i)),
          Utils.end(allSents.get(endIndex)));

      // progress
      progressChanged((int)(i * 90 / (double)allSents.size()));
      fireStatusChanged("Posted a part of document for processing");
      // now process the text
      // post the content to a service and obtain output
      // what we get back is the mathcing text which uri in them
      List<TextSpan> result = process(sentString.toString());
      fireStatusChanged("Copying annotations on the document");

      // since we don't have any offsets returned from the alchemy API we
      // first annotated longer spans and ignored the overlapping smaller spans
      Collections.sort(result);

      for(TextSpan r : result) {
        int index = -1;
        // annotate all occurrences of the matching Text
        while((index = contentString.indexOf(r.text, index + 1)) >= 0) {
          int stOffset = Utils.start(allSents.get(i)).intValue() + index;
          int enOffset = stOffset + r.text.length();
          try {
            if(set.getCovering(annotationType, (long)stOffset, (long)enOffset)
              .isEmpty()) {
              FeatureMap map = Factory.newFeatureMap();
              map.putAll(r.featureMap);
              set.add((long)stOffset, (long)enOffset, annotationType, map);
            }
          } catch(InvalidOffsetException e) {
            throw new ExecutionException(e);
          }
        }
      }
    }

    fireStatusChanged("deleting Mention annotations not in sync with Tokens");
    // get rid of annotations that donot sync with token boundaries
    AnnotationSet mentionSet = set.get("Mention");
    Set<Annotation> toDelete = new HashSet<Annotation>();
    for(Annotation aMention : mentionSet) {
      long start = Utils.start(aMention);
      long end = Utils.end(aMention);
      // obtain the contained token annotations
      AnnotationSet tokenSet = inputAS.get("Token").getContained(start, end);
      if(tokenSet.isEmpty()) {
        toDelete.add(aMention);
        continue;
      }
      // sort the tokens
      // start and end boundaries must be in sync
      List<Annotation> orderedTokens = Utils.inDocumentOrder(tokenSet);
      if(start != Utils.start(orderedTokens.get(0)).longValue() ||
        end != Utils.end(orderedTokens.get(orderedTokens.size() - 1))
          .longValue()) {
        toDelete.add(aMention);
        continue;
      }
    }
    // delete the mentions not in sync with tokens boundaries
    for(Annotation toDelAnnot : toDelete) {
      set.remove(toDelAnnot);
    }
    // progress
    progressChanged(100);
    // let everyone who is interested know that we have now finished
    fireStatusChanged(document.getName() +
      " tagged with AlchemyAPIServicePR in " +
      NumberFormat.getInstance().format(
        (double)(System.currentTimeMillis() - startTime) / 1000) + " seconds!");
  }

  /**
   * Using URLConnection to connect to alchemy API
   * 
   * @param text
   * @return
   * @throws ExecutionException
   */
  protected List<TextSpan> process(String text) throws ExecutionException {

    if(DEBUG) System.out.println("About to process: " + text);

    // the list of results we will evenutally return
    List<TextSpan> toReturn = new ArrayList<TextSpan>();

    // the params to configure the service
    AlchemyAPI_KeywordParams params = new AlchemyAPI_KeywordParams();

    // get the XML result doc back from the service
    Document doc = null;
    try {
      doc = alchemy.TextGetRankedKeywords(text, params);
    } catch(Exception e) {
      e.printStackTrace();
      throw new ExecutionException(e);
    }

    // convert the XML into TextSpan instances
    NodeList entitiesList = doc.getElementsByTagName("keyword");
    for(int i = 0; i < entitiesList.getLength(); i++) {
      Node n = entitiesList.item(i);
      NodeList children = n.getChildNodes();
      TextSpan r = new TextSpan();

      for(int j = 0; j < children.getLength(); j++) {
        Node cn = children.item(j);
        if(cn.getNodeName().equals("text")) {
          r.text = cn.getTextContent();
        } else if(cn.getNodeName().equals("relevance")) {
          r.featureMap.put("relevance", cn.getTextContent());
        }
      }

      if(DEBUG) System.out.println(r);

      toReturn.add(r);
    }
    return toReturn;
  }

  public Integer getNumberOfSentencesInBatch() {
    return numberOfSentencesInBatch;
  }

  @CreoleParameter(comment = "Number of sentences to be sent to Alchemy API to process in one batch")
  @RunTime
  public void setNumberOfSentencesInBatch(Integer numberOfSentencesInBatch) {
    this.numberOfSentencesInBatch = numberOfSentencesInBatch;
  }

  public Integer getNumberOfSentencesInContext() {
    return numberOfSentencesInContext;
  }

  @CreoleParameter(comment = "This is the number of sentences that are used as sentences in context both on left and right side and not annotated when sent as part of a less there are no more sentences to be considered as part of the context")
  @RunTime
  public void setNumberOfSentencesInContext(Integer numberOfSentencesInContext) {
    this.numberOfSentencesInContext = numberOfSentencesInContext;
  }

  public String getAnnotationType() {
    return annotationType;
  }

  @RunTime
  @CreoleParameter(defaultValue = "Keyword")
  public void setAnnotationType(String annotationType) {
    this.annotationType = annotationType;
  }
}