// GATE.ac.uk - wiki/code-repository/src/andrewgolightly/nlp/gate/prs/Goldfish.java

/**
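 * Counts the number of times the word "Goldfish" appears in each sentence. That total is
 * added as a "Goldfish Count" feature to every sentence annotation, and every matching
 * token is additionally marked with a "Goldfish" annotation in the output annotation set.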
 *
 * Also adds summary information to each document, namely: 
 *
 *                    - total number of characters in document
 *                    - total number of tokens in document
 *                    - total number of words in document
 *                    - total number of sentences
 *                    - total "Goldfish" count 
 *
 * @author Andrew Golightly (acg4@cs.waikato.ac.nz)
 *         -- last updated 18/05/2003
 */

package andrewgolightly.nlp.gate.prs;

import gate.Annotation;
import gate.AnnotationSet;
import gate.creole.ExecutionException;
import gate.creole.ANNIEConstants;

import java.util.Iterator;

/**
 * Processing resource that counts tokens whose string is exactly "Goldfish".
 *
 * For every sentence annotation in the input annotation set it stores the
 * number of matching tokens as the sentence feature "Goldfish Count", and it
 * adds a "Goldfish" annotation over each matching token to the output
 * annotation set. It also writes summary counts (characters, tokens,
 * sentences, words, total "Goldfish" count) as string-valued document
 * features.
 */
public class Goldfish extends gate.creole.AbstractLanguageAnalyser {

  /** Token string that is counted; also used as the type of the annotations created. */
  private static final String GOLDFISH = "Goldfish";

  /** Names of the annotation sets to read from / write to; null or empty selects the default set. */
  private String inputASname, outputASname;

  /** @return the name of the annotation set that tokens and sentences are read from */
  public String getinputASname() {
    return inputASname;
  }

  /** @param inputASname name of the annotation set to read from; null or empty selects the default set */
  public void setinputASname(String inputASname) {
    this.inputASname = inputASname;
  }

  /** @return the name of the annotation set that "Goldfish" annotations are added to */
  public String getoutputASname() {
    return outputASname;
  }

  /** @param outputASname name of the annotation set to write to; null or empty selects the default set */
  public void setoutputASname(String outputASname) {
    this.outputASname = outputASname;
  }

  /**
   * Computes the document summary features, the per-sentence
   * "Goldfish Count" feature and the "Goldfish" annotations.
   *
   * @throws ExecutionException if no document is set, if the input set
   *         yields null for the Token or Sentence annotation type, or if a
   *         "Goldfish" annotation cannot be added because of invalid offsets
   */
  public void execute() throws ExecutionException {
    gate.Document doc = getDocument();
    if(doc == null) {
      throw new ExecutionException("No document to process!");
    }

    // NOTE(review): this removes EVERY existing document feature, not only the
    // ones written below. Kept unchanged because callers may rely on it --
    // confirm whether it is really intended.
    doc.getFeatures().clear();

    AnnotationSet inputAnnSet = annotationSetFor(doc, inputASname);
    AnnotationSet outputAnnSet = annotationSetFor(doc, outputASname);

    putCount(doc, "Number of characters",
            doc.getContent().toString().length());

    // Explicit null checks replace the former catch(NullPointerException)
    // blocks, so an unrelated NPE is no longer reported as a missing
    // prerequisite.
    AnnotationSet tokens = inputAnnSet.get(ANNIEConstants.TOKEN_ANNOTATION_TYPE);
    if(tokens == null) {
      throw new ExecutionException(
              "You need to run the English Tokenizer first!");
    }
    putCount(doc, "Number of tokens", tokens.size());

    AnnotationSet sentences = inputAnnSet.get(
            ANNIEConstants.SENTENCE_ANNOTATION_TYPE);
    if(sentences == null) {
      throw new ExecutionException(
              "You need to run the Sentence Splitter first!");
    }
    putCount(doc, "Number of sentences", sentences.size());

    int wordCount = 0;
    int totalGoldfishCount = 0;

    // Raw iterators with casts are kept deliberately: the file uses no
    // generics, so this does not assume a generic AnnotationSet.
    Iterator sentenceIterator = sentences.iterator();
    while(sentenceIterator.hasNext()) {
      Annotation sentenceAnnotation = (Annotation)sentenceIterator.next();
      AnnotationSet sentenceTokens = inputAnnSet.get(
              ANNIEConstants.TOKEN_ANNOTATION_TYPE,
              sentenceAnnotation.getStartNode().getOffset(),
              sentenceAnnotation.getEndNode().getOffset());

      int sentenceGoldfishCount = 0;

      // A sentence for which no token set is returned simply contributes zero.
      if(sentenceTokens != null) {
        Iterator tokenIterator = sentenceTokens.iterator();
        while(tokenIterator.hasNext()) {
          Annotation tokenAnnotation = (Annotation)tokenIterator.next();

          // Constant-first equals: a token lacking the feature is skipped
          // instead of raising a NullPointerException.
          if("word".equals(tokenAnnotation.getFeatures().get(
                  ANNIEConstants.TOKEN_KIND_FEATURE_NAME))) {
            wordCount++;
          }

          if(GOLDFISH.equals(tokenAnnotation.getFeatures().get(
                  ANNIEConstants.TOKEN_STRING_FEATURE_NAME))) {
            try {
              outputAnnSet.add(tokenAnnotation.getStartNode().getOffset(),
                      tokenAnnotation.getEndNode().getOffset(), GOLDFISH,
                      gate.Factory.newFeatureMap());
            }
            catch(gate.util.InvalidOffsetException ioe) {
              throw new ExecutionException(ioe);
            }
            sentenceGoldfishCount++;
          }
        }
      }

      // Unlike the document-level features, this one is stored as an Integer.
      sentenceAnnotation.getFeatures().put("Goldfish Count",
              Integer.valueOf(sentenceGoldfishCount));

      totalGoldfishCount += sentenceGoldfishCount;
    }

    // Only tokens retrieved for some sentence span are counted as words.
    putCount(doc, "Number of words", wordCount);
    putCount(doc, "Total \"Goldfish\" count", totalGoldfishCount);
  }

  /** Returns the named annotation set of doc, or the default set when name is null or empty. */
  private static AnnotationSet annotationSetFor(gate.Document doc, String name) {
    if(name == null || name.length() == 0) {
      return doc.getAnnotations();
    }
    return doc.getAnnotations(name);
  }

  /** Stores count under featureName in the document features, as a decimal string. */
  private static void putCount(gate.Document doc, String featureName, int count) {
    doc.getFeatures().put(featureName, Integer.toString(count));
  }
}