/*
 * HashtagTokenizer.java
 * 
 * Copyright (c) 1995-2014, The University of Sheffield. See the file
 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 * 
 * This file is part of GATE (see http://gate.ac.uk/), and is free software,
 * licenced under the GNU Library General Public License, Version 2, June 1991
 * (in the distribution as file licence.html, and also available at
 * http://gate.ac.uk/gate/licence.html).
 * 
 * Mark A. Greenwood, 24 Jan 2014
 */
package gate.twitter;

import static gate.Utils.getAnnotationsAtOffset;
import static gate.Utils.stringFor;
import static org.apache.commons.lang.StringUtils.isAllLowerCase;
import static org.apache.commons.lang.StringUtils.isAllUpperCase;
import static org.apache.commons.lang.StringUtils.isAlpha;
import static org.apache.commons.lang.StringUtils.isNumeric;
import gate.Annotation;
import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.LanguageAnalyser;
import gate.Resource;
import gate.annotation.AnnotationSetImpl;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ExecutionInterruptedException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.InvalidOffsetException;

import java.net.URL;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

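/**
 * A processing resource that splits multi-word hashtags (for example
 * "#nowplaying" or "#GateRocks") into their component words: any existing
 * Token annotations over the hashtag are replaced by a punctuation Token
 * for the leading # and one Token per word. Candidate words come from a
 * hidden gazetteer, and a depth-first search picks the decomposition with
 * the fewest tokens; CamelCased hashtags the gazetteer cannot cover are
 * split at the case changes instead.
 *
 * A minimal usage sketch (assuming the Twitter plugin is loaded and the
 * document already has Hashtag annotations in the input set):
 *
 * <pre>
 * LanguageAnalyser tokenizer =
 *   (LanguageAnalyser)Factory.createResource("gate.twitter.HashtagTokenizer");
 * tokenizer.setDocument(doc);
 * tokenizer.execute();
 * </pre>
 */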
@CreoleResource(name = "Hashtag Tokenizer", icon = "HashtagTokenizer",
    comment = "Tokenizes Multi-Word Hashtags",
    helpURL = "http://gate.ac.uk/userguide/sec:social:twitter:hashtag")
public class HashtagTokenizer extends AbstractLanguageAnalyser {

  private static final long serialVersionUID = -7848183952807024913L;

  // A comparator that sorts annotations by length; longest first
  private static Comparator<Annotation> lengthComparator =
    new Comparator<Annotation>() {
      @Override
      public int compare(Annotation a1, Annotation a2) {
        long l1 = a1.getEndNode().getOffset() - a1.getStartNode().getOffset();
        long l2 = a2.getEndNode().getOffset() - a2.getStartNode().getOffset();

        // avoid the int overflow risk of returning (int)(l2 - l1)
        return Long.compare(l2, l1);
      }
    };

  // the gazetteer that looks for likely words
  private LanguageAnalyser gaz;

  // the URL from which the gazetteer is loaded
  private URL gazURL;

  // the names of the input and output annotation sets
  private String inputASName, outputASName;

  // a debug flag
  private Boolean debug = Boolean.FALSE;

  public Boolean getDebug() {
    return debug;
  }

  @CreoleParameter(defaultValue = "false")
  @RunTime
  @Optional
  public void setDebug(Boolean debug) {
    this.debug = debug;
  }

  public URL getGazetteerURL() {
    return gazURL;
  }

  @CreoleParameter(defaultValue = "resources/hashtag/gazetteer/lists.def")
  public void setGazetteerURL(URL gazURL) {
    this.gazURL = gazURL;
  }

  public String getInputASName() {
    return inputASName;
  }

  @CreoleParameter
  @RunTime
  @Optional
  public void setInputASName(String inputASName) {
    this.inputASName = inputASName;
  }

  public String getOutputASName() {
    return outputASName;
  }

  @CreoleParameter
  @RunTime
  @Optional
  public void setOutputASName(String outputASName) {
    this.outputASName = outputASName;
  }

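  /**
   * Creates the hidden gazetteer used to propose candidate words, or
   * reconfigures and re-initialises it if the PR is being re-initialised.
   */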
  @Override
  public Resource init() throws ResourceInstantiationException {

    // load and configure the hidden gazetteer (the hidden attribute keeps
    // it out of the resource tree in the GATE GUI)
    FeatureMap hidden = Factory.newFeatureMap();
    Gate.setHiddenAttribute(hidden, true);

    FeatureMap params = Factory.newFeatureMap();
    params.put("listsURL", gazURL);
    params.put("caseSensitive", Boolean.FALSE);

    // we need every possible match, including overlapping matches and
    // matches inside words, so the search can consider all decompositions
    params.put("longestMatchOnly", Boolean.FALSE);
    params.put("wholeWordsOnly", Boolean.FALSE);

    if(gaz == null) {
      gaz =
        (LanguageAnalyser)Factory.createResource(
          "gate.creole.gazetteer.DefaultGazetteer", params, hidden,
          "Hashtag Tokenizer Gazetteer");
    } else {
      gaz.setParameterValues(params);
      gaz.reInit();
    }

    return this;
  }

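  /**
   * Runs the gazetteer over the document and then, for each Hashtag
   * annotation, searches for the decomposition covering the whole tag with
   * the fewest tokens (falling back to a CamelCase split) before replacing
   * the original Token annotations with the new ones.
   */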
  @Override
  public void execute() throws ExecutionException {
    AnnotationSet inputAS = document.getAnnotations(inputASName);
    AnnotationSet outputAS = document.getAnnotations(outputASName);

    FeatureMap features = Factory.newFeatureMap();

    long startTime = System.currentTimeMillis();
    fireStatusChanged("Tokenizing Hashtags: " + document.getName());
    fireProgressChanged(0);
    int count = 0;
    
    // get all the lookups we are going to use for decomposition...
    AnnotationSet lookups = new AnnotationSetImpl(document);

    try {
      // run the gazetteer to produce the HashtagLookup annotations
      gaz.setParameterValue("annotationSetName", inputASName);
      gaz.setDocument(document);
      gaz.execute();

      // get all the hashtags
      AnnotationSet hashtags = inputAS.get("Hashtag");

      for(Annotation hashtag : hashtags) {
        // for each hashtag in the document...

        AnnotationSet contained =
          inputAS.getContained(hashtag.getStartNode().getOffset(), hashtag
            .getEndNode().getOffset());

        // clear away any leftovers from the previous hashtag
        lookups.clear();

        // start with the HashtagLookup annotations from our own gazetteer
        lookups.addAll(contained.get("HashtagLookup"));

        // any other Lookups the user has generated
        lookups.addAll(contained.get("Lookup"));

        // and any number tokens
        features = Factory.newFeatureMap();
        features.put("kind", "number");
        lookups.addAll(contained.get("Token", features));

        // underscores are allowed in hashtags, so add those tokens as well
        features = Factory.newFeatureMap();
        features.put("string", "_");
        lookups.addAll(contained.get("Token", features));
        
        if(isInterrupted()) { throw new ExecutionInterruptedException(
          "The execution of the hashtag tokenizer has been abruptly interrupted!"); }

        // this will hold the best we have seen so far
        List<List<Annotation>> fewestTokens = new ArrayList<List<Annotation>>();

        // get all the lookups that start at the beginning of the hashtag
        // (offset + 1 skips over the leading #)
        List<Annotation> start =
          sort(getAnnotationsAtOffset(lookups, hashtag.getStartNode()
            .getOffset() + 1));

        for(Annotation a : start) {
          // for each lookup search for a valid tokenization
          List<List<Annotation>> found =
            search(lookups, hashtag.getEndNode().getOffset(), a);

          if(found != null) {
            // keep a contender if it uses no more tokens than the best so far
            if(fewestTokens.isEmpty()) {
              fewestTokens.addAll(found);
            } else if(found.get(0).size() == fewestTokens.get(0).size()) {
              fewestTokens.addAll(found);
            } else if(found.get(0).size() < fewestTokens.get(0).size()) {
              fewestTokens.clear();
              fewestTokens.addAll(found);
            }
          }
        }

        if(debug && fewestTokens.size() > 1) {
          System.out.println(stringFor(document, hashtag));
          display(fewestTokens);
        }

        if(fewestTokens.isEmpty()) {
          // if we didn't find any sensible tokenizations then let's see if
          // the hashtag is mixed case

          String tagText = stringFor(document, hashtag).substring(1);
          if("mixedCaps".equals(getTokenType(tagText)[1])) {
            // if we have a mixed case hahstag then let's assume it is
            // CamelCased and split it accordingly

            // TODO think about camel case which includes numbers

            // a list to hold the tokens
            List<Annotation> found = new ArrayList<Annotation>();

            // start looking for token breaks after the initial #
            long begin = hashtag.getStartNode().getOffset() + 1;

            for(String token : tagText
              .split("((?<=[a-z])(?=[A-Z]))|((?<=[A-Z])(?=[A-Z][a-z]))")) {
              // split at the case changes, e.g. "GateRocks" -> "Gate|Rocks",
              // "USAToday" -> "USA|Today" (fixed-width look-behinds; Java
              // rejects unbounded ones such as (?<=[A-Z]{2,}))

              // create the annotation in the Lookup set and add it to the found
              // list
              found.add(lookups.get(lookups.add(begin,
                (begin += token.length()), "CamelToken",
                Factory.newFeatureMap())));
            }

            // record the tokenization so we can process it later
            fewestTokens.add(found);
          }

        }

        if(!fewestTokens.isEmpty()) {
          // if we found a valid tokenization then...

          // remove any existing Token annotations
          inputAS.removeAll(inputAS.get("Token").getContained(
            hashtag.getStartNode().getOffset(),
            hashtag.getEndNode().getOffset()));

          // create a punctuation Token over the #
          features = Factory.newFeatureMap();
          features.put("string", "#");
          features.put("length", "1");
          features.put("kind", "punctuation");
          outputAS.add(hashtag.getStartNode().getOffset(), hashtag
            .getStartNode().getOffset() + 1, "Token", features);

          // let's assume that the first one we found is best
          int preferred = 0;

          for(int i = 0; i < fewestTokens.size(); ++i) {
            // check the candidates we have found and skip over any that
            // contain single letter words (mixed case words were already
            // rejected during the search)

            boolean okay = true;
            for(Annotation a : fewestTokens.get(i)) {
              // single letter words are not great
              if(a.getEndNode().getOffset() - a.getStartNode().getOffset() == 1)
                okay = false;
            }

            if(okay) {
              // a tokenization that contains no single letter words is
              // definitely the one to prefer
              preferred = i;
              break;
            }
          }

          for(Annotation a : fewestTokens.get(preferred)) {
            // for each new token...

            // find where it starts/ends and its length
            long startOffset = a.getStartNode().getOffset();
            long endOffset = a.getEndNode().getOffset();
            String length = Long.toString(endOffset - startOffset);

            // get the actual text
            String string = stringFor(document, a);

            // work out what kind of token it is and if it is a word
            // what its orthography is
            String[] tokenType = getTokenType(string);
            String kind = tokenType[0];
            String orth = tokenType[1];

            // create the new Token annotation
            features = Factory.newFeatureMap();
            features.put("string", string);
            features.put("length", length);
            features.put("kind", kind);
            if(orth != null) features.put("orth", orth);
            outputAS.add(startOffset, endOffset, "Token", features);

            if(debug) {
              // for debug purposes add a matching set of HashtagToken
              // annotations
              features = Factory.newFeatureMap();
              features.put("string", string);
              features.put("length", length);
              features.put("kind", kind);
              if(orth != null) features.put("orth", orth);
              outputAS.add(startOffset, endOffset, "HashtagToken", features);
            }
          }
        } else if(debug) {
          System.err.println(stringFor(document, hashtag));
          
          AnnotationSet tokens = inputAS.get("Token").getContained(
            hashtag.getStartNode().getOffset()+1,
            hashtag.getEndNode().getOffset());
          
          for(Annotation token : tokens) {
            features = Factory.newFeatureMap();
            features.putAll(token.getFeatures());
            outputAS.add(token.getStartNode().getOffset(), token.getEndNode()
              .getOffset(), "HashtagToken", features);
          }
        }

        fireProgressChanged(count++ * 100 / hashtags.size());
      }

      fireProcessFinished();
      fireStatusChanged("Hashtags in " +
        document.getName() +
        " tokenized in " +
        NumberFormat.getInstance().format(
          (double)(System.currentTimeMillis() - startTime) / 1000) +
        " seconds!");
    } catch(InvalidOffsetException e) {
      throw new ExecutionException(e);
    } catch(ResourceInstantiationException e) {
      throw new ExecutionException(e);
    } finally {
      gaz.setDocument(null);

      if(!debug) {
        inputAS.removeAll(inputAS.get("HashtagLookup"));
      }
    }
  }

  @Override
  public void cleanup() {
    Factory.deleteResource(gaz);
  }

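  /**
   * Classifies a token string, returning a two element array of {kind,
   * orth}: kind is "word", "number", "punctuation" or "symbol", and orth
   * (null unless the kind is "word") is "lowercase", "allCaps",
   * "upperInitial" or "mixedCaps", mirroring the features produced by the
   * standard GATE tokeniser.
   */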
  private static String[] getTokenType(String string) {
    String kind = "symbol";
    String orth = null;
    if(isAlpha(string)) {
      kind = "word";

      if(isAllLowerCase(string))
        orth = "lowercase";
      else if(isAllUpperCase(string))
        orth = "allCaps";
      else if(isAllUpperCase(string.substring(0, 1)) &&
        isAllLowerCase(string.substring(1)))
        orth = "upperInitial";
      else orth = "mixedCaps";
    } else if(isNumeric(string)) {
      kind = "number";
    } else if(string.matches("(\\p{Punct})+")) {
      kind = "punctuation";
    }

    return new String[]{kind, orth};
  }

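  /**
   * Debug helper that prints each candidate tokenization, one per line.
   */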
  private void display(List<List<Annotation>> found) {
    for(List<Annotation> tokens : found) {
      System.out.print("   ");
      for(Annotation token : tokens) {
        System.out.print(stringFor(document, token) + " ");
      }
      System.out.print("\n");
    }
  }

  /**
   * Depth first search through a set of annotations for contiguous
   * sequences that exactly cover the span from the start of the given token
   * to endOffset. Returns every sequence that ties for the fewest tokens,
   * or null if the span cannot be covered (or the starting token is mixed
   * case).
   **/
  private List<List<Annotation>> search(AnnotationSet lookups, Long endOffset,
                                        Annotation token)
    throws InvalidOffsetException {

    if("mixedCaps"
      .equals(getTokenType(stringFor(lookups.getDocument(), token))[1]))
      return null;

    List<List<Annotation>> shortest = new ArrayList<List<Annotation>>();

    if(token.getEndNode().getOffset().equals(endOffset)) {
      // if the token we are starting from ends at the right place then we
      // can stop and just return the single token as the result
      List<Annotation> found = new ArrayList<Annotation>();
      found.add(token);

      shortest.add(found);
      return shortest;
    }

    if(endOffset - token.getEndNode().getOffset() > 1) {
      // if there are two or more characters after this token then...

      // get the last character of this token plus the rest of the hashtag
      String rest =
        lookups.getDocument().getContent()
          .getContent(token.getEndNode().getOffset() - 1, endOffset).toString();

      if(rest.substring(1).matches(rest.substring(0, 1) + "+")) {
        // if the rest of the hashtag just repeats the last letter of this
        // token (e.g. "#winnnnn") then someone has leant on the keyboard
        // for emphasis

        // so... extend the current token to include the rest of the hashtag
        Annotation newToken =
          lookups.get(lookups.add(token.getStartNode().getOffset(), endOffset,
            "HashtagLookup", Factory.newFeatureMap()));

        // return this extended token
        List<Annotation> found = new ArrayList<Annotation>();
        found.add(newToken);

        shortest.add(found);
        return shortest;
      }
    }

    // get all the annotations starting where the current one ends, which
    // could be used to cover the next bit of the hashtag
    List<Annotation> next =
      sort(getAnnotationsAtOffset(lookups, token.getEndNode().getOffset()));

    // if there aren't any annotations we can use then return null as we can
    // never fully cover the range from this point
    if(next == null || next.isEmpty()) return null;

    for(Annotation a : next) {
      // use each possible annotation to start a new search
      List<List<Annotation>> part = search(lookups, endOffset, a);

      if(part != null) {
        if(shortest.isEmpty()) {
          shortest.addAll(part);
        } else if(part.get(0).size() == shortest.get(0).size()) {
          shortest.addAll(part);
        } else if(part.get(0).size() < shortest.get(0).size()) {
          shortest.clear();
          shortest.addAll(part);
        }
      }
    }

    // if we didn't find a match then return null
    if(shortest.isEmpty()) return null;

    // add the token we started from to the beginning of the match
    for(List<Annotation> found : shortest) {
      found.add(0, token);
    }

    // return the match we found
    return shortest;
  }

  /**
   * Returns a list in which the Annotations are sorted by length, longest
   * first, keeping only one annotation of each length (the input comes from
   * getAnnotationsAtOffset, so equal length implies an identical span).
   **/
  private List<Annotation> sort(AnnotationSet annotations) {

    List<Annotation> sorted = new ArrayList<Annotation>();

    if(annotations == null || annotations.isEmpty()) return sorted;

    sorted.addAll(annotations);
    Collections.sort(sorted, lengthComparator);

    // keep only the first annotation of each length; since the input comes
    // from getAnnotationsAtOffset, annotations of equal length share a span

    List<Annotation> filtered = new ArrayList<Annotation>();
    long length = -1;
    for(Annotation a : sorted) {
      long al = a.getEndNode().getOffset() - a.getStartNode().getOffset();
      if(length == -1 || al != length) {
        filtered.add(a);
        length = al;
      }
    }

    return filtered;
  }
}