/*
 * Normaliser.java
 *
 * Copyright (c) 2011-2013, The University of Sheffield. See the file
 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 * This file is part of GATE (see http://gate.ac.uk/), and is free software,
 * licenced under the GNU Library General Public License, Version 2, June 1991
 * (in the distribution as file licence.html, and also available at
 * http://gate.ac.uk/gate/licence.html).
 */
package gate.twitter;

import java.io.*;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.zip.GZIPInputStream;

import gate.*;
import gate.creole.*;
import gate.creole.metadata.*;
import gate.util.Files;
import gate.util.GateRuntimeException;

import com.opencsv.CSVReader;

import pt.tumba.spell.*;

@CreoleResource(name = "Tweet Normaliser",
    comment = "Normalise text in tweets (convert spelling mistakes, "
        + "colloquialisms, typing variations and so on into standard English)",
    helpURL = "http://gate.ac.uk/userguide/sec:social:twitter:prs")
public class Normaliser extends AbstractLanguageAnalyser {

  private static final long serialVersionUID = -4139489923193104429L;

  @Override
  public Resource init() throws ResourceInstantiationException {
    if(this.orthURL == null)
      throw new ResourceInstantiationException("orth norm file not set");
    if(this.dictURL == null)
      throw new ResourceInstantiationException("dict file not set");

    checker = new SpellChecker();
    wordlist = new HashSet<String>();
    orthmappings = new HashMap<String, String>();

    // initialise the spell checker from the dictionary
    try(InputStream in = openPossiblyGzip(dictURL);
        InputStreamReader inReader = new InputStreamReader(in, dictEncoding);
        BufferedReader dictReader = new BufferedReader(inReader)) {
      checker.initialize(dictReader);
    } catch(Exception e) {
      throw new ResourceInstantiationException(
          "Error initializing spellchecker", e);
    }

    // read the dictionary a second time to build the in-vocabulary word list
    try(InputStream in = openPossiblyGzip(dictURL);
        InputStreamReader inReader = new InputStreamReader(in, dictEncoding);
        BufferedReader dictReader = new BufferedReader(inReader)) {
      String entry;
      while((entry = dictReader.readLine()) != null) {
        String[] tokens = entry.split(" : ");
        wordlist.add(tokens[0]);
      }
    } catch(Exception e) {
      throw new ResourceInstantiationException(e);
    }

    // populate the common normalisation lookup list
    try {
      readOrthMappings(orthURL);
    } catch(IOException e) {
      throw new ResourceInstantiationException("Error loading orth mappings", e);
    }

    dist = new LevenshteinDistance();
    return this;
  }
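
  /*
   * Sketch of the two layouts that readOrthMappings() below accepts for the
   * orth mapping resource (the entries shown here are hypothetical; the real
   * shipped list is resources/normaliser/orth.en.csv):
   *
   * Layout 1 - a two-column CSV of term,replacement mappings:
   *
   *   b4,before
   *   gr8,great
   *
   * Layout 2 - a single-column "definition" file of relative paths to other
   * lists, each of which is read recursively:
   *
   *   slang.csv
   *   abbreviations.csv
   */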
  /**
   * Read an orth mapping file. This file may be a "definition" file giving a
   * list of other files to read, in which case each of those files will be
   * read recursively.
   */
  protected void readOrthMappings(URL url) throws IOException {
    try(InputStream in = openPossiblyGzip(url);
        InputStreamReader reader = new InputStreamReader(in, orthEncoding);
        CSVReader csvReader = new CSVReader(reader)) {
      String[] line = csvReader.readNext();
      if(line == null) return;
      if(line.length < 2) {
        // definition file pointing to other lists
        do {
          String relpath = line[0].trim();
          if(!"".equals(relpath)) {
            readOrthMappings(new URL(url, relpath));
          }
        } while((line = csvReader.readNext()) != null);
      } else {
        // single list of term,replacement pairs
        do {
          orthmappings.put(line[0], line[1]);
        } while((line = csvReader.readNext()) != null);
      }
    }
  }

  protected InputStream openPossiblyGzip(URL url) throws IOException {
    InputStream in = url.openStream();
    // if the URL ends ".gz" assume it's compressed
    if(url.getPath().endsWith(".gz")) {
      in = new GZIPInputStream(in);
    }
    return in;
  }

  @Override
  public void execute() throws ExecutionException {
    /*
     * The goal is to correct in-vocabulary (IV) words to their standard
     * English form, and to skip over out-of-vocabulary (OOV) terms.
     *
     * General process: take the input AS; for each Token, read the string
     * feature, then:
     *   1. look for a direct conversion in the orth mapping list - if found,
     *      apply it, done;
     *   2. look for a spelling-correction candidate within the maximum edit
     *      distance - if found, convert, done;
     *   3. look for a double-metaphone candidate within the same distance -
     *      if found, convert, done;
     *   4. otherwise, assume the token is OOV rather than a mangled IV term,
     *      and ignore it.
     */
    if(document == null)
      throw new ExecutionException("No document to process!");
    fireStatusChanged("Normalising " + document.getName());

    AnnotationSet inputAS = document.getAnnotations(inputASName);
    AnnotationSet tokensAS = inputAS.get(TOKEN_ANNOTATION_TYPE);
    List<Annotation> tokenList = new ArrayList<Annotation>(tokensAS);

    // if there are any annotations
    if(tokensAS != null && tokensAS.size() > 0) {
      Iterator<Annotation> tokensIter = tokenList.iterator();
      while(tokensIter.hasNext()) {
        Annotation ann = tokensIter.next();
        String kind = (String)ann.getFeatures().get("kind");
        // skip unless kind == word - don't bother correcting known NEs,
        // punctuation and so on ("word".equals avoids an NPE when the
        // feature is missing)
        if("word".equals(kind)) {
          String initialText =
              (String)ann.getFeatures().get(initialTextFeature);
          // first: is it in our lookup list?
          // TODO: only trigger this if the tweet has a noise level above a
          // threshold, or based on language modelling
          String initialLower = initialText.toLowerCase();
          if(orthmappings.containsKey(initialLower)) {
            addFeatures(ann, origTextFeature, initialLower);
            addFeatures(ann, normTextFeature, orthmappings.get(initialLower));
            continue;
          }
          // skip words already in the dictionary
          if(wordlist.contains(initialText)) {
            continue;
          }
          // check orthography as a quick proper-noun filter
          String orth = "invalid";
          try {
            orth = (String)ann.getFeatures().get("orth");
          } catch(Exception e) {
            continue;
          }
          if(orth == null) {
            orth = "invalid";
          }
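          // The "orth" feature is set upstream by the GATE tokeniser; its
          // usual values are upperInitial, allCaps, lowercase and mixedCaps
          // (an assumption based on the standard ANNIE tokeniser - this PR
          // only tests for upperInitial). "invalid" is a local placeholder
          // for a missing feature.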
          // is it a proper noun (capitalised), or otherwise to be skipped?
          if((orth.equals("upperInitial")) || (orth.equals("invalid"))) {
            continue;
          }

          // we're going to try a full replacement: save the original text
          addFeatures(ann, origTextFeature, initialText);

          // do the mangling
          String normalisedText = initialText;

          // replacement checking starts here: ask the spell checker for the
          // best correction candidate
          String mostSimilar = checker.findMostSimilar(initialText);
          if(mostSimilar != null) {
            // if the most similar candidate is below the distance threshold,
            // make the substitution
            if(dist.modifiedLevenshteinDistance(initialText,
                mostSimilar) < maxDistance) {
              // don't bother just changing case - leave case as it is
              if(!initialText.toLowerCase()
                  .equals(mostSimilar.toLowerCase())) {
                normalisedText = mostSimilar;
              }
            }
            // save the normalised text
            addFeatures(ann, normTextFeature, normalisedText);
          }
        }
      }
    }
    fireProcessFinished();
  }

  protected void addFeatures(Annotation annot, String featureName,
      String featureValue) throws GateRuntimeException {
    String tempIASN = inputASName == null ? "" : inputASName;
    String tempOASN = outputASName == null ? "" : outputASName;
    if(tempIASN.equals(tempOASN)) {
      annot.getFeatures().put(featureName, featureValue);
      return;
    } else {
      int start = annot.getStartNode().getOffset().intValue();
      int end = annot.getEndNode().getOffset().intValue();
      // get the Token annotations from the output AS
      AnnotationSet outputAS = (outputASName == null)
          ? document.getAnnotations()
          : document.getAnnotations(outputASName);
      AnnotationSet annotations = outputAS.get(TOKEN_ANNOTATION_TYPE);
      if(annotations == null || annotations.size() == 0) {
        // add a new annotation
        FeatureMap features = Factory.newFeatureMap();
        features.put(featureName, featureValue);
        try {
          outputAS.add(Long.valueOf(start), Long.valueOf(end),
              TOKEN_ANNOTATION_TYPE, features);
        } catch(Exception e) {
          throw new GateRuntimeException("Invalid Offsets");
        }
      } else {
        // search for an annotation with the same start and end offsets
        List<Annotation> tempList =
            new ArrayList<Annotation>(annotations.get());
        boolean found = false;
        for(Annotation annotation : tempList) {
          if(annotation.getStartNode().getOffset().intValue() == start
              && annotation.getEndNode().getOffset().intValue() == end) {
            // this is the one
            annotation.getFeatures().put(featureName, featureValue);
            found = true;
            break;
          }
        }
        if(!found) {
          // add a new annotation
          FeatureMap features = Factory.newFeatureMap();
          features.put(featureName, featureValue);
          try {
            outputAS.add(Long.valueOf(start), Long.valueOf(end),
                TOKEN_ANNOTATION_TYPE, features);
          } catch(Exception e) {
            throw new GateRuntimeException("Invalid Offsets");
          }
        }
      }
    }
  }

  @Override
  public void reInit() throws ResourceInstantiationException {
    // reinitialization code
    init();
  }
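
  /*
   * Minimal embedded-usage sketch (a hypothetical illustration, assuming GATE
   * has been initialised and the Twitter plugin is on the CREOLE load path;
   * parameter names match the setters below):
   *
   *   Gate.init();
   *   Document doc = Factory.newDocument("c u b4 lunch");
   *   LanguageAnalyser normaliser = (LanguageAnalyser)Factory.createResource(
   *       "gate.twitter.Normaliser", Factory.newFeatureMap());
   *   normaliser.setDocument(doc);
   *   normaliser.execute();
   *
   * Note that the document must already carry Token annotations (e.g. from a
   * tokeniser) with "kind", "string" and "orth" features, since execute()
   * only rewrites existing Tokens.
   */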
  // getter and setter methods

  @RunTime
  @Optional
  @CreoleParameter(comment = "Input annotation set name", defaultValue = "")
  public void setInputASName(String inputASName) {
    this.inputASName = inputASName;
  }

  public String getInputASName() {
    return this.inputASName;
  }

  @RunTime
  @Optional
  @CreoleParameter(comment = "Output annotation set name", defaultValue = "")
  public void setOutputASName(String outputASName) {
    this.outputASName = outputASName;
  }

  public String getOutputASName() {
    return this.outputASName;
  }

  @RunTime
  @Optional
  @CreoleParameter(comment = "Feature on Token annotations in the input AS that contains the token string",
      defaultValue = TOKEN_STRING_FEATURE_NAME)
  public void setInitialTextFeature(String f) {
    this.initialTextFeature = f;
  }

  public String getInitialTextFeature() {
    return this.initialTextFeature;
  }

  @RunTime
  @Optional
  @CreoleParameter(comment = "Feature to which the normalised text should be saved",
      defaultValue = TOKEN_STRING_FEATURE_NAME)
  public void setNormTextFeature(String f) {
    this.normTextFeature = f;
  }

  public String getNormTextFeature() {
    return this.normTextFeature;
  }

  @RunTime
  @Optional
  @CreoleParameter(comment = "Feature to which the original text should be saved",
      defaultValue = "origString")
  public void setOrigTextFeature(String f) {
    this.origTextFeature = f;
  }

  public String getOrigTextFeature() {
    return this.origTextFeature;
  }

  @CreoleParameter(comment = "Path to JaSpell dictionary",
      defaultValue = "resources/normaliser/english.jaspell")
  public void setDictURL(URL dictURL) {
    this.dictURL = dictURL;
  }

  public URL getDictURL() {
    return this.dictURL;
  }

  @CreoleParameter(comment = "Character encoding used to read the dictionary file",
      defaultValue = "UTF-8")
  public void setDictEncoding(String encoding) {
    this.dictEncoding = encoding;
  }

  public String getDictEncoding() {
    return this.dictEncoding;
  }

  @CreoleParameter(comment = "Path to common normalisation terms list (for orthographic mappings, e.g. 'b4' to 'before'). "
      + "This can either be a single two-column CSV file where the first column is the term to be mapped and the second "
      + "column is the target, or a single-column file listing relative paths to other lists (which is useful if you want "
      + "to include more than one list of terms)",
      defaultValue = "resources/normaliser/orth.en.csv")
  public void setOrthURL(URL orthURL) {
    this.orthURL = orthURL;
  }

  public URL getOrthURL() {
    return this.orthURL;
  }

  @CreoleParameter(comment = "Character encoding used to read the orth files",
      defaultValue = "UTF-8")
  public void setOrthEncoding(String encoding) {
    this.orthEncoding = encoding;
  }

  public String getOrthEncoding() {
    return this.orthEncoding;
  }

  @RunTime
  @Optional
  @CreoleParameter(comment = "Maximum distance to consider (this determines the OOV/IV threshold).\n"
      + "Based on Levenshtein edit distance (with a case change downweighted to 0.5) and double-metaphone.",
      defaultValue = "2.0")
  public void setMaxDistance(String maxDistance) {
    this.maxDistance = Double.parseDouble(maxDistance);
  }

  public String getMaxDistance() {
    return Double.toString(this.maxDistance);
  }

  protected HashMap<String, String> orthmappings;

  protected HashSet<String> wordlist;

  protected SpellChecker checker;

  protected LevenshteinDistance dist;

  private String outputASName;

  private String inputASName;

  private String initialTextFeature;

  private String normTextFeature;

  private String origTextFeature;

  private URL dictURL;

  private URL orthURL;

  private String dictEncoding;

  private String orthEncoding;

  private double maxDistance;
}
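
/*
 * Worked example of the default maxDistance of 2.0 (illustrative figures
 * derived from the parameter comment above, not measured against the JaSpell
 * implementation): "helo" -> "hello" needs one insertion, so its distance of
 * 1.0 is under the threshold and the token is normalised; "Monday" ->
 * "monday" is a pure case change, weighted 0.5, but execute() skips
 * case-only substitutions anyway; a candidate at distance 2.0 or greater is
 * rejected and the token is treated as out-of-vocabulary.
 */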