Log in Help
Print
Homereleasesgate-8.4-build5748-ALLpluginsDocumentNormalizersrcgatecreole 〉 DocumentNormalizer.java
 
/*
 * DocumentNoramlizaer.java
 *
 * Copyright (c) 2011-2013, The University of Sheffield.
 * 
 * This file is part of GATE (see http://gate.ac.uk/), and is free software,
 * Licensed under the GNU Library General Public License, Version 3, June 2007
 * (in the distribution as file licence.html, and also available at
 * http://gate.ac.uk/gate/licence.html).
 *
 * Mark A. Greenwood, 10/11/2011
 */

package gate.creole;

import gate.Resource;
import gate.corpora.DocumentContentImpl;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.util.InvalidOffsetException;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@CreoleResource(name = "Document normalizer",
    comment = "Normalize document content to remove \"smart quotes\" etc.",
    helpURL = "http://gate.ac.uk/userguide/sec:misc-creole:doc-normalizer")
public class DocumentNormalizer extends AbstractLanguageAnalyser {

  private static final long serialVersionUID = -6780562970645480555L;

  private List<Replacement> replacements = new ArrayList<Replacement>();

  private URL listURL;

  private String encoding;

  @CreoleParameter(defaultValue = "resources/replacements.lst",
    comment = "the file controlling the replacements to be made")
  public void setReplacementsURL(URL listURL) {
    this.listURL = listURL;
  }

  public URL getReplacementsURL() {
    return listURL;
  }

  @CreoleParameter(defaultValue = "UTF-8",
    comment = "The encoding of the replacements file")
  public void setEncoding(String encoding) {
    this.encoding = encoding;
  }

  public String getEncoding() {
    return encoding;
  }

  @Override
  public Resource init() throws ResourceInstantiationException {
    if(encoding == null)
      throw new ResourceInstantiationException("Encoding must be specified!");
    if(listURL == null)
      throw new ResourceInstantiationException(
              "URL of replacements file must be specified!");

    replacements.clear();

    try {
      BufferedReader in =
              new BufferedReader(new InputStreamReader(listURL.openStream(),
                      encoding));
      String from = in.readLine();
      while(from != null) {
        String to = in.readLine();

        if(to == null)
          throw new ResourceInstantiationException("Non-Matching Replacement!");

        replacements.add(new Replacement(Pattern.compile(from), to));

        from = in.readLine();
      }
    } catch(Exception e) {
      throw new ResourceInstantiationException(e);
    }

    return this;
  }

  @Override
  public void execute() throws ExecutionException {

    try {
      for(Replacement r : replacements) {

        String docContent = document.getContent().toString();

        Matcher m = r.from.matcher(docContent);

        String replacement = r.to;
        int rl = replacement.length();

        long offset = 0;

        while(m.find()) {

          long start = m.start() + offset;
          long end = m.end() + offset;

          document.edit(start, end, new DocumentContentImpl(replacement));

          offset += rl - (m.end() - m.start());

        }
      }
    } catch(InvalidOffsetException e) {
      throw new ExecutionException(e);
    }
  }

  private static class Replacement {
    protected Pattern from;

    protected String to;

    public Replacement(Pattern from, String to) {
      this.from = from;
      this.to = to;
    }

    @Override
    public String toString() {
      return from + " --> " + to;
    }
  }
}