GATE.ac.uk - releases/gate-6.0-build3764-ALL/plugins/Alignment/src/gate/alignment/gui/actions/impl/AlignmentCache.java

package gate.alignment.gui.actions.impl;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.alignment.Alignment;
import gate.alignment.AlignmentActionInitializationException;
import gate.alignment.AlignmentException;
import gate.alignment.gui.AlignmentTask;
import gate.alignment.gui.FinishedAlignmentAction;
import gate.alignment.gui.PUAPair;
import gate.alignment.gui.PreDisplayAction;
import gate.alignment.utils.Regex;
import gate.util.BomStrippingInputStreamReader;
import gate.util.GateRuntimeException;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Alignment Cache is used for automatically aligning units of alignment
 * that were automatically aligned.
 *
 * @author niraj
 *
 */
public class AlignmentCache implements PreDisplayAction,
                           FinishedAlignmentAction {

  /**
   * Dictionary is loaded in memory
   */
  private Map<String, SortedSet<String>> dictionary = null;

  /**
   * What feature to use to store in the dictionary.  User can provide his/her
   * own feature
   */
  private String featureToUse = "string";

  /**
   * Writer we use to write to the dictionary file.
   */
  private BufferedWriter bw = null;

  /**
   * Is the init called?
   */
  boolean initCalled = false;

  // name of the output file where to store the alignment cache
  private String outputFileName;

  /**
   * the init method
   */
  public void init(String[] args) throws AlignmentActionInitializationException {
    if(initCalled) return;
    initCalled = true;

    if(args.length > 1) {
      featureToUse = args[1];
    }

    outputFileName = args[0];
    if(dictionary == null) {
      if(args != null && args.length > 0) {
        try {
          dictionary = new HashMap<String, SortedSet<String>>();

          if(new File(args[0]).exists()) {
            BufferedReader br = new BomStrippingInputStreamReader(
                    new FileInputStream(new File(args[0])), "UTF-8");

            String line = br.readLine();
            while(line != null) {
              if(line.trim().length() == 0 || line.trim().startsWith("#")) {
                line = br.readLine();
                continue;
              }

              String[] words = line.split("\t");
              SortedSet<String> tgtEntries = dictionary.get(words[0].trim());
              if(tgtEntries == null) {
                tgtEntries = new TreeSet<String>(new Comparator<String>() {
                  public int compare(String s1, String s2) {
                    String[] s1Array = s1.split("[ ]+");
                    String[] s2Array = s2.split("[ ]+");
                    int diff = s2Array.length - s1Array.length;
                    if(diff == 0) return s2.compareTo(s1);
                    else return diff;
                  }
                });
                dictionary.put(words[0], tgtEntries);
              }

              tgtEntries.add(words[1]);
              line = br.readLine();
            }
            br.close();
          }
        }
        catch(IOException ioe) {
          throw new AlignmentActionInitializationException(ioe);
        }
      }
    }
  }

  public void executePreDisplayAction(PUAPair pair) throws AlignmentException {

    AlignmentTask task = pair.getAlignmentTask();
    Alignment alignment = task.getAlignment();

    List<Annotation> srcTokens = null;
    List<Annotation> tgtTokens = null;

    List<Annotation> srcSet = pair.getSourceUnitAnnotations();
    List<Annotation> tgtSet = pair.getTargetUnitAnnotations();

    srcTokens = new ArrayList<Annotation>(srcSet);
    Collections.sort(srcTokens, new gate.util.OffsetComparator());
    tgtTokens = new ArrayList<Annotation>(tgtSet);
    Collections.sort(tgtTokens, new gate.util.OffsetComparator());

    HashMap<String, Annotation> sstOffsets = new HashMap<String, Annotation>();
    HashMap<String, Annotation> senOffsets = new HashMap<String, Annotation>();
    HashMap<String, Annotation> tstOffsets = new HashMap<String, Annotation>();
    HashMap<String, Annotation> tenOffsets = new HashMap<String, Annotation>();

    String srcString = "";
    for(int i = 0; i < srcTokens.size(); i++) {
      Annotation annot = srcTokens.get(i);
      srcString += (i == 0 ? "" : " ");
      sstOffsets.put(srcString.length() + "", annot);
      srcString += getText(annot, task.getSrcDoc());
      senOffsets.put(srcString.length() + "", annot);
    }

    String tgtString = "";
    for(int i = 0; i < tgtTokens.size(); i++) {
      Annotation annot = tgtTokens.get(i);
      tgtString += (i == 0 ? "" : " ");
      tstOffsets.put(tgtString.length() + "", annot);
      tgtString += getText(annot, task.getTgtDoc());
      tenOffsets.put(tgtString.length() + "", annot);
    }

    outerfor: for(String s : dictionary.keySet()) {
      Pattern p = Pattern.compile(Regex.escape(s), Pattern.UNICODE_CASE);
      Matcher m = p.matcher(srcString);
      int startIndex = 0;
      int tStartIndex = 0;
      outer: while(m.find(startIndex)) {
        int start = m.start(0);
        int end = start + m.group(0).length();

        startIndex = end;
        // find the firstToken using start as a startOffset
        // find the lastToken using end as a endOffset
        Annotation aFirstToken = sstOffsets.get("" + start);
        if(aFirstToken == null) continue;

        Annotation aLastToken = senOffsets.get("" + end);
        if(aLastToken == null) continue;

        Set<String> tStrings = dictionary.get(s);
        if(tStrings == null || tStrings.isEmpty()) {
          System.out.println("Invalid entry in alignment cache:" + s);
          continue outerfor;
        }

        for(String t : tStrings) {
          Pattern tPat = Pattern.compile(Regex.escape(t), Pattern.UNICODE_CASE);
          Matcher tMat = tPat.matcher(tgtString);

          whileLoop: while(tMat.find(tStartIndex)) {
            int tStart = tMat.start(0);
            int tEnd = tStart + tMat.group(0).length();
            tStartIndex = tEnd;

            Annotation tFirstToken = tstOffsets.get("" + tStart);
            Annotation tLastToken = tenOffsets.get("" + tEnd);
            if(tFirstToken == null || tLastToken == null) {
              tStartIndex = tStart + 1;
              continue;
            }

            int sst = srcTokens.indexOf(aFirstToken);
            int sen = srcTokens.indexOf(aLastToken);

            int tst = tgtTokens.indexOf(tFirstToken);
            int ten = tgtTokens.indexOf(tLastToken);

            for(int k = sst; k <= sen; k++) {
              Annotation sAnnot = srcTokens.get(k);
              if(alignment.isAnnotationAligned(sAnnot)) continue whileLoop;
            }

            for(int k = tst; k <= ten; k++) {
              Annotation tAnnot = tgtTokens.get(k);
              if(alignment.isAnnotationAligned(tAnnot)) continue whileLoop;
            }

            for(int k = sst; k <= sen; k++) {
              Annotation sAnnot = srcTokens.get(k);
              for(int kk = tst; kk <= ten; kk++) {
                Annotation tAnnot = tgtTokens.get(kk);
                if(!alignment.areTheyAligned(sAnnot, tAnnot)) {
                  if(!alignment.isAnnotationAligned(sAnnot)) {
                    sAnnot.getFeatures().put(
                            Alignment.ALIGNMENT_METHOD_FEATURE_NAME,
                            "phraseAlignAction");
                  }

                  if(!alignment.isAnnotationAligned(tAnnot)) {
                    tAnnot.getFeatures().put(
                            Alignment.ALIGNMENT_METHOD_FEATURE_NAME,
                            "phraseAlignAction");
                  }

                  alignment.align(sAnnot, task.getSrcASName(),
                          task.getSrcDoc(), tAnnot, task.getTgtASName(), task
                                  .getTgtDoc());

                }
              }
            }
            continue outer;
          }
          tStartIndex = 0;
        }
      }
    }
  }

  private String getText(Annotation annot, Document document) {

    if(annot.getFeatures().containsKey(featureToUse)) {
      return annot.getFeatures().get(featureToUse).toString();
    }

    return document.getContent().toString().substring(
            annot.getStartNode().getOffset().intValue(),
            annot.getEndNode().getOffset().intValue()).toLowerCase();
  }

  public String getText(Set<Annotation> annotations, Document document,
          AnnotationSet set) {
    if(annotations == null || annotations.isEmpty()) return null;

    List<Annotation> annotsList = new ArrayList<Annotation>(annotations);
    Collections.sort(annotsList, new gate.util.OffsetComparator());
    String toReturn = "";
    List<Annotation> allAnnots = null;
    int indexOfAnnot = 0;

    for(int i = 0; i < annotsList.size(); i++) {
      Annotation annot = annotsList.get(i);
      if(i == 0) {
        allAnnots = new ArrayList<Annotation>(set.get(annot.getType()));
        Collections.sort(allAnnots, new gate.util.OffsetComparator());
        indexOfAnnot = allAnnots.indexOf(annot);
        if(annot.getFeatures().get(featureToUse) == null) {
          toReturn = document.getContent().toString().substring(
                  annot.getStartNode().getOffset().intValue(),
                  annot.getEndNode().getOffset().intValue());
        }
        else {
          toReturn = (String)annot.getFeatures().get(featureToUse);
        }
      }

      if(i != 0) {
        int tempIndex = allAnnots.indexOf(annot);
        if(indexOfAnnot + 1 != tempIndex) {
          toReturn += " [^ ]+ ";
        }
        else {
          toReturn += " ";
        }
        if(annot.getFeatures().get(featureToUse) == null) {
          toReturn += document.getContent().toString().substring(
                  annot.getStartNode().getOffset().intValue(),
                  annot.getEndNode().getOffset().intValue());
        }
        else {
          toReturn += (String)annot.getFeatures().get(featureToUse);
        }
        indexOfAnnot = tempIndex;
      }
    }
    return toReturn;
  }

  public void cleanup() {
    if(bw == null) {
      try {
        bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(
                new File(outputFileName), false), "UTF-8"));
      }
      catch(UnsupportedEncodingException e) {
        throw new GateRuntimeException(e);
      }
      catch(FileNotFoundException e) {
        throw new GateRuntimeException(e);
      }
    }

    try {
      for(String s : dictionary.keySet()) {
        for(String t : dictionary.get(s)) {
          bw.write(s + "\t" + t);
          bw.newLine();
          bw.flush();
        }
      }
    }
    catch(IOException ioe) {
      throw new GateRuntimeException(ioe);
    }
    finally {
      if(bw != null) {
        try {
          bw.close();
        }
        catch(IOException e) {
          throw new GateRuntimeException(e);
        }
      }
    }
  }

  public String getToolTip() {
    return "Alignment Cache";
  }

  public void executeFinishedAlignmentAction(PUAPair pair)
          throws AlignmentException {

    // task
    AlignmentTask task = pair.getAlignmentTask();

    AnnotationSet sas = task.getSrcAS();
    AnnotationSet tas = task.getTgtAS();

    Alignment alignment = task.getAlignment();

    Set<Annotation> consideredAnnots = new HashSet<Annotation>();
    for(Annotation srcAnnot : pair.getSourceUnitAnnotations()) {
      if(consideredAnnots.contains(srcAnnot)) {
        continue;
      }

      Set<Annotation> srcAlignedAnnots = new HashSet<Annotation>();
      srcAlignedAnnots.add(srcAnnot);
      Set<Annotation> tgtAlignedAnnots = alignment
              .getAlignedAnnotations(srcAnnot);
      if(tgtAlignedAnnots == null || tgtAlignedAnnots.isEmpty()) {
        continue;
      }

      for(Annotation tgtAnnot : tgtAlignedAnnots) {
        consideredAnnots.add(tgtAnnot);
        srcAlignedAnnots.addAll(alignment.getAlignedAnnotations(tgtAnnot));
      }

      String sourceText = getText(srcAlignedAnnots, task.getSrcDoc(), sas);
      String targetText = getText(tgtAlignedAnnots, task.getTgtDoc(), tas);

      if(dictionary.containsKey(sourceText)) {
        if(dictionary.get(sourceText).contains(targetText)) {
          return;
        }
      }

      SortedSet<String> sourceTexts = dictionary.get(sourceText);
      if(sourceTexts == null) {
        sourceTexts = new TreeSet<String>(new Comparator<String>() {
          public int compare(String s1, String s2) {
            String[] s1Array = s1.split("[ ]+");
            String[] s2Array = s2.split("[ ]+");
            return s2Array.length - s1Array.length;
          }
        });
        dictionary.put(sourceText, sourceTexts);
      }
      sourceTexts.add(targetText);
    }
  }
}