Log in Help
Print
Homereleasesgate-5.1-build3431-ALLpluginsAlignmentsrcgatealignmentguiactionsimpl 〉 AlignmentCache.java
 
package gate.alignment.gui.actions.impl;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.alignment.Alignment;
import gate.alignment.AlignmentActionInitializationException;
import gate.alignment.AlignmentException;
import gate.alignment.gui.AlignmentEditor;
import gate.alignment.gui.PreDisplayAction;
import gate.alignment.gui.actions.impl.AbstractAlignmentAction;
import gate.compound.CompoundDocument;
import gate.util.GateRuntimeException;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Maintains alignment cache such that once aligned, user doesn't need to align 
 * it again.
 * @author Niraj Aswani
 *
 */
public class AlignmentCache extends AbstractAlignmentAction implements
                                                              PreDisplayAction {
  /**
   * Cache that remembers alignments done by the user.
   */
  private HashMap<String, SortedSet<String>> _cache = null;

  /**
   * feature to use for cache
   */
  private String featureToUse = "string";

  /**
   * Writer to write cache to a file
   */
  private BufferedWriter bw = null;

  int srcHighestLen = 1;

  int tgtHighestLen = 1;

  int srcMinLen = 10;

  int tgtMinLen = 10;

  /** args supplied in the actions.conf */
  String[] suppliedArgs = null;

  // is init called?
  boolean initCalled = false;

  /**
   * Init method called by the editor when alignment cache instance is created.
   */
  public void init(String[] args) throws AlignmentActionInitializationException {
    if(initCalled) return;
    initCalled = true;
    this.suppliedArgs = args;
    super.init(args);
    
    // has user provided a feature to use?
    if(args.length < 1) {
      throw new AlignmentActionInitializationException("expecting atleast one parameter: featureToUse");
    }
    
    featureToUse = args[0];
    
    // if user has provided a file with stored cache
    if(_cache == null) {
      _cache = new HashMap<String, SortedSet<String>>();
      
      // file where the cache is stored, if user has provided one
      if(args.length > 1) {
        try {
          //TODO this should be mapped to relative path
          if(new File(args[1]).exists()) {
            BufferedReader br =
                    new BufferedReader(new InputStreamReader(
                            new FileInputStream(new File(args[1])), "UTF-8"));

            String line = br.readLine();
            while(line != null) {
              if(line.trim().length() == 0 || line.trim().startsWith("#")) {
                line = br.readLine();
                continue;
              }

              String[] words = line.split("\t");
              SortedSet<String> tgtEntries = _cache.get(words[0].trim());
              if(tgtEntries == null) {
                tgtEntries = new TreeSet<String>(new Comparator<String>() {
                  public int compare(String s1, String s2) {
                    String[] s1Array = s1.split("[ ]+");
                    String[] s2Array = s2.split("[ ]+");
                    return s2Array.length - s1Array.length;
                  }
                });
                _cache.put(words[0], tgtEntries);
              }
              tgtEntries.add(words[1].trim());
              int srcLen = Integer.parseInt(words[2]);
              int tgtLen = Integer.parseInt(words[3]);
              if(srcLen > srcHighestLen) {
                srcHighestLen = srcLen;
              }

              if(srcMinLen > srcLen) {
                srcMinLen = srcLen;
              }

              if(tgtMinLen > tgtLen) {
                tgtMinLen = tgtLen;
              }

              if(tgtLen > tgtHighestLen) {
                tgtHighestLen = tgtLen;
              }

              line = br.readLine();
            }
            br.close();
          }
          
          // we'll rewrite this file at the end
          bw =
                  new BufferedWriter(new OutputStreamWriter(
                          new FileOutputStream(new File(args[1]), true),
                          "UTF-8"));
        } catch(IOException ioe) {
          throw new AlignmentActionInitializationException(ioe);
        }
      }
    }
  }

  private String getText(Annotation annot, Document document) {

    if(annot.getFeatures().containsKey(featureToUse)) { return annot
            .getFeatures().get(featureToUse).toString(); }

    return document.getContent().toString().substring(
            annot.getStartNode().getOffset().intValue(),
            annot.getEndNode().getOffset().intValue()).toLowerCase();
  }

  int lengthInNoOfWords = 1;

  public String getText(Set<Annotation> annotations, Document document,
          AnnotationSet set) {
    if(annotations == null || annotations.isEmpty()) return null;

    List<Annotation> annotsList = new ArrayList<Annotation>(annotations);
    Collections.sort(annotsList, new gate.util.OffsetComparator());
    String toReturn = "";
    List<Annotation> allAnnots = null;
    int indexOfAnnot = 0;

    lengthInNoOfWords = 0;

    for(int i = 0; i < annotsList.size(); i++) {
      lengthInNoOfWords++;
      Annotation annot = annotsList.get(i);
      if(i == 0) {
        allAnnots = new ArrayList<Annotation>(set.get(annot.getType()));
        Collections.sort(allAnnots, new gate.util.OffsetComparator());
        indexOfAnnot = allAnnots.indexOf(annot);
        if(annot.getFeatures().get(featureToUse) == null) {
          toReturn =
                  document.getContent().toString().substring(
                          annot.getStartNode().getOffset().intValue(),
                          annot.getEndNode().getOffset().intValue());
        } else {
          toReturn = (String)annot.getFeatures().get(featureToUse);
        }
      }

      if(i != 0) {
        int tempIndex = allAnnots.indexOf(annot);
        if(indexOfAnnot + 1 != tempIndex) {
          toReturn += " [^ ]+ ";
        } else {
          toReturn += " ";
        }
        if(annot.getFeatures().get(featureToUse) == null) {
          toReturn +=
                  document.getContent().toString().substring(
                          annot.getStartNode().getOffset().intValue(),
                          annot.getEndNode().getOffset().intValue());
        } else {
          toReturn += (String)annot.getFeatures().get(featureToUse);
        }
        indexOfAnnot = tempIndex;
      }
    }
    return toReturn;
  }

  public boolean invokeWithAlignAction() {
    return true;
  }

  public String getCaption() {
    return null;
  }

  public boolean invokeForAlignedAnnotation() {
    return false;
  }

  public boolean invokeForUnhighlightedUnalignedAnnotation() {
    return false;
  }

  public void cleanup() {
    if(bw != null) {
      try {
        bw.close();
      } catch(IOException ioe) {
        throw new GateRuntimeException(ioe);
      }
    }
  }

  public String getToolTip() {
    return "When user clicks on the Align button, "
            + "the selected annotations are recored in alignment cache"
            + " for future automatic correction.";
  }

  public void execute(AlignmentEditor editor, CompoundDocument document,
          Document srcDocument, String srcAS, Annotation srcAnnotation,
          Document tgtDocument, String tgtAS, Annotation tgtAnnotation)
          throws AlignmentException {

    // from it, obtain the alignment object
    Alignment alignment = document.getAlignmentInformation(editor.getAlignmentFeatureName());

    List<Annotation> srcTokens = null;
    List<Annotation> tgtTokens = null;

    AnnotationSet sAS = srcAS == null ? srcDocument.getAnnotations() : srcDocument.getAnnotations(srcAS);
    AnnotationSet tAS = srcAS == null ? tgtDocument.getAnnotations() : tgtDocument.getAnnotations(tgtAS);
    
    List<Annotation> srcSet = new ArrayList<Annotation>(sAS.getContained(srcAnnotation.getStartNode().getOffset(), srcAnnotation.getEndNode().getOffset()).get(editor.getSourceUnitOfAlignment()));
    List<Annotation> tgtSet = new ArrayList<Annotation>(tAS.getContained(tgtAnnotation.getStartNode().getOffset(), tgtAnnotation.getEndNode().getOffset()).get(editor.getTargetUnitOfAlignment()));

    srcTokens = new ArrayList<Annotation>(srcSet);
    Collections.sort(srcTokens, new gate.util.OffsetComparator());
    tgtTokens = new ArrayList<Annotation>(tgtSet);
    Collections.sort(tgtTokens, new gate.util.OffsetComparator());

    HashMap<String, Annotation> sstOffsets = new HashMap<String, Annotation>();
    HashMap<String, Annotation> senOffsets = new HashMap<String, Annotation>();
    HashMap<String, Annotation> tstOffsets = new HashMap<String, Annotation>();
    HashMap<String, Annotation> tenOffsets = new HashMap<String, Annotation>();

    String srcString = "";
    for(int i = 0; i < srcTokens.size(); i++) {
      Annotation annot = srcTokens.get(i);
      srcString += (i == 0 ? "" : " ");
      sstOffsets.put(srcString.length() + "", annot);
      srcString += getText(annot, srcDocument);
      senOffsets.put(srcString.length() + "", annot);
    }

    String tgtString = "";
    for(int i = 0; i < tgtTokens.size(); i++) {
      Annotation annot = tgtTokens.get(i);
      tgtString += (i == 0 ? "" : " ");
      tstOffsets.put(tgtString.length() + "", annot);
      tgtString += getText(annot, tgtDocument);
      tenOffsets.put(tgtString.length() + "", annot);
    }

    for(String s : _cache.keySet()) {
      List<Set<Annotation>> srcGroups = new ArrayList<Set<Annotation>>();
      List<Set<Annotation>> tgtGroups = new ArrayList<Set<Annotation>>();

      Pattern p = Pattern.compile(s, Pattern.UNICODE_CASE);
      Matcher m = p.matcher(srcString);
      
      int startIndex = 0;

      //there can be multiple matches
      whileLoop:while(m.find(startIndex)) {
        
        int start = m.start(0);
        int end = start + m.group(0).length();

        startIndex = end;
        // find the firstToken using start as a startOffset
        // find the lastToken using end as a endOffset
        Annotation aFirstToken = sstOffsets.get("" + start);
        if(aFirstToken == null) continue;

        Annotation aLastToken = senOffsets.get("" + end);
        if(aLastToken == null) continue;

        // if any of the tokens are aligned, continue with the next match
        int sst = srcTokens.indexOf(aFirstToken);
        int sen = srcTokens.indexOf(aLastToken);

        Set<Annotation> sGrp = new HashSet<Annotation>();
        
        for(int k = sst; k <= sen; k++) {
          Annotation sAnnot = srcTokens.get(k);
          if(alignment.isAnnotationAligned(sAnnot)) continue whileLoop;
          sGrp.add(sAnnot);
        }
        srcGroups.add(sGrp);
      }

      // multiple target strings for the source string
      Set<String> tStrings = _cache.get(s);
      for(String t : tStrings) {
        Pattern tPat = Pattern.compile(t, Pattern.UNICODE_CASE);
        Matcher tMat = tPat.matcher(tgtString);
        int tStartIndex = 0;
        
        whileLoop: while(tMat.find(tStartIndex)) {
          int tStart = tMat.start(0);
          int tEnd = tStart + tMat.group(0).length();
          tStartIndex = tEnd;

          Annotation tFirstToken = tstOffsets.get("" + tStart);
          Annotation tLastToken = tenOffsets.get("" + tEnd);
          if(tFirstToken == null || tLastToken == null) {
            tStartIndex = tStart + 1;
            continue whileLoop;
          }

          int tst = tgtTokens.indexOf(tFirstToken);
          int ten = tgtTokens.indexOf(tLastToken);
          
          Set<Annotation> tGrp = new HashSet<Annotation>();
          
          for(int k = tst; k <= ten; k++) {
            Annotation tAnnot = tgtTokens.get(k);
            if(alignment.isAnnotationAligned(tAnnot)) continue whileLoop;
            tGrp.add(tAnnot);
          }

          tgtGroups.add(tGrp);
        }
      }
      
      for(Set<Annotation> sGrp : srcGroups) {
        for(Set<Annotation> tGrp : tgtGroups) {
          for(Annotation sAnn : sGrp) {
            sAnn.getFeatures().put(Alignment.ALIGNMENT_METHOD_FEATURE_NAME, "_cache");
            for(Annotation tAnn : tGrp) {
              alignment.align(sAnn, srcAS, srcDocument, tAnn, tgtAS, tgtDocument);
              tAnn.getFeatures().put(Alignment.ALIGNMENT_METHOD_FEATURE_NAME, "_cache");
            }
          }
        }
      }
    }
    
  }

  public void execute(AlignmentEditor editor, CompoundDocument document,
          Document srcDocument, String srcAS,
          Set<Annotation> srcAlignedAnnotations, Document tgtDocument,
          String tgtAS, Set<Annotation> tgtAlignedAnnotations,
          Annotation clickedAnnotation) throws AlignmentException {

    if(srcAlignedAnnotations == null || srcAlignedAnnotations.isEmpty())
      return;
    if(tgtAlignedAnnotations == null || tgtAlignedAnnotations.isEmpty())
      return;

    AnnotationSet sAS = srcAS == null ? srcDocument.getAnnotations() : srcDocument.getAnnotations(srcAS);
    String sourceText = getText(srcAlignedAnnotations, srcDocument, sAS);
    int srcLen = lengthInNoOfWords;
    if(srcMinLen > srcLen) {
      srcMinLen = srcLen;
    }

    AnnotationSet tAS = srcAS == null ? tgtDocument.getAnnotations() : tgtDocument.getAnnotations(tgtAS);
    String targetText = getText(tgtAlignedAnnotations, tgtDocument, tAS);
    int tgtLen = lengthInNoOfWords;
    if(tgtMinLen > tgtLen) {
      tgtMinLen = tgtLen;
    }

    if(_cache.containsKey(sourceText)) {
      if(_cache.get(sourceText).contains(targetText)) { return; }
    }

    SortedSet<String> sourceTexts = _cache.get(sourceText);
    if(sourceTexts == null) {
      sourceTexts = new TreeSet<String>(new Comparator<String>() {
        public int compare(String s1, String s2) {
          String[] s1Array = s1.split("[ ]+");
          String[] s2Array = s2.split("[ ]+");
          return s2Array.length - s1Array.length;
        }
      });
      _cache.put(sourceText, sourceTexts);
    }
    if(!sourceTexts.contains(targetText)) {
      sourceTexts.add(targetText);  
      try {
        if(bw == null) {
          init(suppliedArgs);
        }
        bw.write(sourceText + "\t" + targetText + "\t" + srcLen + "\t" + tgtLen);
        bw.newLine();
        bw.flush();
      } catch(IOException ioe) {
        throw new AlignmentException(ioe);
      } catch(AlignmentActionInitializationException e) {
        throw new AlignmentException(e);
      }
    }
  }

}