Log in Help
Print
Home / releases / gate-8.4-build5748-ALL / plugins / Alignment / src / gate / alignment 〉 ExportAlignmentPR.java
 
package gate.alignment;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.ProcessingResource;
import gate.Resource;
import gate.Utils;
import gate.compound.CompoundDocument;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.OffsetComparator;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * A PR to export alignment information in an xml file.
 * <p>
 * The PR expects the current document to be a {@link CompoundDocument}. For
 * the configured source and target member documents it writes one UTF-8 file
 * (named after the compound document) into the output directory. The file has
 * the following shape:
 * 
 * <pre>
 * &lt;Document&gt;
 *   &lt;PUPair&gt;
 *     &lt;Source&gt; [&lt;parent&gt;] &lt;unit id="N"&gt;text&lt;/unit&gt;... [&lt;/parent&gt;] &lt;/Source&gt;
 *     &lt;Target&gt; [&lt;parent&gt;] &lt;unit id="N"&gt;text&lt;/unit&gt;... [&lt;/parent&gt;] &lt;/Target&gt;
 *     &lt;Alignment&gt; &lt;Alignment source="N" target="M"/&gt;... &lt;/Alignment&gt;
 *   &lt;/PUPair&gt;
 *   ...
 * &lt;/Document&gt;
 * </pre>
 * 
 * where <code>unit</code> and <code>parent</code> are the annotation types
 * given by the <code>unitOfAlignment</code> and
 * <code>parentOfUnitOfAlignment</code> parameters.
 * 
 * @author niraj
 * 
 */
@CreoleResource(comment = "A PR to export alignment information in an xml file.")
public class ExportAlignmentPR extends AbstractLanguageAnalyser implements
                                                               ProcessingResource {

  private static final long serialVersionUID = 4755458725235429653L;

  /**
   * Directory where the resulting files should be stored to.
   */
  private URL outputDirectory;

  /**
   * File object obtained from the outputDirectory
   */
  private File directory;

  /**
   * name of the document feature that has the information about
   * alignments of the alignment units.
   */
  private String unitAlignmentFeatureName;

  /**
   * name of the document feature that has the information about
   * alignments of the parent of the alignment units.
   */
  private String parentOfUnitOfAlignmentFeatureName;

  /**
   * annotation type that has been used as parent of unit of alignment.
   */
  private String parentOfUnitOfAlignment;

  /**
   * annotation type that has been used as unit of alignment.
   */
  private String unitOfAlignment;

  /**
   * id of the source document
   */
  private String sourceDocumentID;

  /**
   * id of the target document
   */
  private String targetDocumentID;

  /**
   * name of the annotation set which has annotations for the unit of
   * alignment and parent of unit of alignment
   */
  private String inputASName;

  /** Initialise this resource, and return it. */
  public Resource init() throws ResourceInstantiationException {
    return this;
  }

  /* this method is called to re-initialise the resource */
  public void reInit() throws ResourceInstantiationException {
    // re-initialisation is the same as initialisation
    init();
  }

  /**
   * Internal class to store information about a pair of parent of unit
   * of alignment. A pair whose sets are both empty stands for "use the
   * whole document as the parent".
   * 
   * @author niraj
   * 
   */
  class PUPair {
    /**
     * Source annotations
     */
    Set<Annotation> srcAnnots = new HashSet<Annotation>();

    /**
     * Target annotations.
     */
    Set<Annotation> tgtAnnots = new HashSet<Annotation>();
  }

  /**
   * Called when user clicks on the execute button. The main logic of
   * the PR: validates the inputs, pairs up the parent annotations, builds
   * the whole xml in memory and only then writes it to a new file in the
   * output directory. Because the file is opened last, a failure while
   * collecting annotations neither leaks a writer nor leaves an empty
   * file behind.
   * 
   * @throws ExecutionException if no compound document is set, the output
   *           directory is unusable, the source or target member
   *           document cannot be found, or the file cannot be written.
   */
  public void execute() throws ExecutionException {

    // "null instanceof X" is false, so a missing document has to be
    // reported separately before its name is dereferenced
    if(this.document == null) {
      throw new ExecutionException("no document provided");
    }

    // check if the right document provided
    if(!(this.document instanceof CompoundDocument)) {
      throw new ExecutionException(this.document.getName()
              + " not instance of CompoundDocument");
    }

    // document is a compound document
    CompoundDocument cd = (CompoundDocument)this.document;

    // checking for the output directory. if doesn't exist, it is
    // created
    directory = resolveOutputDirectory();

    // obtain the source and the target documents and their input sets
    Document srcDoc = memberDocument(cd, sourceDocumentID, "source");
    Document tgtDoc = memberDocument(cd, targetDocumentID, "target");
    AnnotationSet srcAS = inputAnnotations(srcDoc);
    AnnotationSet tgtAS = inputAnnotations(tgtDoc);

    // this is where we store pairs of parent annotations
    List<PUPair> puPairs = collectParentPairs(cd, srcAS, tgtAS);

    // build the complete document in memory, one pair at a time
    StringBuilder xml = new StringBuilder();
    xml.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
    xml.append("<Document>\n");
    for(PUPair puPair : puPairs) {
      appendPair(xml, cd, puPair, srcDoc, srcAS, tgtDoc, tgtAS);
    }
    xml.append("</Document>");

    // name of the file is the name of the document
    File fileToOutput = pickOutputFile(directory, this.document.getName());
    writeToFile(fileToOutput, xml.toString());
  }

  /**
   * Turns the outputDirectory URL into a directory on disk, creating it
   * (and any missing ancestors) if it does not exist yet.
   * 
   * @return the existing directory to write into.
   * @throws ExecutionException if the parameter is unset, the URL cannot
   *           be converted to a file, the path exists but is not a
   *           directory, or the directory cannot be created.
   */
  private File resolveOutputDirectory() throws ExecutionException {
    if(outputDirectory == null) {
      throw new ExecutionException("outputDirectory is not set");
    }

    File dir;
    try {
      dir = new File(outputDirectory.toURI());
    }
    catch(URISyntaxException e) {
      throw new ExecutionException(e);
    }
    catch(IllegalArgumentException e) {
      // File(URI) rejects anything that is not a hierarchical "file:" URI
      throw new ExecutionException(e);
    }

    if(!dir.exists()) {
      // mkdirs() also returns false when somebody else created the
      // directory in the meantime, hence the second check
      if(!dir.mkdirs() && !dir.isDirectory()) {
        throw new ExecutionException(outputDirectory.toString()
                + " could not be created");
      }
    }
    else if(!dir.isDirectory()) {
      throw new ExecutionException(outputDirectory.toString()
              + " does not refer to a directory");
    }
    return dir;
  }

  /**
   * Looks up a member of the compound document and fails with a readable
   * message instead of a later NullPointerException when the lookup
   * yields null.
   * 
   * @param cd the compound document.
   * @param id id of the member document.
   * @param role "source" or "target", used in the error message only.
   */
  private Document memberDocument(CompoundDocument cd, String id, String role)
          throws ExecutionException {
    Document member = cd.getDocument(id);
    if(member == null) {
      throw new ExecutionException("no " + role + " document with id \"" + id
              + "\" in " + this.document.getName());
    }
    return member;
  }

  /**
   * Returns the annotation set named by inputASName, or the default
   * annotation set when that parameter is null or blank.
   */
  private AnnotationSet inputAnnotations(Document doc) {
    return isBlank(inputASName) ? doc.getAnnotations() : doc
            .getAnnotations(inputASName);
  }

  /**
   * Decides which parent annotations of the source document belong to
   * which parent annotations of the target document.
   * <ul>
   * <li>no parent alignment feature and no parent type: one empty pair,
   * i.e. the whole documents are used as parents;</li>
   * <li>no parent alignment feature but a parent type: the parent
   * annotations of both documents are paired in offset order, up to the
   * size of the shorter list;</li>
   * <li>parent alignment feature given: one pair per aligned source
   * parent, built from the alignment stored under that feature.</li>
   * </ul>
   */
  private List<PUPair> collectParentPairs(CompoundDocument cd,
          AnnotationSet srcAS, AnnotationSet tgtAS) {
    List<PUPair> puPairs = new ArrayList<PUPair>();

    // parent of unit of alignment is optional. if user hasn't provided
    // it use the document as the parent instead.
    boolean docAsParent = isBlank(parentOfUnitOfAlignment);

    // if parentOfUnitOfAlignmentFeatureName is not provided, we
    // consider annotations in sequence
    if(isBlank(parentOfUnitOfAlignmentFeatureName)) {
      if(docAsParent) {
        // empty one
        puPairs.add(new PUPair());
        return puPairs;
      }

      List<Annotation> srcParents = new ArrayList<Annotation>(
              srcAS.get(parentOfUnitOfAlignment));
      List<Annotation> tgtParents = new ArrayList<Annotation>(
              tgtAS.get(parentOfUnitOfAlignment));
      Collections.sort(srcParents, new OffsetComparator());
      Collections.sort(tgtParents, new OffsetComparator());

      int count = Math.min(srcParents.size(), tgtParents.size());
      for(int i = 0; i < count; i++) {
        PUPair pair = new PUPair();
        pair.srcAnnots.add(srcParents.get(i));
        pair.tgtAnnots.add(tgtParents.get(i));
        puPairs.add(pair);
      }
      return puPairs;
    }

    AnnotationSet parentSrcAS = docAsParent ? srcAS : srcAS
            .get(parentOfUnitOfAlignment);
    Alignment puAlignment = cd
            .getAlignmentInformation(parentOfUnitOfAlignmentFeatureName);

    // work on a private copy so that retainAll can never modify a set
    // owned by the Alignment object; also tolerates a null result
    Set<Annotation> alignedSrcParents = new HashSet<Annotation>();
    Set<Annotation> allAligned = puAlignment.getAlignedAnnotations();
    if(allAligned != null) {
      alignedSrcParents.addAll(allAligned);
    }
    alignedSrcParents.retainAll(parentSrcAS);

    // NOTE(review): two source parents aligned to the same target still
    // produce two identical pairs, as before - confirm whether intended.
    for(Annotation srcPU : alignedSrcParents) {
      Set<Annotation> tgtPUs = puAlignment.getAlignedAnnotations(srcPU);
      // same policy as for unit alignments: nothing aligned, nothing to
      // export. An empty pair would otherwise be read as "whole document".
      if(tgtPUs == null || tgtPUs.isEmpty()) {
        continue;
      }

      PUPair pair = new PUPair();
      // srcPU is aligned to tgtPUs by construction; seeding the set keeps
      // the source side non-empty even if the reverse lookup finds nothing
      pair.srcAnnots.add(srcPU);
      for(Annotation tgtPU : tgtPUs) {
        Set<Annotation> alignedBack = puAlignment.getAlignedAnnotations(tgtPU);
        if(alignedBack != null) {
          pair.srcAnnots.addAll(alignedBack);
        }
      }
      pair.tgtAnnots.addAll(tgtPUs);
      puPairs.add(pair);
    }
    return puPairs;
  }

  /**
   * Appends one complete &lt;PUPair&gt; element (source units, target
   * units and the alignments between them) to the xml.
   */
  private void appendPair(StringBuilder xml, CompoundDocument cd,
          PUPair puPair, Document srcDoc, AnnotationSet srcAS,
          Document tgtDoc, AnnotationSet tgtAS) {
    xml.append("<PUPair>\n");

    // units written for this pair; alignments are restricted to them
    Set<Annotation> srcUAnnots = new HashSet<Annotation>();
    Set<Annotation> tgtUAnnots = new HashSet<Annotation>();

    // first source annotations
    xml.append("\n<Source>\n");
    appendSide(xml, srcDoc, srcAS, puPair.srcAnnots, srcUAnnots);
    xml.append("\n</Source>\n");

    // then target annots
    xml.append("\n<Target>\n");
    appendSide(xml, tgtDoc, tgtAS, puPair.tgtAnnots, tgtUAnnots);
    xml.append("\n</Target>\n");

    // its time for providing alignment information
    xml.append("\n<Alignment>\n");
    appendAlignments(xml, cd, srcUAnnots, tgtUAnnots);
    xml.append("\n</Alignment>\n");

    xml.append("\n</PUPair>\n");
  }

  /**
   * Appends the unit annotations of one side (source or target) of a
   * pair. With no parents every unit annotation of the input set is
   * written; otherwise the units contained in each parent are written
   * inside an element named after the parent type.
   * 
   * @param collected receives every unit annotation that was written.
   */
  private void appendSide(StringBuilder xml, Document doc,
          AnnotationSet inputAS, Set<Annotation> parents,
          Set<Annotation> collected) {
    if(parents.isEmpty()) {
      appendUnits(xml, doc, inputAS.get(unitOfAlignment), collected);
      return;
    }

    for(Annotation puAnnot : parents) {
      xml.append("<").append(parentOfUnitOfAlignment).append(">");
      AnnotationSet contained = inputAS.getContained(
              puAnnot.getStartNode().getOffset(),
              puAnnot.getEndNode().getOffset()).get(unitOfAlignment);
      appendUnits(xml, doc, contained, collected);
      xml.append("</").append(parentOfUnitOfAlignment).append(">");
    }
  }

  /**
   * Appends one element per unit annotation, in offset order, carrying
   * the annotation id as attribute and the (xml escaped) document text
   * covered by the annotation as content.
   */
  private void appendUnits(StringBuilder xml, Document doc,
          AnnotationSet units, Set<Annotation> collected) {
    List<Annotation> ordered = new ArrayList<Annotation>(units);
    Collections.sort(ordered, new OffsetComparator());

    for(Annotation unit : ordered) {
      xml.append("<").append(unitOfAlignment).append(" id=\"")
              .append(unit.getId().intValue()).append("\">");
      // document text may contain markup characters; written raw they
      // would make the exported file malformed
      xml.append(escapeXmlText(Utils.stringFor(doc, unit)));
      xml.append("</").append(unitOfAlignment).append(">");
    }
    collected.addAll(ordered);
  }

  /**
   * Appends the &lt;Alignment source=".." target=".."/&gt; entries for the
   * given units. For every aligned source unit, all source units sharing
   * its targets are combined with all of those targets; both sides are
   * restricted to the units written for the current pair. Each
   * source/target combination is written once only.
   */
  private void appendAlignments(StringBuilder xml, CompoundDocument cd,
          Set<Annotation> srcUAnnots, Set<Annotation> tgtUAnnots) {
    Alignment uAlignment = cd
            .getAlignmentInformation(unitAlignmentFeatureName);

    // several source units of one group would otherwise each emit the
    // same cross product again
    Set<String> alreadyWritten = new HashSet<String>();

    for(Annotation srcUAnnot : srcUAnnots) {
      Set<Annotation> aligned = uAlignment.getAlignedAnnotations(srcUAnnot);
      if(aligned == null || aligned.isEmpty()) {
        continue;
      }

      // private copy: retainAll below must not touch the set handed out
      // by the Alignment object
      Set<Annotation> tgtAlignedAnnots = new HashSet<Annotation>(aligned);

      Set<Annotation> srcAlignedAnnots = new HashSet<Annotation>();
      for(Annotation tgtUAnnot : tgtAlignedAnnots) {
        Set<Annotation> alignedBack = uAlignment
                .getAlignedAnnotations(tgtUAnnot);
        if(alignedBack != null) {
          srcAlignedAnnots.addAll(alignedBack);
        }
      }

      srcAlignedAnnots.retainAll(srcUAnnots);
      tgtAlignedAnnots.retainAll(tgtUAnnots);

      for(Annotation srcAA : srcAlignedAnnots) {
        for(Annotation tgtAA : tgtAlignedAnnots) {
          String entry = "<Alignment source=\"" + srcAA.getId().intValue()
                  + "\" target=\"" + tgtAA.getId().intValue() + "\"/>\n";
          if(alreadyWritten.add(entry)) {
            xml.append(entry);
          }
        }
      }
    }
  }

  /**
   * Picks a file in the given directory that does not exist yet: the
   * plain name if it is free, otherwise the name followed by the first
   * free counter (1, 2, ...).
   */
  private static File pickOutputFile(File dir, String fileName) {
    File candidate = new File(dir, fileName);
    // if file is already there, create a new one
    for(int suffix = 1; candidate.exists(); suffix++) {
      candidate = new File(dir, fileName + suffix);
    }
    return candidate;
  }

  /**
   * Writes the content to the file as UTF-8 and closes the file in every
   * case.
   * 
   * @throws ExecutionException wrapping any IOException raised while
   *           opening, writing or closing the file.
   */
  private static void writeToFile(File target, String content)
          throws ExecutionException {
    BufferedWriter bw = null;
    boolean closed = false;
    try {
      bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(
              target), "UTF-8"));
      bw.write(content);
      // closing flushes the buffer, so a failure here means lost data
      // and has to be reported like a failed write
      bw.close();
      closed = true;
    }
    catch(IOException ioe) {
      throw new ExecutionException(ioe);
    }
    finally {
      if(bw != null && !closed) {
        try {
          bw.close();
        }
        catch(IOException ignored) {
          // the failure that brought us here is already being reported
        }
      }
    }
  }

  /**
   * Escapes the characters that are not allowed verbatim in xml element
   * content (&amp;, &lt; and &gt;). A null text yields an empty string.
   */
  private static String escapeXmlText(String text) {
    if(text == null) {
      return "";
    }
    StringBuilder escaped = new StringBuilder(text.length() + 16);
    for(int i = 0; i < text.length(); i++) {
      char c = text.charAt(i);
      switch(c) {
        case '&':
          escaped.append("&amp;");
          break;
        case '<':
          escaped.append("&lt;");
          break;
        case '>':
          escaped.append("&gt;");
          break;
        default:
          escaped.append(c);
      }
    }
    return escaped.toString();
  }

  /**
   * Tells whether an optional string parameter should be treated as "not
   * provided", i.e. is null or contains nothing but whitespace.
   */
  private static boolean isBlank(String value) {
    return value == null || value.trim().length() == 0;
  }

  /** @return the directory the xml files are written to. */
  public URL getOutputDirectory() {
    return outputDirectory;
  }

  /** @param outputFile the directory the xml files are written to. */
  @RunTime
  @CreoleParameter
  public void setOutputDirectory(URL outputFile) {
    this.outputDirectory = outputFile;
  }

  /** @return annotation type used as parent of unit of alignment. */
  public String getParentOfUnitOfAlignment() {
    return parentOfUnitOfAlignment;
  }

  /**
   * @param parentOfUnitOfAlignment annotation type used as parent of unit
   *          of alignment; when null or blank the whole document is used
   *          as the parent.
   */
  @RunTime
  @Optional
  @CreoleParameter(defaultValue = "Sentence")
  public void setParentOfUnitOfAlignment(String parentOfUnitOfAlignment) {
    this.parentOfUnitOfAlignment = parentOfUnitOfAlignment;
  }

  /** @return annotation type used as unit of alignment. */
  public String getUnitOfAlignment() {
    return unitOfAlignment;
  }

  /** @param unitOfAlignment annotation type used as unit of alignment. */
  @RunTime
  @CreoleParameter(defaultValue = "Token")
  public void setUnitOfAlignment(String unitOfAlignment) {
    this.unitOfAlignment = unitOfAlignment;
  }

  /** @return name of the feature holding the unit alignments. */
  public String getUnitAlignmentFeatureName() {
    return unitAlignmentFeatureName;
  }

  /**
   * @param unitAlignmentFeatureName name of the feature holding the unit
   *          alignments.
   */
  @RunTime
  @CreoleParameter(defaultValue = "word-alignment")
  public void setUnitAlignmentFeatureName(String unitAlignmentFeatureName) {
    this.unitAlignmentFeatureName = unitAlignmentFeatureName;
  }

  /** @return name of the feature holding the parent alignments. */
  public String getParentOfUnitOfAlignmentFeatureName() {
    return parentOfUnitOfAlignmentFeatureName;
  }

  /**
   * @param parentOfUnitOfAlignmentFeatureName name of the feature holding
   *          the parent alignments; when null or blank the parent
   *          annotations are paired in document order.
   */
  @RunTime
  @Optional
  @CreoleParameter(defaultValue = "sentence-alignment")
  public void setParentOfUnitOfAlignmentFeatureName(
          String parentOfUnitOfAlignmentFeatureName) {
    this.parentOfUnitOfAlignmentFeatureName = parentOfUnitOfAlignmentFeatureName;
  }

  /** @return id of the source document. */
  public String getSourceDocumentID() {
    return sourceDocumentID;
  }

  /** @param sourceDocumentID id of the source document. */
  @RunTime
  @CreoleParameter
  public void setSourceDocumentID(String sourceDocumentID) {
    this.sourceDocumentID = sourceDocumentID;
  }

  /** @return id of the target document. */
  public String getTargetDocumentID() {
    return targetDocumentID;
  }

  /** @param targetDocumentID id of the target document. */
  @RunTime
  @CreoleParameter
  public void setTargetDocumentID(String targetDocumentID) {
    this.targetDocumentID = targetDocumentID;
  }

  /** @return name of the annotation set that is read. */
  public String getInputASName() {
    return inputASName;
  }

  /**
   * @param inputASName name of the annotation set that is read; null or
   *          blank selects the default annotation set.
   */
  @RunTime
  @Optional
  @CreoleParameter(defaultValue = "")
  public void setInputASName(String inputASName) {
    this.inputASName = inputASName;
  }
}