Log in Help
Print
Homereleasesgate-8.4-build5748-ALLpluginsToolssrcgateconfigurableexporter 〉 ConfigurableExporter.java
 
/*
 *  ConfigurableExporter.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *  
 *  Genevieve Gorrell, 23 Jul 2012
 *  
 *  $Id: ConfigurableExporter.java 16003 2012-08-09 11:03:12Z ggorrell $
 */

package gate.configurableexporter;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.ProcessingResource;
import gate.Resource;
import gate.Utils;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.io.Serializable;
import java.util.Iterator;
import java.util.List;




/**
 *  Configurable Exporter takes a configuration file specifying the
 *  format of the output file. The configuration file consists of a
 *  single line specifying output format with annotation names 
 *  surrounded by three angle brackets. E.g.<br>
 *  
 *  <pre>
 *  {index}, {class}, "{content}"
 *  </pre>
 *  
 *  might result in an output file something like
 *  <pre>
 *  10000004, A, "Some text .."
 *  10000005, A, "Some more text .."
 *  10000006, B, "Further text .."
 *  10000007, B, "Additional text .."
 *  10000008, B, "Yet more text .."
 *  </pre>
 *  Annotation features can also be specified using dot notation,
 *  for example;
 *  <pre>
 *  {index}, {instance.class}, "{content}"
 *  </pre>
 *  The PR is useful for outputting data for use in machine learning,
 *  and so each line is considered an "instance". Instance is specified
 *  at run time, and by default is a document, but might be an
 *  annotation type. Instances are output one per line and the config
 *  file specifies how to output each instance. Annotations included
 *  in the output file are the first incidence of the specified type in
 *  the instance. If there is ever a need for it I might fix it so you
 *  can output more than one incidence of the same annotation type.
 *
 */
@CreoleResource(name = "Configurable Exporter", 
  comment = "Allows annotations to be exported according to a specified format.")
public class ConfigurableExporter extends AbstractLanguageAnalyser implements
                                                    ProcessingResource,
                                                    Serializable {

  /**
   * 
   */
  private static final long serialVersionUID = -7237223509897088067L;

  /**
   * The ConfigurableExporter configuration file.
   * 
   */
  private java.net.URL configFileURL;

  /**
   * The file to export the data to.
   * 
   */
  private java.net.URL outputURL;

  /**
   * The annotation set from which to draw the annotations.
   * 
   */
  private String inputASName;

  /**
   * The annotation type to be treated as instance.
   * 
   */
  private String instanceName;
  
  private int numberOfAnnotationSlots = -1;
  private int numberOfBridgeTextSlots = -1;
  private static int maxslotsno = 50;
  private String[][] annsToInsert = new String[maxslotsno][2];
  private String[] bridges = new String[maxslotsno];

  private PrintStream outputStream = System.out;

  @CreoleParameter(comment = "The configuration file specifying output format.",
      defaultValue="resources/configurableexporter/example.conf", 
      suffixes=".conf")
  public void setConfigFileURL(java.net.URL configFileURL) {
    this.configFileURL = configFileURL;
  }

  public java.net.URL getConfigFileURL() {
    return configFileURL;
  }


  @RunTime
  @Optional
  @CreoleParameter(comment = "The file to which data will be output. Leave " +
  		"blank for output to messages tab or standard out.")
  public void setOutputURL(java.net.URL output) {
    this.outputURL = output;
    outputStream = System.out;
    if(outputURL!=null){
    	try {
    		outputStream = new PrintStream(this.outputURL.getFile());
    	} catch (Exception e){
    		e.printStackTrace();
    	}
    }
  }

  public java.net.URL getOutputURL() {
    return this.outputURL;
  }


  @RunTime
  @Optional
  @CreoleParameter(comment = "The name for annotation set used as input to " +
  		"the exporter.")
  public void setInputASName(String iasn) {
    this.inputASName = iasn;
  }

  public String getInputASName() {
    return this.inputASName;
  }

  
  @RunTime
  @Optional
  @CreoleParameter(comment = "The annotation type to be treated as instance. " +
  		"Leave blank to use document as instance.")
  public void setInstanceName(String inst) {
    this.instanceName = inst;
  }

  public String getInstanceName() {
    return this.instanceName;
  }

  @Override
  public Resource init() throws ResourceInstantiationException {
    this.numberOfAnnotationSlots = -1;
    this.numberOfBridgeTextSlots = -1;
    this.annsToInsert = new String[maxslotsno][2];
    this.bridges = new String[maxslotsno];

    if(configFileURL == null) throw new IllegalArgumentException(
        "No value provided for the configFileURL parameter");
    
    String strLine = null;
    try {
      BufferedReader in =
          new BufferedReader(new InputStreamReader(configFileURL.openStream()));
      if((strLine = in.readLine()) != null) {
        int upto = 0;
        int thisSlot = 0;
        while(upto < strLine.length() && thisSlot < maxslotsno) {
          int startAnnoName = strLine.indexOf("{", upto);
          int endAnnoName = strLine.indexOf("}", upto);
          if(startAnnoName == -1 && endAnnoName == -1) {
            bridges[thisSlot] = strLine.substring(upto, strLine.length());
            this.numberOfBridgeTextSlots = thisSlot + 1;
            upto = strLine.length();
          } else if((startAnnoName == -1 && endAnnoName != -1)
              || (startAnnoName != -1 && endAnnoName == -1)) {
            throw new ResourceInstantiationException(
                "Failed to parse configuration file for Configurable " +
                "Exporter.");
          } else {
            bridges[thisSlot] = strLine.substring(upto, startAnnoName);
            String thisAnnToInsert =
                strLine.substring(startAnnoName + 1, endAnnoName);
            String[] annotationTypeAndFeature = thisAnnToInsert.split("\\.", 2);
            this.annsToInsert[thisSlot][0] = annotationTypeAndFeature[0];
            if(annotationTypeAndFeature.length == 2) {
              this.annsToInsert[thisSlot][1] = annotationTypeAndFeature[1];
            } else {
              this.annsToInsert[thisSlot][1] = null;
            }
            upto = endAnnoName + 1;
            this.numberOfAnnotationSlots = thisSlot + 1;
            this.numberOfBridgeTextSlots = thisSlot + 1;
          }
          thisSlot++;
        }
      }
    } catch(Exception e) {
      System.out.println("Failed to access configuration file.");
      e.printStackTrace();
    }
    return this;
  }

  @Override
  public void execute() throws ExecutionException {
    Document doc = getDocument();

    // Get the output annotation set
    AnnotationSet inputAS = null;
    if(inputASName == null || inputASName.equals("")) {
      inputAS = doc.getAnnotations();
    } else {
      inputAS = doc.getAnnotations(inputASName);
    }

    List<Annotation> instances = null;
    if(instanceName == null || instanceName.equals("")) {
      // There is no instance name so we will create one instance (line) per
      // document.
      // Here, we find the first annotation of the right type for each slot in
      // the config file.
      for(int i = 0; i < this.numberOfAnnotationSlots; i++) {
        this.outputStream.print(this.bridges[i]);
        // Check the annotation type isn't null
        if(annsToInsert[i][0] != null) {
          List<Annotation> typedAnnotations = Utils.inDocumentOrder(
              inputAS.get(this.annsToInsert[i][0]));
	 if(typedAnnotations.size() > 0) {
          Annotation annotationToPrint = typedAnnotations.get(0);
          if(this.annsToInsert[i][1] != null) {
            this.outputStream.print(annotationToPrint.getFeatures().get(
                this.annsToInsert[i][1]));
          } else {
            // We have no feature to print so we will just print the text
            long startNode = annotationToPrint.getStartNode().getOffset();
            long endNode = annotationToPrint.getEndNode().getOffset();
            String annotationText = "";
            try {
              annotationText =
                  doc.getContent().getContent(startNode, endNode).toString();
            } catch(Exception e) {
              e.printStackTrace();
            }
            this.outputStream.print(annotationText);
          }
	 }
        }
      }
      if(this.numberOfBridgeTextSlots > this.numberOfAnnotationSlots) {
        this.outputStream.print(this.bridges[this.numberOfBridgeTextSlots - 1]);
      }
      this.outputStream.println();
    } else {
      // We have an instance type so we will create one output line per
      // instance.
      instances = Utils.inDocumentOrder(inputAS.get(this.instanceName));
      
      Iterator<Annotation> instanceAnnotationsIterator = instances.iterator();
      while(instanceAnnotationsIterator.hasNext()) {
        Annotation thisInstanceAnnotation = instanceAnnotationsIterator.next();
        long startSearch = thisInstanceAnnotation.getStartNode().getOffset();
        long endSearch = thisInstanceAnnotation.getEndNode().getOffset();

        // Here, we find the first annotation of the right type within the span
        // of the
        // instance annotation for each slot in the config file.
        for(int i = 0; i < this.numberOfAnnotationSlots; i++) {
          this.outputStream.print(this.bridges[i]);
          List<Annotation> typedAnnotations = Utils.inDocumentOrder(
              inputAS.get(this.annsToInsert[i][0], startSearch, endSearch));
          if(typedAnnotations.size() > 0) {
            Annotation annotationToPrint = typedAnnotations.get(0);
            if(this.annsToInsert[i][1] != null) {
              this.outputStream.print(annotationToPrint.getFeatures().get(
                  this.annsToInsert[i][1]));
            } else {
              // We have no feature to print so we will just print the text
              long startNode = annotationToPrint.getStartNode().getOffset();
              long endNode = annotationToPrint.getEndNode().getOffset();
              String annotationText = "";
              try {
                annotationText =
                    doc.getContent().getContent(startNode, endNode).toString();
              } catch(Exception e) {
                e.printStackTrace();
              }
              this.outputStream.print(annotationText);
            }            
          }
        }
        if(this.numberOfBridgeTextSlots > this.numberOfAnnotationSlots) {
          this.outputStream
              .print(this.bridges[this.numberOfBridgeTextSlots - 1]);
        }
        this.outputStream.println();
      }
    }

  }
  

  @Override
  public synchronized void interrupt() {
    super.interrupt();
  }

}