GATE.ac.uk - releases/gate-5.1-beta2-build3402-ALL/plugins/Parser

package gate.stanford;

import edu.stanford.nlp.parser.lexparser.*;
import edu.stanford.nlp.trees.*;
import gate.*;
import gate.util.*;
import gate.creole.*;
import gate.creole.metadata.*;

import java.io.*;
import java.net.*;
import java.util.*;

import edu.stanford.nlp.ling.*;

/**
 * GATE PR wrapper around the Stanford Parser. This class expects to find Token
 * and Sentence annotations (such as those created by the ANNIE tokenizer and
 * splitter) already in the inputAS and transforms them into suitable data
 * structures, which it feeds to the LexicalizedParser. The parser's output can
 * be stored in the outputAS in various ways, controlled by CREOLE run-time
 * parameters.
 * @author adam
 */
@CreoleResource(name = "StanfordParser", comment = "Stanford parser wrapper",
        helpURL = "http://gate.ac.uk/userguide/sec:parsers:stanford")
public class Parser extends AbstractLanguageAnalyser 
implements ProcessingResource {

  private static final long serialVersionUID = -3062171258011850283L;

  protected edu.stanford.nlp.parser.lexparser.LexicalizedParser stanfordParser;

  /* Type "SyntaxTreeNode" with feature "cat" is compatible with the 
   * classic SyntaxTreeViewer.  */
  private static final String OUTPUT_PHRASE_TYPE   = "SyntaxTreeNode" ;
  private static final String PSG_TAG_FEATURE      = "cat" ;
  
  /* But "category" feature is compatible with the ANNIE POS tagger.  */
  private static final String  POS_TAG_FEATURE    = ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME;
  private static final String  inputSentenceType  = ANNIEConstants.SENTENCE_ANNOTATION_TYPE;
  private static final String  inputTokenType     = ANNIEConstants.TOKEN_ANNOTATION_TYPE;

  private static final String DEP_ANNOTATION_TYPE   = "Dependency";
  private static final String DEP_ARG_FEATURE       = "args";
  private static final String DEP_LABEL_FEATURE     = "kind"; 

  protected String                         annotationSetName;
  protected AnnotationSet                  annotationSet;
  protected gate.Document                  document;
  private   URL                            parserFile;
  protected boolean                        debugMode;
  private   boolean                        reusePosTags;

  private OffsetComparator                 offsetComparator;
  private Map<String, String>              tagMap;
  protected StanfordSentence               stanfordSentence;
  protected GrammaticalStructureFactory gsf;
  

  /*  CREOLE parameters for optional mapping  */
  private boolean                          useMapping = false; 
  private URL                              mappingFileURL;
  
  /*  internal variables for mapping  */
  private File                             mappingFile;
  private boolean                          mappingLoaded = false;
  
  /*  CREOLE parameters: what are we going to annotate, and how?  */
  private boolean                          addConstituentAnnotations;
  private boolean                          addDependencyFeatures;
  private boolean                          addDependencyAnnotations;
  private boolean                          addPosTags;


  

  /**
   * The {@link TreebankLangParserParams} implementation to use. This is
   * where we get the language pack, and then the
   * {@link GrammaticalStructureFactory} used to extract the
   * dependencies from the parse. In most cases you should leave this at
   * the default value, which is suitable for English text.
   */
  protected String tlppClass =
          "edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams";

  public String getTlppClass() {
    return tlppClass;
  }

  @CreoleParameter(comment = "Class name of the TreebankLangParserParams "
      + "implementation used to extract the dependencies",
      defaultValue = 
          "edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams")
  public void setTlppClass(String tlppClass) {
    this.tlppClass = tlppClass;
  }

  /**
   * The name of the feature to add to tokens. The feature value is a
   * {@link List} of {@link DependencyRelation} objects giving the
   * dependencies from this token to other tokens.
   */
  protected String dependenciesFeature = "dependencies";



  /**
   * Parse the current document.  (This is the principal 
   * method called by a CorpusController.)
   */
  public void execute() throws ExecutionException {
    annotationSet  = convertASName(annotationSetName);
    
    if (debugMode) {
      System.out.println("Parsing document: " + document.getName());
    }

    if (useMapping && (! mappingLoaded) ) {
      System.err.println("Warning: no mapping loaded!");
    }
    
    
    if (addConstituentAnnotations || addDependencyFeatures || addDependencyAnnotations) {
      parseSentences();
    }
    else {
      System.err.println("There is nothing for the parser to do.");
      System.err.println("Please enable at least one of the \"add...\" options.");
    }
  }

  
  /**
   * Initialize the Parser resource.  In particular, load the trained data
   * file.
   */
  public Resource init() throws ResourceInstantiationException {
    instantiateStanfordParser();
    if (mappingFile != null) {
      loadTagMapping(mappingFile);
    }
    offsetComparator = new OffsetComparator();
    super.init();
    
    if(tlppClass == null || tlppClass.equals("")) {
      throw new ResourceInstantiationException(
              "TLPP class name must be specified");
    }
    try {
      Class<?> tlppClassObj =
              Class.forName(tlppClass, true, Gate.getClassLoader());
      if(!TreebankLangParserParams.class.isAssignableFrom(tlppClassObj)) {
        throw new ResourceInstantiationException(tlppClassObj
                + " does not implement "
                + TreebankLangParserParams.class.getName());
      }
      TreebankLangParserParams tlpp =
              TreebankLangParserParams.class.cast(tlppClassObj.newInstance());
      gsf = tlpp.treebankLanguagePack().grammaticalStructureFactory();
    }
    catch(ClassNotFoundException e) {
      throw new ResourceInstantiationException("Class " + tlppClass
              + " not found", e);
    }
    catch(InstantiationException e) {
      throw new ResourceInstantiationException("Error creating TLPP object", e);
    }
    catch(IllegalAccessException e) {
      throw new ResourceInstantiationException("Error creating TLPP object", e);
    }
    return this;
  }


  /**
   * Re-initialize the Parser resource.  In particular, reload the trained
   * data file.
   */
  @Override public void reInit() throws ResourceInstantiationException {
    init();
  }  



  /**
   * Find all the Sentence annotations and iterate through them, parsing one
   * sentence at a time and storing the result in the output AS. (Sentences are
   * scanned for Tokens. You have to run the ANNIE tokenizer and splitter before
   * this PR.)
   */
  @SuppressWarnings("unchecked")
  private void parseSentences() { 
    List<Annotation> sentences = new ArrayList<Annotation>(annotationSet.get(inputSentenceType));
    java.util.Collections.sort(sentences, offsetComparator);
    Iterator<Annotation> sentenceIter = sentences.iterator();

    Tree tree;
    int debugNbrS, debugS;
    debugS = 0;
    debugNbrS = sentences.size();

    while (sentenceIter.hasNext()) {
      debugS++;
      tree = parseOneSentence(sentenceIter.next(), debugS);
      
      // Here null is the result from an empty Sentence.
      if (tree != null) {
        if (addConstituentAnnotations || addPosTags) {
          annotatePhraseStructureRecursively(tree, tree);
        }

        if (addDependencyFeatures || addDependencyAnnotations) {
          annotateDependencies(tree);
        }

        if (debugMode) {
          System.out.println("Parsed sentence " + debugS + " of " + debugNbrS);
        }
      }
      
      else if (debugMode) {
        System.out.println("Ignored empty sentence " + debugS + " of " + debugNbrS);
      }
      
    }
  }


  /**
   * Generate the special data structure for one sentence and pass the List of
   * Word to the parser.
   * 
   * @param sentence
   *          the Sentence annotation
   * @param s
   *          sentence number of debugging output
   * @param ofS
   *          total number of sentences for debugging output
   * @return  null if the sentence is empty
   */
  private Tree parseOneSentence(Annotation sentence, int sentenceNo) {
    Tree result = null;
    
    stanfordSentence = new StanfordSentence(sentence, inputTokenType, annotationSet, reusePosTags);

    /* Ignore an empty Sentence (sometimes the regex splitter can create one
     * with no Token annotations in it).
     */
    if ( stanfordSentence.isNotEmpty() ) {
      List<Word> wordList = stanfordSentence.getWordList();

      if (reusePosTags) {
        int nbrMissingTags = stanfordSentence.numberOfMissingPosTags();
        if (nbrMissingTags > 0)  {
          double percentMissing = Math.ceil(100.0 * ((float) nbrMissingTags) /
                  ((float) stanfordSentence.numberOfTokens()) );
          System.err.println("Warning (sentence " + sentenceNo + "): " + (int) percentMissing 
                  + "% of the Tokens are missing POS tags." );
        }
      }

      stanfordParser.parse(wordList);
      result = stanfordParser.getBestParse();
    }
    
    return result;
  }


  /**
   * Generate a SyntaxTreeNode Annotation corresponding to this Tree.  Work 
   * recursively so that the annotations are actually generated from the 
   * bottom up, in order to build the consists list of annotation IDs.
   * 
   * @param tree  the current subtree
   * @param rootTree  the whole sentence, used to find the span of the current subtree
   * @return a GATE Annotation of type "SyntaxTreeNode"
   */
  protected Annotation annotatePhraseStructureRecursively(Tree tree, Tree rootTree) {
    Annotation annotation = null;
    Annotation child;
    String label   = tree.value();

    List<Tree> children = tree.getChildrenAsList();

    if (children.size() == 0) {
      return null;
    }
    /* implied else */

    /* following line generates ClassCastException
     * 		IntPair span = tree.getSpan();
     * edu.stanford.nlp.ling.CategoryWordTag
     * at edu.stanford.nlp.trees.Tree.getSpan(Tree.java:393)
     * but I think it's a bug in the parser, so I'm hacking 
     * around it as follows. */
    int startPos = Trees.leftEdge(tree, rootTree);
    int endPos   = Trees.rightEdge(tree, rootTree);
    
    Long startNode = stanfordSentence.startPos2offset(startPos);
    Long endNode   = stanfordSentence.endPos2offset(endPos);

    List<Integer> consists = new ArrayList<Integer>();

    Iterator<Tree> childIter = children.iterator();
    while (childIter.hasNext()) {
      child = annotatePhraseStructureRecursively(childIter.next(), rootTree);
      if  ( (child != null)  &&
        (! child.getType().equals(inputTokenType) )) {
        consists.add(child.getId());
      }
    }
    annotation = annotatePhraseStructureConstituent(startNode, endNode, label, consists, tree.depth());

    return annotation;
  }



  /**
   * Record one constituent as an annotation.
   * 
   * @param startOffset
   * @param endOffset
   * @param label
   * @param consists
   * @param depth
   * @return
   */
  private Annotation annotatePhraseStructureConstituent(Long startOffset, Long endOffset, String label, 
    List<Integer> consists, int depth) {
    Annotation phrAnnotation = null;
    Integer phrID;

    try {
      String cat;
      if (useMapping && mappingLoaded) {
        cat  = translateTag(label);
      }
      else {
        cat = label; 
      }
      
      if (addConstituentAnnotations) {
        String text = document.getContent().getContent(startOffset, endOffset).toString();
        FeatureMap fm = gate.Factory.newFeatureMap();
        fm.put(PSG_TAG_FEATURE, cat);
        fm.put("text", text);

        /* Ignore empty list features on the token-equivalent annotations. */
        if (consists.size() > 0) {
          fm.put("consists", consists);
        }

        phrID  = annotationSet.add(startOffset, endOffset, OUTPUT_PHRASE_TYPE, fm);
        phrAnnotation = annotationSet.get(phrID);
        recordID(annotationSet, phrID);
      }

      if ( addPosTags && (depth == 1) ) {
        /* Expected to be a singleton set! */
        AnnotationSet tokenSet = annotationSet.get(inputTokenType, startOffset, endOffset);
        if (tokenSet.size() == 1) {
          Annotation token = tokenSet.iterator().next();

          /* Add POS tag to token.  
           * (Note: GATE/Hepple uses "(" and ")" for Penn/Stanford's
           * "-LRB-" and "-RRB-". */
          String hepCat = StanfordSentence.unescapePosTag(cat);
          token.getFeatures().put(POS_TAG_FEATURE, hepCat);
          
        }
        else {
          System.err.println("Found a tokenSet with " + tokenSet.size() + " members!");
        }
      }
    }
    catch (InvalidOffsetException e) {
      e.printStackTrace();
    }
    
    return phrAnnotation;
  }

  
  
  @SuppressWarnings("unchecked")
  private void annotateDependencies(Tree tree) {
    GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
    Collection<TypedDependency> deps = gs.typedDependencies();
    String dependencyKind;
    FeatureMap depFeatures;
    Integer dependentTokenID, governorTokenID;
    List<Integer> argList;
    Long offsetLH0, offsetRH0, offsetLH1, offsetRH1, depLH, depRH;
    Annotation governorToken, dependentToken;

    for(TypedDependency d : deps) {
      if(debugMode) {
        System.out.println(d);
      }

      int governorIndex = ((HasIndex) d.gov().label()).index() - 1;
      governorToken  = stanfordSentence.startPos2token(governorIndex);
      
      int dependentIndex = ((HasIndex) d.dep().label()).index() - 1;
      dependentToken = stanfordSentence.startPos2token(dependentIndex);

      dependencyKind = d.reln().toString();
      governorTokenID = governorToken.getId();
      dependentTokenID = dependentToken.getId();

      if (addDependencyFeatures) {
        List<DependencyRelation> depsForTok =
          (List<DependencyRelation>) governorToken.getFeatures().get(dependenciesFeature);

        if(depsForTok == null) {
          depsForTok = new ArrayList<DependencyRelation>();
          governorToken.getFeatures().put(dependenciesFeature, depsForTok);
        }

        depsForTok.add(new DependencyRelation(dependencyKind, dependentTokenID));
      }
      
      
      if (addDependencyAnnotations) {
        depFeatures = gate.Factory.newFeatureMap();
        argList = new ArrayList<Integer>();
        argList.add(governorTokenID);
        argList.add(dependentTokenID);
        depFeatures.put(DEP_ARG_FEATURE, argList);
        depFeatures.put(DEP_LABEL_FEATURE, dependencyKind);
      
        offsetLH0 = governorToken.getStartNode().getOffset();
        offsetRH0 = governorToken.getEndNode().getOffset();
        offsetLH1 = dependentToken.getStartNode().getOffset();
        offsetRH1 = dependentToken.getEndNode().getOffset();
      
        depLH = Math.min(offsetLH0, offsetLH1);
        depRH = Math.max(offsetRH0, offsetRH1);
      
        try {
          annotationSet.add(depLH, depRH, DEP_ANNOTATION_TYPE, depFeatures);
        }
        catch(InvalidOffsetException e) {
          e.printStackTrace();
        }
      }
    }
  }

  

  private void instantiateStanfordParser()
    throws ResourceInstantiationException {
    try {
      String filepath = Files.fileFromURL(parserFile).getAbsolutePath();
      stanfordParser = new LexicalizedParser(filepath);
    }
    catch(Exception e) {
      throw new ResourceInstantiationException(e);
    }
  }	


  private void loadTagMapping(File mappingFile)  { 
    tagMap = new HashMap<String, String>();
    mappingLoaded = false;

    try {
      if (mappingFile.exists() && mappingFile.canRead()) {

        BufferedReader br = new BufferedReader(new FileReader(mappingFile));
        String line = "";

        // read until it reaches to an end of the file
        while((line = br.readLine()) != null) {
          // two columns delimited by whitespace 
          String [] data = line.split("\\s+", 2);

          // are there key and value available
          if(data == null || data.length < 2) {
            continue;
          } else {
            // and add it to the map
            tagMap.put(data[0].trim(), data[1].trim());
          }
        }

        br.close();
      }

      else {
        System.err.println("Can't find or read mapping file " 
          + mappingFile.getPath() + " so no mappings will be used.");
      }
    } 
    catch(Exception e) {
      System.err.println("Exception trying to load mapping file "
        + mappingFile.getPath());
      e.printStackTrace();
    }

    int nbrMapped = tagMap.size();
    System.out.println("Loaded " + nbrMapped + " mappings from file " + mappingFile);
    mappingLoaded = (nbrMapped > 0);
  }


  /**
   * This method stores the annotation ID as a value of feature "ID" on the
   * relevant annotation. (Mainly to make the ID visible in the GUI for
   * debugging.)
   * 
   * @param annSet
   * @param annotationID
   */
  private void recordID(AnnotationSet annSet, Integer annotationID) {
    annSet.get(annotationID).getFeatures().put("ID", annotationID);
  }


  /**
   * Translate the tag in the map, or leave it the same if there is no
   * translation.
   * 
   * @param stanfordTag
   * @return
   */
  private String translateTag(String stanfordTag) {
    String translatedTag = stanfordTag;

    if (tagMap.containsKey(stanfordTag)) {
      translatedTag = tagMap.get(stanfordTag);
    }
    
    return translatedTag;
  }


  protected AnnotationSet convertASName(String name) {
    if ((name == null) || name.equals("") ) {
      return document.getAnnotations();
    }

    /* implied else */
    return document.getAnnotations(name);
  }


  /* get & set methods for the CREOLE parameters */

  @Optional
  @RunTime
  @CreoleParameter(comment = "annotationSet used for input (Token and "
      + "Sentence annotations) and output")
  public void setAnnotationSetName(String annotationSetName) {
    this.annotationSetName = annotationSetName;
  }

  public String getAnnotationSetName() {
    return this.annotationSetName;
  }

  @Optional
  @CreoleParameter(comment = "path to the parser's grammar file",
      defaultValue = "resources/englishPCFG.ser.gz")
  public void setParserFile(URL parserFile) {
    this.parserFile = parserFile;
  }

  public URL getParserFile() {
    return this.parserFile;
  }

  @RunTime
  @CreoleParameter(comment = "The document to be processed")
  public void setDocument(gate.Document document) {
    this.document = document;
  }

  public gate.Document getDocument() {
    return this.document;
  }

  @RunTime
  @CreoleParameter(comment = "verbose mode for debugging",
      defaultValue = "false")
  public void setDebug(Boolean debug) {
    this.debugMode = debug.booleanValue();
  }

  public Boolean getDebug() {
    return new Boolean(this.debugMode);
  }
  
  @RunTime
  @CreoleParameter(comment = "Re-use existing POS tags on tokens",
      defaultValue = "false")
  public void setReusePosTags(Boolean reusePosTags) {
    this.reusePosTags = reusePosTags.booleanValue();
  }

  public Boolean getReusePosTags() {
    return new Boolean(this.reusePosTags);
  }
  
  @RunTime
  @CreoleParameter(comment = "Create POS tags on the Token annotations",
      defaultValue = "false")
  public void setAddPosTags(Boolean posTagTokens) {
    this.addPosTags = posTagTokens.booleanValue();
  }
  
  public Boolean getAddPosTags() {
    return new Boolean(this.addPosTags);
  }

  @RunTime
  @CreoleParameter(comment = "use tag mapping",
      defaultValue = "false")
  public void setUseMapping(Boolean useMapping) {
    this.useMapping = useMapping.booleanValue();
  }
  
  public Boolean getUseMapping() {
    return new Boolean(this.useMapping);
  }
  
  @RunTime
  @CreoleParameter(comment = "Create dependency features on Token annotations",
      defaultValue = "true")
  public void setAddDependencyFeatures(Boolean useDependency) {
    this.addDependencyFeatures = useDependency.booleanValue();
  }
  
  public Boolean getAddDependencyFeatures() {
    return new Boolean(this.addDependencyFeatures);
  }
  
  @RunTime
  @CreoleParameter(comment = "Create annotations to show dependencies",
      defaultValue = "true")
  public void setAddDependencyAnnotations(Boolean useDependency) {
    this.addDependencyAnnotations = useDependency.booleanValue();
  }
  
  public Boolean getAddDependencyAnnotations() {
    return new Boolean(this.addDependencyAnnotations);
  }
  
  @RunTime
  @CreoleParameter(comment = "Create annotations to show phrase structures",
      defaultValue = "true")
  public void setAddConstituentAnnotations(Boolean usePhraseStructure) {
    this.addConstituentAnnotations = usePhraseStructure.booleanValue();
  }
  
  public Boolean getAddConstituentAnnotations() {
    return new Boolean(this.addConstituentAnnotations);
  }
  
  /* Made mappingFile an init parameter to simplify things.
   * The CREOLE parameter is called "mappingFile" but it's actually a URL.
   */
  @Optional
  @CreoleParameter(comment = "path to the tag mapping file")
  public void setMappingFile(URL mappingFileURL) {
    this.mappingFile = null; // override below
    this.mappingFileURL = mappingFileURL;

    if ( (this.mappingFileURL != null) &&
      (! this.mappingFileURL.toString().trim().equals("")) ) {
      try {
        this.mappingFile = new File(this.mappingFileURL.toURI());
      }
      catch(URISyntaxException e) {
        e.printStackTrace();
      }
    }

  }

  public URL getMappingFile() {
    return this.mappingFileURL;
  }


}