Log in Help
Print
Home / sale / talks / gate-course-jun14 / module-5-developers / 3-advanced-embedded / hands-on / yam-format / src / gatecourse / module8 〉 YamDocumentFormat.java
 
package gatecourse.module8;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import gate.AnnotationSet;
import gate.Document;
import gate.Factory;
import gate.GateConstants;
import gate.Resource;
import gate.corpora.MimeType;
import gate.corpora.TextualDocumentFormat;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.AutoInstance;
import gate.creole.metadata.CreoleResource;
import gate.util.DocumentFormatException;
import gate.util.InvalidOffsetException;

/**
 * Document format that annotates a few elements of YAM wiki syntax.
 *
 * <p>When a document is unpacked, the following annotations are added to the
 * annotation set named by
 * {@link GateConstants#ORIGINAL_MARKUPS_ANNOT_SET_NAME}:
 * <ul>
 *   <li>{@code strong} for text between unescaped {@code *} markers,</li>
 *   <li>{@code em} for text between unescaped {@code _} markers,</li>
 *   <li>{@code tt} for text between unescaped {@code ^} markers,</li>
 *   <li>{@code h<N>} for the text of a heading line of the form
 *       {@code %<N> text}.</li>
 * </ul>
 * Each annotation spans only the text inside the markers; the markers
 * themselves are left in the document content.
 */
@CreoleResource(name = "YAM Document Format",
        comment = "Simple document format annotating some elements of YAM syntax",
        autoinstances = {@AutoInstance})
public class YamDocumentFormat extends TextualDocumentFormat {

  /** Primary part of the MIME type this format registers itself under. */
  private static final String MIME_PRIMARY_TYPE = "text";

  /** Sub-type part of the MIME type this format registers itself under. */
  private static final String MIME_SUB_TYPE = "x-yam";

  /** File suffix (without the dot) that is mapped to the YAM MIME type. */
  private static final String FILE_SUFFIX = "yam";

  /**
   * Matches {@code *text*} where the opening {@code *} is not preceded by a
   * backslash. Group 1 is the text between the markers; {@code DOTALL} lets
   * it span line breaks.
   */
  protected static final Pattern BOLD_PATTERN = Pattern.compile("(?<!\\\\)\\*(.+?)\\*", Pattern.DOTALL);

  /**
   * Matches {@code _text_} where the opening {@code _} is not preceded by a
   * backslash. Group 1 is the text between the markers; {@code DOTALL} lets
   * it span line breaks.
   */
  protected static final Pattern ITALIC_PATTERN = Pattern.compile("(?<!\\\\)_(.+?)_", Pattern.DOTALL);

  /**
   * Matches {@code ^text^} where the opening {@code ^} is not preceded by a
   * backslash. Group 1 is the text between the markers; {@code DOTALL} lets
   * it span line breaks.
   */
  protected static final Pattern TT_PATTERN = Pattern.compile("(?<!\\\\)\\^(.+?)\\^", Pattern.DOTALL);

  /**
   * Matches a heading line of the form {@code %<digits> text}. Group 1 is
   * the digits (the heading level), group 2 is the heading text.
   * {@code MULTILINE} anchors the pattern to individual lines.
   */
  protected static final Pattern HEADING_PATTERN = Pattern.compile("^%(\\d+)\\s+(.*)$", Pattern.MULTILINE);

  /**
   * To keep things simple, this format does not support repositioning.
   */
  @Override
  public Boolean supportsRepositioning() {
    return false;
  }

  /**
   * Registers this instance as the handler for the {@code text/x-yam} MIME
   * type and for files with the {@code yam} suffix, following the same
   * registration steps GATE's built-in document formats use.
   *
   * <p>Without this registration the format would be instantiated (via
   * {@code @AutoInstance}) but never selected for any document.
   *
   * @return this resource
   * @throws ResourceInstantiationException declared by the overridden
   *         method; not thrown by this implementation
   */
  @Override
  public Resource init() throws ResourceInstantiationException {
    MimeType mime = new MimeType(MIME_PRIMARY_TYPE, MIME_SUB_TYPE);
    String mimeString = mime.getType() + "/" + mime.getSubtype();

    // MIME string -> handler instance, so GATE can find this format.
    mimeString2ClassHandlerMap.put(mimeString, this);
    // MIME string -> MimeType object.
    mimeString2mimeTypeMap.put(mimeString, mime);
    // File suffix -> MimeType, so "*.yam" files are recognised.
    suffixes2mimeTypeMap.put(FILE_SUFFIX, mime);
    // Record the MIME type on this language resource.
    setMimeType(mime);
    return this;
  }

  /**
   * Scans the document text for YAM markup and adds the corresponding
   * annotations to the set named by
   * {@link GateConstants#ORIGINAL_MARKUPS_ANNOT_SET_NAME}.
   *
   * @param doc the document whose content is scanned and annotated
   * @throws DocumentFormatException if adding an annotation fails with an
   *         {@link InvalidOffsetException}; the original exception is kept
   *         as the cause
   */
  @Override
  public void unpackMarkup(Document doc) throws DocumentFormatException {
    try {
      AnnotationSet om = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
      String docContent = doc.getContent().toString();

      annotateInline(om, BOLD_PATTERN, docContent, "strong");
      annotateInline(om, ITALIC_PATTERN, docContent, "em");
      annotateInline(om, TT_PATTERN, docContent, "tt");

      // Headings: the annotation type carries the level (h1, h2, ...) and
      // the span is the heading text only, not the "%N" prefix.
      Matcher m = HEADING_PATTERN.matcher(docContent);
      while(m.find()) {
        om.add((long)m.start(2), (long)m.end(2), "h" + m.group(1), Factory.newFeatureMap());
      }
    }
    catch(InvalidOffsetException e) {
      throw new DocumentFormatException(e);
    }
  }

  /**
   * Adds one annotation of the given type over capture group 1 of every
   * match of {@code pattern} in {@code content}.
   */
  private static void annotateInline(AnnotationSet target, Pattern pattern,
          String content, String type) throws InvalidOffsetException {
    Matcher m = pattern.matcher(content);
    while(m.find()) {
      target.add((long)m.start(1), (long)m.end(1), type, Factory.newFeatureMap());
    }
  }

}