Log in Help
Print
Home 〉 gate 〉 plugins 〉 Lang_Russian 〉 src 〉 com 〉 ontotext 〉 russie 〉 morph 〉 POSMapper.java
 
// Decompiled by Jad v1.5.8e. Copyright 2001 Pavel Kouznetsov.
// Jad home page: http://www.geocities.com/kpdus/jad.html
// Decompiler options: packimports(3)
// Source File Name: POSMapper.java

package com.ontotext.russie.morph;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Resource;
import gate.annotation.AnnotationSetImpl;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.OffsetComparator;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.StringTokenizer;

import com.ontotext.russie.RussIEConstants;

/**
 * Processing resource that translates fine-grained morphological tags (the
 * {@code type} feature of {@code MSD} annotations) into coarser part-of-speech
 * labels and stores them in the {@code category} feature of the {@code Token}
 * annotation covering the same span.
 *
 * <p>
 * The tag-to-label table is built once in {@link #init()}; {@link #execute()}
 * applies it to the current document. When several MSD annotations cover the
 * same span, all of their mapped labels are written to the token as a single
 * space-separated string.
 */
@CreoleResource(name = "POS Mapper",
    comment = "Map complex Russian morphology tags into simpler POS categories",
    helpURL = "http://gate.ac.uk/userguide/sec:misc-creole:language-plugins:russian")
public class POSMapper extends AbstractLanguageAnalyser implements
  RussIEConstants {

  private static final long serialVersionUID = 748287388332117288L;

  /** Enables the diagnostic output produced by {@link #showMessage(String)}. */
  protected static final boolean DEBUG = false;

  /** Not read anywhere in this class. */
  protected static final boolean DETAILED_DEBUG = false;

  /**
   * Name of the annotation set holding the MSD and Token annotations;
   * {@code null} or empty selects the document's default annotation set.
   */
  protected String inputASName;

  /** MSD tag (key) to POS label (value); populated by {@link #initMap()}. */
  protected Map<String, String> categoriesMap;

  /**
   * Builds the tag mapping table and reports progress to registered listeners.
   *
   * @return this resource
   * @throws ResourceInstantiationException declared by the resource contract;
   *           not thrown by the code in this method
   */
  public Resource init() throws ResourceInstantiationException {
    fireStatusChanged("Init POS Mapper structures ...");
    initMap();
    fireProcessFinished();
    return this;
  }

  /**
   * (Re)creates {@link #categoriesMap} and fills it with every known MSD tag.
   * The current table size is reported through status events at the same
   * points as it always has been, so listeners see an unchanged sequence.
   */
  private void initMap() {
    categoriesMap = new HashMap<String, String>();

    String adjTypes =
      "Au,Aupfpaa,Aupfpai,Aupfpd,Aupfpg,Aupfpi,Aupfpl,Aupfpn,Aupfsfa,Aupfsfd,Aupfsfg,Aupfsfi,Aupfsfl,Aupfsfn,Aupfsmaa,Aupfsmai,Aupfsmd,Aupfsmg,Aupfsmi,Aupfsml,Aupfsmn,Aupfsna,Aupfsnd,Aupfsng,Aupfsni,Aupfsnl,Aupfsnn,Aupsp,Aupssf,Aupssm,Aupssn";
    mapAll(adjTypes, "JJ");
    reportMapSize();

    categoriesMap.put("Auc", "JJR");
    categoriesMap.put("C", "CC");
    categoriesMap.put("D", "RB");
    categoriesMap.put("T", "RP");
    categoriesMap.put("P", "PP");
    categoriesMap.put("R", "IN");
    reportMapSize();

    String nnTypes =
      "Nfa,Nfi,Nma,Nmi,Nmi2g,Nmi2l,Nmi2lg,Nmi2lgs2g,Nmi2lgs2l,Nmi2ls2g,Nna,Nni,Nfasa,Nfasd,Nfasg,Nfasi,Nfasl,Nfasn,Nfisa,Nfisd,Nfisg,Nfisi,Nfisl,Nfisn,Nmasa,Nmasd,Nmasg,Nmasi,Nmasl,Nmasn,Nmi2gsa,Nmi2gsd,Nmi2gsg,Nmi2gsi,Nmi2gsl,Nmi2gsn,Nmi2lgsa,Nmi2lgsd,Nmi2lgsg,Nmi2lgsi,Nmi2lgsl,Nmi2lgsn,Nmi2lsa,Nmi2lsd,Nmi2lsg,Nmi2lsi,Nmi2lsl,Nmi2lsn,Nmisa,Nmisd,Nmisg,Nmisi,Nmisl,Nmisn,Nnasa,Nnasd,Nnasg,Nnasi,Nnasl,Nnasn,Nnisa,Nnisd,Nnisg,Nnisi,Nnisl,Nnisn";
    mapAll(nnTypes, "NN");
    reportMapSize();

    String nnsTypes =
      "Nfapa,Nfapd,Nfapg,Nfapi,Nfapl,Nfapn,Nfipa,Nfipd,Nfipg,Nfipi,Nfipl,Nfipn,Nmapa,Nmapd,Nmapg,Nmapi,Nmapl,Nmapn,Nmi2gpa,Nmi2gpd,Nmi2gpg,Nmi2gpi,Nmi2gpl,Nmi2gpn,Nmi2lgpa,Nmi2lgpd,Nmi2lgpg,Nmi2lgpi,Nmi2lgpl,Nmi2lgpn,Nmi2lpa,Nmi2lpd,Nmi2lpg,Nmi2lpi,Nmi2lpl,Nmi2lpn,Nmipa,Nmipd,Nmipg,Nmipi,Nmipl,Nmipn,Nnapa,Nnapd,Nnapg,Nnapi,Nnapl,Nnapn,Nnipa,Nnipd,Nnipg,Nnipi,Nnipl,Nnipn,Np,Nppa,Nppd,Nppg,Nppi,Nppl,Nppn";
    mapAll(nnsTypes, "NNS");

    String nnpTypes =
      "Npfa,Npfi,Npma,Npmi,Npmi2g,Npmi2l,Npmi2lg,Npmi2lgs2g,Npmi2lgs2l,Npmi2ls2g,Npna,Npni,Npfasa,Npfasd,Npfasg,Npfasi,Npfasl,Npfasn,Npfisa,Npfisd,Npfisg,Npfisi,Npfisl,Npfisn,Npmasa,Npmasd,Npmasg,Npmasi,Npmasl,Npmasn,Npmi2gsa,Npmi2gsd,Npmi2gsg,Npmi2gsi,Npmi2gsl,Npmi2gsn,Npmi2lgsa,Npmi2lgsd,Npmi2lgsg,Npmi2lgsi,Npmi2lgsl,Npmi2lgsn,Npmi2lsa,Npmi2lsd,Npmi2lsg,Npmi2lsi,Npmi2lsl,Npmi2lsn,Npmisa,Npmisd,Npmisg,Npmisi,Npmisl,Npmisn,Npnasa,Npnasd,Npnasg,Npnasi,Npnasl,Npnasn,Npnisa,Npnisd,Npnisg,Npnisi,Npnisl,Npnisn";
    mapAll(nnpTypes, "NNP");

    String nnpsTypes =
      "Npfapa,Npfapd,Npfapg,Npfapi,Npfapl,Npfapn,Npfipa,Npfipd,Npfipg,Npfipi,Npfipl,Npfipn,Npmapa,Npmapd,Npmapg,Npmapi,Npmapl,Npmapn,Npmi2gpa,Npmi2gpd,Npmi2gpg,Npmi2gpi,Npmi2gpl,Npmi2gpn,Npmi2lgpa,Npmi2lgpd,Npmi2lgpg,Npmi2lgpi,Npmi2lgpl,Npmi2lgpn,Npmi2lpa,Npmi2lpd,Npmi2lpg,Npmi2lpi,Npmi2lpl,Npmi2lpn,Npmipa,Npmipd,Npmipg,Npmipi,Npmipl,Npmipn,Npnapa,Npnapd,Npnapg,Npnapi,Npnapl,Npnapn,Npnipa,Npnipd,Npnipg,Npnipi,Npnipl,Npnipn,Npp,Npppa,Npppd,Npppg,Npppi,Npppl,Npppn";
    mapAll(nnpsTypes, "NNPS");

    reportMapSize();
    showMessage("Categories in map count: " + categoriesMap.size());
  }

  /**
   * Registers every tag of a comma-separated list under one POS label.
   *
   * @param msdTags comma-separated MSD tags
   * @param category POS label stored for each of the tags
   */
  private void mapAll(String msdTags, String category) {
    StringTokenizer tags = new StringTokenizer(msdTags, ",");
    while(tags.hasMoreTokens()) {
      categoriesMap.put(tags.nextToken(), category);
    }
  }

  /** Publishes the current size of the mapping table as a status event. */
  private void reportMapSize() {
    fireStatusChanged("Categories in map count: " + categoriesMap.size());
  }

  /**
   * Maps the MSD annotations of the current document onto its tokens.
   *
   * @throws ExecutionException if no document has been set
   */
  public void execute() throws ExecutionException {
    if(document == null) {
      throw new ExecutionException("No document to process!");
    }

    boolean useDefaultSet = inputASName == null || inputASName.equals("");
    AnnotationSet annotationSet =
      useDefaultSet ? document.getAnnotations() : document
        .getAnnotations(inputASName);
    if(annotationSet == null) {
      showMessage("No annotation set!");
      return;
    }

    fireStatusChanged("Mapping of morphology information...");
    mapCategories(annotationSet);
    fireProcessFinished();
    fireStatusChanged("POS Mapper processing finished!");
  }

  /**
   * Writes the {@code category} feature of each token that has MSD
   * annotations on exactly the same span.
   *
   * <p>
   * MSD annotations are sorted by offset and consumed in runs that share one
   * span. The POS labels of a run are de-duplicated and joined with single
   * spaces; the order of the labels follows the iteration order of a
   * {@link HashSet} and is therefore unspecified. Tokens are left untouched
   * when none of the run's tags is present in {@link #categoriesMap}.
   *
   * @param annSet annotation set containing both MSD and Token annotations
   */
  private void mapCategories(AnnotationSet annSet) {
    AnnotationSet msdSet = annSet.get("MSD");
    if(msdSet == null) {
      showMessage("No annotations from type MSD");
      return;
    }
    // The concrete implementation class is needed for getStrict(...).
    AnnotationSetImpl tokenSet = (AnnotationSetImpl)annSet.get("Token");
    if(tokenSet == null) {
      showMessage("No annotations from type Token");
      return;
    }

    Annotation[] msdAnns = msdSet.toArray(new Annotation[msdSet.size()]);
    Arrays.sort(msdAnns, new OffsetComparator());

    HashSet<String> mappedTypes = new HashSet<String>();
    int index = 0;
    while(index < msdAnns.length) {
      Annotation msdAnn = msdAnns[index];
      mappedTypes.clear();

      // Gather the labels of every MSD annotation on the same span as msdAnn.
      // Spans are compared explicitly instead of through
      // Annotation.compareTo(): NOTE(review) GATE's annotation ordering
      // appears to be id-based rather than offset-based, which would make
      // distinct annotations never compare equal - confirm against the GATE
      // version in use.
      do {
        // Looked up as Object so that a non-String feature value is simply
        // treated as unmapped.
        Object msdType = msdAnns[index].getFeatures().get("type");
        if(msdType != null) {
          String mapped = categoriesMap.get(msdType);
          if(mapped != null) mappedTypes.add(mapped);
        }
        index++;
      } while(index < msdAnns.length && sameSpan(msdAnn, msdAnns[index]));

      StringBuilder joined = new StringBuilder();
      for(String mapped : mappedTypes) {
        if(joined.length() > 0) joined.append(' ');
        joined.append(mapped);
      }
      String category = joined.toString();

      AnnotationSet singleToken =
        tokenSet.getStrict(msdAnn.getStartNode().getOffset(), msdAnn
          .getEndNode().getOffset());
      if(singleToken.size() == 0) {
        showMessage("No token annotation for MSD annotation: " + msdAnn);
      } else if(category.length() == 0) {
        showMessage("No recognized category for MSD annotation: " + msdAnn);
      } else {
        Annotation tokAnn = singleToken.iterator().next();
        tokAnn.getFeatures().put("category", category);
      }
    }
  }

  /**
   * Tells whether two annotations start and end at the same offsets.
   *
   * @param first one annotation
   * @param second the annotation to compare it with
   * @return {@code true} if both start offsets and both end offsets are equal
   */
  private static boolean sameSpan(Annotation first, Annotation second) {
    return first.getStartNode().getOffset()
      .equals(second.getStartNode().getOffset())
      && first.getEndNode().getOffset()
        .equals(second.getEndNode().getOffset());
  }

  /**
   * Sets the name of the annotation set to read from and write to.
   *
   * @param newInputASName annotation set name; {@code null} or empty selects
   *          the default annotation set
   */
  @Optional
  @RunTime
  @CreoleParameter(comment = "The name of the annotation set containing "
    + "MSD and Token annotations")
  public void setInputASName(String newInputASName) {
    inputASName = newInputASName;
  }

  /**
   * @return the configured annotation set name, possibly {@code null}
   */
  public String getInputASName() {
    return inputASName;
  }

  /**
   * Diagnostic hook. Prints the message to standard output only when
   * {@link #DEBUG} is {@code true}; with the default setting it does nothing.
   *
   * @param s message to print
   */
  private void showMessage(String s) {
    if(DEBUG) System.out.println(s);
  }
}