Log in Help
Print
HomegatepluginsLang_Russiansrccomontotextrussiegazetteer 〉 InflectionalGazetteerXMLReader.java
 
package com.ontotext.russie.gazetteer;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;

import com.ontotext.russie.RussIEConstants;
import com.ontotext.russie.morph.Lemma;
import com.ontotext.russie.morph.LemmaImpl;

public class InflectionalGazetteerXMLReader implements RussIEConstants,
  ContentHandler {

  private List<Lemma> lemmas;

  private String wordform;

  protected List<String> listImportTypes;

  private StringBuffer tagContent;

  static final String DEFAULT_PARSER = "org.apache.xerces.parsers.SAXParser";

  public static final String TAG_RUSNAMES = "rusnames";

  public static final String TAG_NAME = "name";

  public static final String TAG_CAT = "cat";

  public static final String TAG_CAT_END = "/cat";

  public static final String TAG_FORM = "form";

  public static final String TAG_TAG = "tag";

  public static final String TAG_TAG_END = "/tag";

  public static final String TAG_PH = "ph";

  public static final String TAG_PH_END = "/ph";

  public static final String ATTR_N = "n";

  public static final String CAT_LOCATION = "Loc";

  public static final String CAT_PERSON_FAMILY = "PerFamily";

  public static final String CAT_PERSON_FIRST = "PerFirst";

  public static final String CAT_PERSON = "Per";

  private static Map<String, String> catVsMajorType;

  private String parserValue;

  Locator locator;

  Lemma lemma;

  String category;

  int occurance;

  String name;

  String type;

  static {
    catVsMajorType = new HashMap<String, String>();
    catVsMajorType.put("Loc", "location");
    catVsMajorType.put("Per", "person_full");
    catVsMajorType.put("PerFamily", "surname");
    catVsMajorType.put("PerFirst", "person_first");
  }

  public InflectionalGazetteerXMLReader(List<String> importTypes) {
    lemmas = new ArrayList<Lemma>();
    // listImportTypes = new ArrayList<String>();
    tagContent = new StringBuffer();
    parserValue = "org.apache.xerces.parsers.SAXParser";
    locator = null;
    listImportTypes = importTypes;
  }

  public static String getMajorType4Category(String cat) {
    return catVsMajorType.get(cat);
  }

  public void load(String fileName) {
    File file = new File(fileName);
    load(file);
  }

  public void load(File file) {
    FileReader reader = null;
    try {
      reader = new FileReader(file);
      parse(reader);
      reader.close();
    } catch(IOException e) {
      e.printStackTrace();
    } catch(SAXException e) {
      e.printStackTrace();
    }
  }

  public void setParser(String parserClass) {
    parserValue = parserClass;
  }

  public void parse(Reader r) throws IOException, SAXException {
    InputSource isrc = new InputSource(r);
    XMLReader reader = XMLReaderFactory.createXMLReader(parserValue);
    reader.setContentHandler(this);
    reader.parse(isrc);
  }

  public void parse(URL u) throws IOException, SAXException {
    InputSource isrc = new InputSource(u.toExternalForm());
    InputStream stream = u.openStream();
    try {
      isrc.setByteStream(stream);
      XMLReader reader = XMLReaderFactory.createXMLReader(parserValue);
      reader.setContentHandler(this);
      reader.parse(isrc);
    } finally {
      stream.close();
    }
  }

  public void setDocumentLocator(Locator locator) {
    this.locator = locator;
  }

  public void startDocument() throws SAXException {
  }

  public void endDocument() throws SAXException {
  }

  public void startElement(String namespaceURI, String localName, String qName,
                           Attributes atts) throws SAXException {
    tagContent = new StringBuffer();
    if(localName.equals("rusnames")) return;
    if(localName.equals("name")) {
      lemma = new LemmaImpl();
      lemmas.add(lemma);
      return;
    }
    if(localName.equals("cat")) return;
    if(localName.equals("form")) {
      lemma.getFeatureMap().put("occurance", new Integer(atts.getValue(0)));
      return;
    } else {
      return;
    }
  }

  public void endElement(String namespaceURI, String localName, String qName)
    throws SAXException {
    if(localName.equals("cat"))
      try {
        String mtype = catVsMajorType.get(tagContent.toString().trim());
        if(listImportTypes.contains(mtype))
          lemma.getFeatureMap().put("majorType", mtype);
      } catch(Exception x) {
        System.out.println("Unknown Category :" + tagContent);
      }
    if(localName.equals("ph")) try {
      wordform = new String(tagContent.toString().getBytes(), "UTF-8");
      wordform = wordform.trim();
    } catch(UnsupportedEncodingException x) {
    }
    if(localName.equals("tag")) {
      type = tagContent.toString().trim();
      lemma.addWordForm(wordform, type);
      if(type.charAt(0) == 'N' && type.charAt(type.length() - 1) == 'n' ||
        type.charAt(0) == 'V' && type.charAt(type.length() - 1) == 'i')
        lemma.setMainForm(wordform, type);
    }
    tagContent = new StringBuffer();
  }

  public void characters(char ch[], int start, int length) throws SAXException {
    tagContent.append(ch, start, length);
  }

  public void startPrefixMapping(String s, String s1) throws SAXException {
  }

  public void endPrefixMapping(String s) throws SAXException {
  }

  public void ignorableWhitespace(char ac[], int i, int j) throws SAXException {
  }

  public void processingInstruction(String s, String s1) throws SAXException {
  }

  public void skippedEntity(String s) throws SAXException {
  }

  public List<Lemma> getLemmas() {
    return new ArrayList<Lemma>(lemmas);
  }

}