GATE.ac.uk - gate/plugins/Lang_Russian/src/com/ontotext/russie/morph/MorphologyReader.java

package com.ontotext.russie.morph;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.HashSet;
import java.util.Set;

import com.ontotext.russie.RussIEConstants;
import com.ontotext.russie.WrongFormatException;

/**
 * This class reads in the morphology with the word forms and creates lists of
 * Lemmas that present wordforms and varios other morhological and syntactic
 * information for them. It needs the path to the morphology to be set before
 * load-time, otherwise the default is used form RussIEConstants.
 * <p>
 * Title: RussIE
 * </p>
 * <p>
 * Description: Russian Information Extraction based on GATE
 * </p>
 * <p>
 * Copyright: Copyright (c) 2003
 * </p>
 * <p>
 * Company: Ontotext Lab.
 * </p>
 * 
 * @author borislav popov
 * @version 1.0
 */
public class MorphologyReader implements RussIEConstants {

  protected String encoding;

  private boolean caseSensitive;

  private final static String LINE_PREFIX = "le(";

  private final static String NO_LINE_PREFIX = "There is no prefix [" +
    LINE_PREFIX + "] in the current morphology line.";

  /** the set of lemmas */
  private Set<Lemma> lemmas;

  public MorphologyReader(boolean caseSensitive) {
    lemmas = new HashSet<Lemma>();
    this.caseSensitive = caseSensitive;
  }

  /** Loads the morphology files */
  public void load(URL url) throws IOException {

    BufferedReader mReader =
      new BufferedReader(new InputStreamReader(url.openStream(), encoding));
    try {
      String line;
      Lemma lemma;
      while((line = mReader.readLine()) != null) {
        try {
          lemma = constructLemmaByMorphologyLine(line);
          lemmas.add(lemma);
        } catch(WrongFormatException wfe) {
          System.out.println("WrongFormatException");
          System.out.println(wfe.getMessage());
          System.out.println("line :\n" + line);
        }
      } // while lines
    } finally {
      mReader.close();
    }
  } // load()

  /**
   * Gets the set of lemmas built from the morphology file.
   * 
   * @return the set of lemmas built from the morphology file.
   */
  public Set<Lemma> getLemmas() {
    return lemmas;
  }

  /**
   * Constructs a Lemma given a line from the Morphology.
   * 
   * @param line
   *          a line from the Morphology
   * @return the constructed Lemma
   * @throws WrongFormatException
   */
  private Lemma constructLemmaByMorphologyLine(String line)
    throws WrongFormatException {
    // example line : it is one line but has been idented for convinience.
    // le(
    // \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0,
    // 'Nmisn',
    // [
    // \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0\u00E0:'Nmisg',
    // \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0\u00F3:'Nmisd',
    // \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0:'Nmisa',
    // \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0\u00EE\u00EC:'Nmisi',
    // \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0\u00E5:'Nmisl',
    // \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0\u00FB:'Nmipn',
    // \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0\u00EE\u00E2:'Nmipg',
    // \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0\u00E0\u00EC:'Nmipd',
    // \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0\u00FB:'Nmipa',
    // \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0\u00E0\u00EC\u00E8:'Nmipi',
    // \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0\u00E0\u00F5:'Nmipl'
    // ]).
    // check el prefix
    if(!line.substring(0, 3).equals(LINE_PREFIX)) { throw new WrongFormatException(
      NO_LINE_PREFIX); }

    // find the commas after the main wf and its type
    int commaIndex = line.indexOf(',', 3);
    int nextCommaIndex = line.indexOf(',', commaIndex + 1);

    // check main wf commas
    if(commaIndex < 0 || nextCommaIndex < 0) { throw new WrongFormatException(); }

    Lemma lemma = new LemmaImpl();

    String type = line.substring(commaIndex + 1, nextCommaIndex);
    type = removeQuote(type);
    lemma.setMainForm(removeQuote(line.substring(3, commaIndex)), type);

    if(!line.substring(nextCommaIndex + 1, nextCommaIndex + 2).equals("[")) { throw new WrongFormatException(); }

    // indicates whether there are more alternative word-forms to read
    boolean moreWf = true;

    if(line.substring(nextCommaIndex + 2, nextCommaIndex + 3).equals("]")) {
      // empty list of wfs
      moreWf = false;
    }

    int startOfWfCouple = nextCommaIndex + 2;
    String wf;
    while(moreWf) {
      commaIndex = line.indexOf(":", nextCommaIndex + 1);
      nextCommaIndex = line.indexOf(",", commaIndex + 1);
      if(commaIndex < 0) throw new WrongFormatException();
      if(nextCommaIndex < 0) {
        moreWf = false;
        nextCommaIndex = line.indexOf("]", commaIndex);
        if(nextCommaIndex < 0) throw new WrongFormatException();
      }

      type = line.substring(commaIndex + 1, nextCommaIndex);
      type = removeQuote(type);
      wf = removeQuote(line.substring(startOfWfCouple, commaIndex));
      if(!caseSensitive) {
        wf = wf.toLowerCase();
      }
      lemma.addWordForm(wf, type);

      startOfWfCouple = nextCommaIndex + 1;

    } // while there are WF

    lemma.synchWithSuffixPool();
    lemma.getSuffixNest().setMainFormSuffix(
      lemma.getMainForm().substring(lemma.getRoot().length()));
    return lemma;
  }// constructLemmaByMorphologyLine(line)

  public void setEncoding(String newEncoding) {
    encoding = newEncoding;
  }

  public String getEncoding() {
    return encoding;
  }

  /**
   * Removes the single quotes embracing a phrase. Presumes that if there is a
   * quote at the start - there is also a quote at the end.
   * 
   * @param phrase
   * @return the phrase without quotes
   */
  private String removeQuote(String phrase) {
    if(phrase == null || phrase.length() == 0) return phrase;
    if(phrase.substring(0, 1).equals("'")) {
      phrase = phrase.substring(1, phrase.length() - 1);
    }
    return phrase;
  }
} // class MorphologyReader