package com.ontotext.russie.morph;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.HashSet;
import java.util.Set;
import com.ontotext.russie.RussIEConstants;
import com.ontotext.russie.WrongFormatException;
/**
* This class reads in the morphology with the word forms and creates lists of
* Lemmas that present wordforms and varios other morhological and syntactic
* information for them. It needs the path to the morphology to be set before
* load-time, otherwise the default is used form RussIEConstants.
* <p>
* Title: RussIE
* </p>
* <p>
* Description: Russian Information Extraction based on GATE
* </p>
* <p>
* Copyright: Copyright (c) 2003
* </p>
* <p>
* Company: Ontotext Lab.
* </p>
*
* @author borislav popov
* @version 1.0
*/
public class MorphologyReader implements RussIEConstants {
protected String encoding;
private boolean caseSensitive;
private final static String LINE_PREFIX = "le(";
private final static String NO_LINE_PREFIX = "There is no prefix [" +
LINE_PREFIX + "] in the current morphology line.";
/** the set of lemmas */
private Set<Lemma> lemmas;
public MorphologyReader(boolean caseSensitive) {
lemmas = new HashSet<Lemma>();
this.caseSensitive = caseSensitive;
}
/** Loads the morphology files */
public void load(URL url) throws IOException {
BufferedReader mReader =
new BufferedReader(new InputStreamReader(url.openStream(), encoding));
try {
String line;
Lemma lemma;
while((line = mReader.readLine()) != null) {
try {
lemma = constructLemmaByMorphologyLine(line);
lemmas.add(lemma);
} catch(WrongFormatException wfe) {
System.out.println("WrongFormatException");
System.out.println(wfe.getMessage());
System.out.println("line :\n" + line);
}
} // while lines
} finally {
mReader.close();
}
} // load()
/**
* Gets the set of lemmas built from the morphology file.
*
* @return the set of lemmas built from the morphology file.
*/
public Set<Lemma> getLemmas() {
return lemmas;
}
/**
* Constructs a Lemma given a line from the Morphology.
*
* @param line
* a line from the Morphology
* @return the constructed Lemma
* @throws WrongFormatException
*/
private Lemma constructLemmaByMorphologyLine(String line)
throws WrongFormatException {
// example line : it is one line but has been idented for convinience.
// le(
// \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0,
// 'Nmisn',
// [
// \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0\u00E0:'Nmisg',
// \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0\u00F3:'Nmisd',
// \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0:'Nmisa',
// \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0\u00EE\u00EC:'Nmisi',
// \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0\u00E5:'Nmisl',
// \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0\u00FB:'Nmipn',
// \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0\u00EE\u00E2:'Nmipg',
// \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0\u00E0\u00EC:'Nmipd',
// \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0\u00FB:'Nmipa',
// \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0\u00E0\u00EC\u00E8:'Nmipi',
// \u00E0\u00E1\u00E0\u00E6\u00F3\u00F0\u00E0\u00F5:'Nmipl'
// ]).
// check el prefix
if(!line.substring(0, 3).equals(LINE_PREFIX)) { throw new WrongFormatException(
NO_LINE_PREFIX); }
// find the commas after the main wf and its type
int commaIndex = line.indexOf(',', 3);
int nextCommaIndex = line.indexOf(',', commaIndex + 1);
// check main wf commas
if(commaIndex < 0 || nextCommaIndex < 0) { throw new WrongFormatException(); }
Lemma lemma = new LemmaImpl();
String type = line.substring(commaIndex + 1, nextCommaIndex);
type = removeQuote(type);
lemma.setMainForm(removeQuote(line.substring(3, commaIndex)), type);
if(!line.substring(nextCommaIndex + 1, nextCommaIndex + 2).equals("[")) { throw new WrongFormatException(); }
// indicates whether there are more alternative word-forms to read
boolean moreWf = true;
if(line.substring(nextCommaIndex + 2, nextCommaIndex + 3).equals("]")) {
// empty list of wfs
moreWf = false;
}
int startOfWfCouple = nextCommaIndex + 2;
String wf;
while(moreWf) {
commaIndex = line.indexOf(":", nextCommaIndex + 1);
nextCommaIndex = line.indexOf(",", commaIndex + 1);
if(commaIndex < 0) throw new WrongFormatException();
if(nextCommaIndex < 0) {
moreWf = false;
nextCommaIndex = line.indexOf("]", commaIndex);
if(nextCommaIndex < 0) throw new WrongFormatException();
}
type = line.substring(commaIndex + 1, nextCommaIndex);
type = removeQuote(type);
wf = removeQuote(line.substring(startOfWfCouple, commaIndex));
if(!caseSensitive) {
wf = wf.toLowerCase();
}
lemma.addWordForm(wf, type);
startOfWfCouple = nextCommaIndex + 1;
} // while there are WF
lemma.synchWithSuffixPool();
lemma.getSuffixNest().setMainFormSuffix(
lemma.getMainForm().substring(lemma.getRoot().length()));
return lemma;
}// constructLemmaByMorphologyLine(line)
public void setEncoding(String newEncoding) {
encoding = newEncoding;
}
public String getEncoding() {
return encoding;
}
/**
* Removes the single quotes embracing a phrase. Presumes that if there is a
* quote at the start - there is also a quote at the end.
*
* @param phrase
* @return the phrase without quotes
*/
private String removeQuote(String phrase) {
if(phrase == null || phrase.length() == 0) return phrase;
if(phrase.substring(0, 1).equals("'")) {
phrase = phrase.substring(1, phrase.length() - 1);
}
return phrase;
}
} // class MorphologyReader