package com.ontotext.russie.gazetteer; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.io.UnsupportedEncodingException; import java.net.URL; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.Locator; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.XMLReaderFactory; import com.ontotext.russie.RussIEConstants; import com.ontotext.russie.morph.Lemma; import com.ontotext.russie.morph.LemmaImpl; public class InflectionalGazetteerXMLReader implements RussIEConstants, ContentHandler { private List<Lemma> lemmas; private String wordform; protected List<String> listImportTypes; private StringBuffer tagContent; static final String DEFAULT_PARSER = "org.apache.xerces.parsers.SAXParser"; public static final String TAG_RUSNAMES = "rusnames"; public static final String TAG_NAME = "name"; public static final String TAG_CAT = "cat"; public static final String TAG_CAT_END = "/cat"; public static final String TAG_FORM = "form"; public static final String TAG_TAG = "tag"; public static final String TAG_TAG_END = "/tag"; public static final String TAG_PH = "ph"; public static final String TAG_PH_END = "/ph"; public static final String ATTR_N = "n"; public static final String CAT_LOCATION = "Loc"; public static final String CAT_PERSON_FAMILY = "PerFamily"; public static final String CAT_PERSON_FIRST = "PerFirst"; public static final String CAT_PERSON = "Per"; private static Map<String, String> catVsMajorType; private String parserValue; Locator locator; Lemma lemma; String category; int occurance; String name; String type; static { catVsMajorType = new HashMap<String, String>(); catVsMajorType.put("Loc", "location"); catVsMajorType.put("Per", "person_full"); catVsMajorType.put("PerFamily", "surname"); catVsMajorType.put("PerFirst", "person_first"); } public InflectionalGazetteerXMLReader(List<String> importTypes) { lemmas = new ArrayList<Lemma>(); // listImportTypes = new ArrayList<String>(); tagContent = new StringBuffer(); parserValue = "org.apache.xerces.parsers.SAXParser"; locator = null; listImportTypes = importTypes; } public static String getMajorType4Category(String cat) { return catVsMajorType.get(cat); } public void load(String fileName) { File file = new File(fileName); load(file); } public void load(File file) { FileReader reader = null; try { reader = new FileReader(file); parse(reader); reader.close(); } catch(IOException e) { e.printStackTrace(); } catch(SAXException e) { e.printStackTrace(); } } public void setParser(String parserClass) { parserValue = parserClass; } public void parse(Reader r) throws IOException, SAXException { InputSource isrc = new InputSource(r); XMLReader reader = XMLReaderFactory.createXMLReader(parserValue); reader.setContentHandler(this); reader.parse(isrc); } public void parse(URL u) throws IOException, SAXException { InputSource isrc = new InputSource(u.toExternalForm()); InputStream stream = u.openStream(); try { isrc.setByteStream(stream); XMLReader reader = XMLReaderFactory.createXMLReader(parserValue); reader.setContentHandler(this); reader.parse(isrc); } finally { stream.close(); } } public void setDocumentLocator(Locator locator) { this.locator = locator; } public void startDocument() throws SAXException { } public void endDocument() throws SAXException { } public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { tagContent = new StringBuffer(); if(localName.equals("rusnames")) return; if(localName.equals("name")) { lemma = new LemmaImpl(); lemmas.add(lemma); return; } if(localName.equals("cat")) return; if(localName.equals("form")) { lemma.getFeatureMap().put("occurance", new Integer(atts.getValue(0))); return; } else { return; } } public void endElement(String namespaceURI, String localName, String qName) throws SAXException { if(localName.equals("cat")) try { String mtype = catVsMajorType.get(tagContent.toString().trim()); if(listImportTypes.contains(mtype)) lemma.getFeatureMap().put("majorType", mtype); } catch(Exception x) { System.out.println("Unknown Category :" + tagContent); } if(localName.equals("ph")) try { wordform = new String(tagContent.toString().getBytes(), "UTF-8"); wordform = wordform.trim(); } catch(UnsupportedEncodingException x) { } if(localName.equals("tag")) { type = tagContent.toString().trim(); lemma.addWordForm(wordform, type); if(type.charAt(0) == 'N' && type.charAt(type.length() - 1) == 'n' || type.charAt(0) == 'V' && type.charAt(type.length() - 1) == 'i') lemma.setMainForm(wordform, type); } tagContent = new StringBuffer(); } public void characters(char ch[], int start, int length) throws SAXException { tagContent.append(ch, start, length); } public void startPrefixMapping(String s, String s1) throws SAXException { } public void endPrefixMapping(String s) throws SAXException { } public void ignorableWhitespace(char ac[], int i, int j) throws SAXException { } public void processingInstruction(String s, String s1) throws SAXException { } public void skippedEntity(String s) throws SAXException { } public List<Lemma> getLemmas() { return new ArrayList<Lemma>(lemmas); } }