Log in Help
Print
HomegatepluginsTagger_PennBiosrceduupenncistaggersvariation 〉 VariationRegexTrieLexiconMembership.java
 
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
 This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
 http://www.cs.umass.edu/~mccallum/mallet
 This software is provided under the terms of the Common Public License,
 version 1.0, as published by http://www.opensource.org.  For further
 information, see the file `LICENSE' included with this distribution. */

/**
 * Tests membership of the token text in the provided list of regular expressions. The lexicon words are provided in a file, one space-separated phrase per
 * line.
 * 
 * @author Ryan McDonald <a href="mailto:ryantm@cis.upenn.edu">ryantm@cis.upenn.edu </a>
 */

package edu.upenn.cis.taggers.variation;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Reader;
import java.io.Serializable;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

import edu.umass.cs.mallet.base.pipe.Pipe;
import edu.umass.cs.mallet.base.types.Instance;
import edu.umass.cs.mallet.base.types.Token;
import edu.umass.cs.mallet.base.types.TokenSequence;
import edu.upenn.cis.taggers.Constants;

public class VariationRegexTrieLexiconMembership extends Pipe implements Serializable {
    // Perhaps give it your own tokenizer?
    String name; // perhaps make this an array of names
    boolean ignoreCase;
    java.util.Vector lexicon;
    boolean indvMatch = true;

    public VariationRegexTrieLexiconMembership(String name, Reader lexiconReader, boolean ignoreCase) {
        this.name = name;
        this.lexicon = new java.util.Vector();
        LineNumberReader reader = new LineNumberReader(lexiconReader);
        String line;
        while (true) {
            try {
                line = reader.readLine();
            } catch (IOException e) {
                throw new IllegalStateException();
            }
            if (line == null) {
                break;
            } else {
                StringTokenizer st = new StringTokenizer(line, " ");
                int numToks = st.countTokens();
                if (numToks > 0) {
                    lexicon.add(new Pattern[numToks]);
                    Pattern[] pats = (Pattern[]) lexicon.elementAt(lexicon.size() - 1);
                    for (int i = 0; i < numToks; i++) {
                        pats[i] = Pattern.compile(ignoreCase ? st.nextToken().toLowerCase() : st.nextToken());
                    }
                }
            }
        }
        if (lexicon.size() == 0)
            throw new IllegalArgumentException("Empty lexicon");
    }

    public VariationRegexTrieLexiconMembership(String name, File lexiconFile, boolean ignoreCase) throws FileNotFoundException {
        this(name, new BufferedReader(new FileReader(lexiconFile)), ignoreCase);
    }

    public VariationRegexTrieLexiconMembership(File lexiconFile, boolean ignoreCase) throws FileNotFoundException {
        this(lexiconFile.getName(), lexiconFile, ignoreCase);
    }

    public VariationRegexTrieLexiconMembership(File lexiconFile) throws FileNotFoundException {
        this(lexiconFile.getName(), lexiconFile, true);
    }

    public Instance pipe(Instance carrier) {
        TokenSequence ts = (TokenSequence) carrier.getData();
        for (int i = 0; i < ts.size(); i++) {

            for (int j = 0; j < lexicon.size(); j++) {
                Pattern[] pats = (Pattern[]) lexicon.elementAt(j);
                boolean matched = true;

                for (int k = 0; k < pats.length && i + k < ts.size(); k++) {
                    Token tok = ts.getToken(i + k);
                    String t = tok.getText().intern();
                    if (!(pats[k].matcher(ignoreCase ? t.toLowerCase() : t)).matches()) {
                        matched = false;
                        break;
                    }
                }

                if (matched) {
                    for (int k = 0; k < pats.length && i + k < ts.size(); k++)
                        ts.getToken(i + k).setFeatureValue(name + (indvMatch ? "" + j : ""), 1.0);
                    i = i + pats.length - 1;
                    break;
                }

            }

        }
        return carrier;
    }

    // Serialization

    private static final long serialVersionUID = Constants.SVUID_VARIATION_REGEX_TRIELEXICON_MEMBERSHIP;
    private static final int CURRENT_SERIAL_VERSION = 0;

    private void writeObject(ObjectOutputStream out) throws IOException {
        out.writeInt(CURRENT_SERIAL_VERSION);
        out.writeObject(name);
        out.writeObject(lexicon);
        out.writeBoolean(ignoreCase);
        out.writeBoolean(indvMatch);
    }

    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
        int version = in.readInt();
        this.name = (String) in.readObject();
        this.lexicon = (java.util.Vector) in.readObject();
        this.ignoreCase = in.readBoolean();
        this.indvMatch = in.readBoolean();
    }
}