// Log in Help
// Print
// Home / gate / plugins / Language_Identification / src / at / knallgrau / textcat 〉 FingerPrint.java
 
package at.knallgrau.textcat;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A FingerPrint maps so-called n-grams to their number of occurrences in the
 * corresponding text. It is able to categorize itself by comparing its
 * FingerPrint with the FingerPrints of a collection of categories. See
 * sdair-94-bc.pdf in the doc directory of the jar-file for more information.
 *
 * @author Thomas Hammerl
 */
public class FingerPrint {

    /**
     * Captures the text between a hyphen and the trailing {@code .lm} of a
     * file name (e.g. {@code utf8} in {@code english-utf8.lm}); that text is
     * tried as the name of the file's character encoding.
     */
    private static final Pattern FILE_PATTERN = Pattern.compile("-(.*)\\.lm$");

    /**
     * Filter applied to every candidate n-gram: an optional leading and
     * trailing {@code _} around characters that are neither digits nor one of
     * {@code ? ! - _ /}.
     */
    private static final Pattern NGRAM_PATTERN = Pattern
            .compile("^_?[^0-9\\?!\\-_/]*_?$");

    /** Encoding used when none is given or none can be derived. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    /** Shortest and longest n-gram length collected by {@link #create(String)}. */
    private static final int MIN_ORDER = 1;

    private static final int MAX_ORDER = 5;

    /** Only this many leading entries take part in a distance computation. */
    private static final int MAX_COMPARED_NGRAMS = 400;

    /** N-gram to number of occurrences. */
    private Map<String, Integer> ngrams = new HashMap<String, Integer>();

    private String category = "unknown";

    /** Category name to the distance computed by {@link #categorize(Collection)}. */
    private Map<String, Integer> categoryDistances = new HashMap<String, Integer>();

    /**
     * The entries of {@link #ngrams} in the order imposed by
     * {@code NGramEntryComparator} (according to the documentation of
     * {@link #getPosition(String)}: descending number of occurrences).
     */
    private SortedSet<Entry<String, Integer>> entries = new TreeSet<Entry<String, Integer>>(
            new NGramEntryComparator());

    /**
     * N-gram to its position within {@link #entries}. Kept in sync with
     * {@code entries} so that {@link #getPosition(String)} is a map lookup
     * instead of a scan over the whole set.
     */
    private Map<String, Integer> positions = new HashMap<String, Integer>();

    /** Creates an empty FingerPrint with the category "unknown". */
    public FingerPrint() {
    }

    /**
     * Creates a FingerPrint by reading the FingerPrint-file referenced by the
     * passed path. If the path denotes a directory nothing is read and the
     * FingerPrint stays empty.
     *
     * @param file
     *                path to the FingerPrint-file
     * @throws FingerPrintFileException
     *                 if the file cannot be opened or read
     */
    public FingerPrint(String file) throws FingerPrintFileException {
        this.loadFingerPrintFromFile(file);
    }

    /**
     * Creates a FingerPrint by reading it from the passed InputStream, decoded
     * as UTF-8. The stream is not closed.
     *
     * @param is
     *                InputStream for reading the FingerPrint
     * @throws FingerPrintFileException
     *                 if reading fails
     */
    public FingerPrint(InputStream is) throws FingerPrintFileException {
        this.loadFingerPrintFromInputStream(is);
    }

    /**
     * Creates a FingerPrint by reading it from the passed InputStream using
     * the given character encoding. The stream is not closed.
     *
     * @param is
     *                InputStream for reading the FingerPrint
     * @param encoding
     *                name of the character encoding of the stream
     * @throws FingerPrintFileException
     *                 if the encoding is not supported or reading fails
     */
    public FingerPrint(InputStream is, String encoding)
            throws FingerPrintFileException {
        this.loadFingerPrintFromInputStream(is, encoding);
    }

    /**
     * Creates a FingerPrint by analysing the content of the given file. The
     * file is decoded with the platform default charset (that is what
     * {@link FileReader} uses). If reading fails the stack trace is printed
     * and the FingerPrint is left unchanged.
     *
     * @param file
     *                file to be analysed
     * @throws FileNotFoundException
     *                 thrown when given file does not exist
     */
    public void create(File file) throws FileNotFoundException {
        FileReader reader = new FileReader(file);
        StringBuilder content = new StringBuilder();
        try {
            try {
                char[] buffer = new char[1024];
                int read;
                while ((read = reader.read(buffer)) != -1) {
                    content.append(buffer, 0, read);
                }
            } finally {
                // Release the file handle even when reading fails.
                reader.close();
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
            return;
        }
        this.create(content.toString());
    }

    /**
     * Fills the FingerPrint with all the n-grams (of length 1 to 5) and their
     * number of occurrences in the passed text. Any previous content is
     * discarded.
     *
     * @param text
     *                text to be analysed
     */
    public void create(String text) {
        this.ngrams.clear();
        this.computeNGrams(MIN_ORDER, MAX_ORDER, text);

        // Every token is wrapped as "_token_", so the 1-gram "_" is counted
        // twice per token; halve it.
        Integer blanksScore = this.ngrams.get("_");
        if (blanksScore != null) {
            this.ngrams.put("_", blanksScore / 2);
        }

        this.rebuildEntries();
    }

    /**
     * Adds all n-grams with the passed orders occurring in the given text to
     * the FingerPrint. For example:
     *
     * text = "text" startOrder = 2, maxOrder = 2
     *
     * so the n-grams added to the FingerPrint are:
     *
     * "_t", "te", "ex", "xt", "t_"
     *
     * all with a score (occurrence) of 1. N-grams rejected by
     * {@link #NGRAM_PATTERN} are skipped.
     *
     * @param startOrder
     *                shortest n-gram length to collect
     * @param maxOrder
     *                longest n-gram length to collect
     * @param text
     *                text to be analysed; split on single whitespace
     *                characters
     */
    private void computeNGrams(int startOrder, int maxOrder, String text) {
        String[] tokens = text.split("\\s");
        // Mark the word boundaries once instead of once per order.
        for (int t = 0; t < tokens.length; t++) {
            tokens[t] = "_" + tokens[t] + "_";
        }

        for (int order = startOrder; order <= maxOrder; ++order) {
            for (String token : tokens) {
                int lastStart = token.length() - order;
                for (int i = 0; i <= lastStart; i++) {
                    String ngram = token.substring(i, i + order);
                    if (!NGRAM_PATTERN.matcher(ngram).find()) {
                        continue;
                    }
                    Integer score = this.ngrams.get(ngram);
                    this.ngrams.put(ngram, score == null ? 1 : score + 1);
                }
            }
        }
    }

    /**
     * Categorizes the FingerPrint by computing the distance to the
     * FingerPrints in the passed Collection. The category of the FingerPrint
     * with the lowest distance is assigned to this FingerPrint; on equal
     * distances the one iterated first wins. If the collection is empty the
     * category is left unchanged.
     *
     * @param categories
     *                the FingerPrints to compare with
     * @return the map returned by {@link #getCategoryDistances()}, updated
     *         with the distance to every passed category
     */
    public Map<String, Integer> categorize(Collection<FingerPrint> categories) {
        int minDistance = Integer.MAX_VALUE;
        for (FingerPrint fp : categories) {
            int distance = this.computeDistanceTo(fp);
            this.getCategoryDistances().put(fp.getCategory(), distance);
            if (distance < minDistance) {
                minDistance = distance;
                this.category = fp.getCategory();
            }
        }
        return this.getCategoryDistances();
    }

    /**
     * @return the live map of category name to distance; it is filled by
     *         {@link #categorize(Collection)} and never cleared by this class
     */
    public Map<String, Integer> getCategoryDistances() {
        return this.categoryDistances;
    }

    /**
     * Computes and returns the distance of this FingerPrint to the
     * FingerPrint passed to the method. Only the first 400 entries of this
     * FingerPrint are considered. An n-gram missing from the other
     * FingerPrint costs the other FingerPrint's number of n-grams, any other
     * n-gram costs the absolute difference of its positions in both
     * FingerPrints.
     *
     * @param category
     *                the FingerPrint to be compared to this one
     * @return the distance of the passed FingerPrint to this FingerPrint
     */
    private int computeDistanceTo(FingerPrint category) {
        int distance = 0;
        int count = 0;
        for (Entry<String, Integer> entry : this.entries) {
            if (++count > MAX_COMPARED_NGRAMS) {
                break;
            }
            String ngram = entry.getKey();
            if (!category.containsNgram(ngram)) {
                distance += category.numNgrams();
            } else {
                distance += Math.abs(this.getPosition(ngram)
                        - category.getPosition(ngram));
            }
        }
        return distance;
    }

    /**
     * @param ngram
     *                the n-gram to look up
     * @return true if the n-gram is part of this FingerPrint
     */
    public boolean containsNgram(String ngram) {
        return this.ngrams.containsKey(ngram);
    }

    /**
     * @return the number of distinct n-grams in this FingerPrint
     */
    public int numNgrams() {
        return this.ngrams.size();
    }

    /**
     * Reads a FingerPrint from the passed InputStream, decoded as UTF-8.
     *
     * @param is
     *                InputStream to be read
     * @throws FingerPrintFileException
     *                 if reading fails
     */
    private void loadFingerPrintFromInputStream(InputStream is)
            throws FingerPrintFileException {
        this.loadFingerPrintFromInputStream(is, DEFAULT_ENCODING);
    }

    /**
     * Reads a FingerPrint from the passed InputStream. Every non-empty line
     * is split on whitespace; the first field is the n-gram, the second its
     * number of occurrences. Lines with fewer than two fields are ignored.
     * The stream belongs to the caller and is not closed here.
     *
     * @param is
     *                InputStream to be read
     * @param encoding
     *                name of the character encoding of the stream
     * @throws FingerPrintFileException
     *                 if the encoding is not supported or reading fails
     * @throws NumberFormatException
     *                 if an occurrence field is not an integer
     */
    private void loadFingerPrintFromInputStream(InputStream is, String encoding)
            throws FingerPrintFileException {
        this.entries.clear();
        this.positions.clear();
        MyProperties properties = new MyProperties();
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(
                    is, encoding));
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.equals("")) {
                    continue;
                }
                String[] property = line.split("\\s+");
                if (property.length >= 2) {
                    properties.put(property[0], property[1]);
                }
            }
            for (Entry<String, String> entry : properties.entrySet()) {
                this.ngrams.put(entry.getKey(), Integer.parseInt(entry
                        .getValue()));
            }
            this.rebuildEntries();
        } catch (UnsupportedEncodingException e) {
            throw new FingerPrintFileException(e);
        } catch (IOException e) {
            throw new FingerPrintFileException(e);
        }
    }

    /**
     * Reads a FingerPrint from the file referenced by the passed path. The
     * character encoding is taken from the file name when it has the form
     * {@code <name>-<encoding>.lm} and {@code <encoding>} names a charset
     * supported by this JVM; otherwise UTF-8 is used. Directories are
     * silently skipped.
     *
     * @param file
     *                FingerPrint file to be read
     * @throws FingerPrintFileException
     *                 if the file cannot be opened or read
     */
    private void loadFingerPrintFromFile(String file)
            throws FingerPrintFileException {
        File fpFile = new File(file);
        if (fpFile.isDirectory()) {
            return;
        }

        String encoding = encodingFromFileName(fpFile.getName());

        FileInputStream fis;
        try {
            fis = new FileInputStream(fpFile);
        } catch (FileNotFoundException e) {
            throw new FingerPrintFileException(e);
        }
        try {
            if (encoding != null) {
                this.loadFingerPrintFromInputStream(fis, encoding);
            } else {
                this.loadFingerPrintFromInputStream(fis);
            }
        } finally {
            try {
                fis.close();
            } catch (IOException ignored) {
                // The stream was only read from; a failing close() cannot
                // lose data and must not mask an exception from loading.
            }
        }
    }

    /**
     * Derives the character encoding from a FingerPrint file name.
     *
     * @param fileName
     *                file name without directories
     * @return the encoding named in the file name, or null if the name
     *         carries no suffix or the suffix is not a supported charset
     */
    private static String encodingFromFileName(String fileName) {
        // find(), not matches(): the pattern describes only the tail of the
        // name, so matches() would succeed solely for names starting with
        // '-'.
        Matcher matcher = FILE_PATTERN.matcher(fileName);
        if (!matcher.find()) {
            return null;
        }
        String candidate = matcher.group(1);
        try {
            return Charset.isSupported(candidate) ? candidate : null;
        } catch (IllegalArgumentException e) {
            // Syntactically illegal charset name (e.g. empty): the suffix is
            // part of the category name, not an encoding.
            return null;
        }
    }

    /**
     * Re-populates {@link #entries} from {@link #ngrams} and recomputes the
     * position of every entry.
     */
    private void rebuildEntries() {
        this.entries.clear();
        this.positions.clear();
        this.entries.addAll(this.ngrams.entrySet());

        int position = 1;
        int currentCount = 0;
        boolean first = true;
        for (Entry<String, Integer> entry : this.entries) {
            int count = entry.getValue();
            if (first) {
                currentCount = count;
                first = false;
            } else if (count != currentCount) {
                // A new occurrence count starts the next position.
                currentCount = count;
                position++;
            }
            if (!this.positions.containsKey(entry.getKey())) {
                this.positions.put(entry.getKey(), position);
            }
        }
    }

    /**
     * Gets the position of the n-gram passed to the method in the
     * FingerPrint. The n-grams are in descending order according to the
     * number of occurrences in the text which was used creating the
     * FingerPrint. Positions start at 1 and advance whenever the number of
     * occurrences changes from one entry to the next, so neighbouring
     * n-grams with the same number of occurrences share a position.
     *
     * @param key
     *                the n-gram
     * @return the position of the n-gram in the FingerPrint, or -1 if the
     *         FingerPrint does not contain it (including when it is empty)
     */
    public int getPosition(String key) {
        Integer position = this.positions.get(key);
        return position == null ? -1 : position.intValue();
    }

    /**
     * Saves the fingerprint, UTF-8 encoded, to a file named
     * {@code <categoryname>-utf8.lm} in the execution path. An existing file
     * is never overwritten; in that case nothing is written. I/O errors are
     * reported by printing the stack trace.
     */
    public void save() {
        File file = new File(this.getCategory() + "-utf8.lm");
        try {
            if (!file.createNewFile()) {
                return;
            }
            FileOutputStream fos = new FileOutputStream(file);
            try {
                fos.write(this.toString().getBytes("utf8"));
            } finally {
                // Close the handle even when writing fails.
                fos.close();
            }
        } catch (FileNotFoundException fnfe) {
            fnfe.printStackTrace();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }

    /**
     * Returns the category of the FingerPrint or "unknown" if the
     * FingerPrint wasn't categorized yet.
     *
     * @return the category of the FingerPrint
     */
    public String getCategory() {
        return this.category;
    }

    /**
     * Returns the FingerPrint as a String in the FingerPrint file-format: one
     * line per n-gram holding the n-gram, a tab and its number of
     * occurrences.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        for (Entry<String, Integer> entry : this.entries) {
            sb.append(entry.getKey()).append("\t").append(entry.getValue())
                    .append("\n");
        }
        return sb.toString();
    }

    /**
     * Sets the category of the FingerPrint.
     *
     * @param category
     *                the category
     */
    protected void setCategory(String category) {
        this.category = category;
    }

}