// Log in | Help
// Print
// Home > gate > plugins > Language_Identification > src > at > knallgrau > textcat > TextCategorizer.java
 
package at.knallgrau.textcat;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Map;
import java.util.Map.Entry;


/**
 * Categorizes texts by comparing the FingerPrint of a text with a collection
 * of category FingerPrints.
 *
 * <p>The category FingerPrints are either handed in directly or loaded from a
 * configuration file that maps FingerPrint file paths to category names.
 *
 * @author Thomas Hammerl
 */
public class TextCategorizer {

	/** Texts shorter than this many characters are not analysed at all. */
	private static final int UNKNOWN_LIMIT = 20;

	/** Category name returned when a text is too short (or missing) to analyse. */
	private static final String UNKNOWN_CATEGORY = "unknown";

	/**
	 * Location of the configuration file, or {@code null} when the
	 * FingerPrints were supplied directly through the collection constructor.
	 */
	private final URL confURL;

	/** The category FingerPrints every text is compared against. */
	private final Collection<FingerPrint> fingerprints;

	/**
	 * Creates a new TextCategorizer that uses the given category FingerPrints.
	 * The collection is stored as-is (it is not copied).
	 *
	 * @param fingerprints
	 *            the category FingerPrints to compare texts against
	 */
	public TextCategorizer(Collection<FingerPrint> fingerprints) {
		this.confURL = null;
		this.fingerprints = fingerprints;
	}

	/**
	 * Creates a new TextCategorizer with the given configuration file. The
	 * configuration file maps paths to FingerPrint files to categories which
	 * are used to categorize the texts passed to the TextCategorizer.
	 *
	 * @param confURL
	 *            the URL to the configuration file
	 * @throws NullPointerException
	 *             if {@code confURL} is {@code null}
	 */
	public TextCategorizer(URL confURL) {
		// Fail here with a clear message instead of deep inside loadCategories().
		if (confURL == null) {
			throw new NullPointerException("confURL must not be null");
		}
		this.confURL = confURL;
		this.fingerprints = new ArrayList<FingerPrint>();
		loadCategories();
	}

	/**
	 * Clears the categories collection and fills it with the FingerPrints
	 * listed in the configuration file. Entries whose FingerPrint cannot be
	 * read are reported on the error stream and skipped; if the configuration
	 * file itself cannot be read the collection is left empty.
	 *
	 * Must only be called when {@code confURL} is set.
	 */
	private void loadCategories() {
		this.fingerprints.clear();
		MyProperties properties = new MyProperties();

		InputStream confStream = null;
		try {
			confStream = confURL.openStream();
			properties.load(confStream);
		} catch (IOException e) {
			e.printStackTrace();
			return;
		} finally {
			// The stream was previously never closed.
			closeQuietly(confStream);
		}

		for (Entry<String, String> entry : properties.entrySet()) {
			InputStream fpStream = null;
			try {
				// Keys are resolved relative to the configuration file's URL.
				URL fpURL = new URL(confURL, entry.getKey());
				fpStream = fpURL.openStream();
				FingerPrint fp = new FingerPrint(fpStream);

				fp.setCategory(entry.getValue());
				this.fingerprints.add(fp);
			} catch (MalformedURLException mue) {
				mue.printStackTrace();
			} catch (IOException ioe) {
				ioe.printStackTrace();
			} catch (FingerPrintFileException e) {
				e.printStackTrace();
			} finally {
				closeQuietly(fpStream);
			}
		}
	}

	/**
	 * Closes the given stream, ignoring a {@code null} argument and any
	 * failure raised by {@code close()} itself.
	 *
	 * @param stream
	 *            the stream to close, may be {@code null}
	 */
	private static void closeQuietly(InputStream stream) {
		if (stream == null) {
			return;
		}
		try {
			stream.close();
		} catch (IOException ignored) {
			// Nothing useful can be done if closing a read-only stream fails.
		}
	}

	/**
	 * Categorizes the text passed to it.
	 *
	 * @param text
	 *            text to be categorized
	 * @return the category of the text as reported by its FingerPrint, or
	 *         {@code "unknown"} if the text is {@code null} or shorter than
	 *         {@value #UNKNOWN_LIMIT} characters
	 */
	public String categorize(String text) {
		if (text == null || text.length() < UNKNOWN_LIMIT) {
			return UNKNOWN_CATEGORY;
		}
		FingerPrint fp = new FingerPrint();
		fp.create(text);
		fp.categorize(fingerprints);

		return fp.getCategory();
	}

	/**
	 * Categorizes only a certain amount of characters in the text. Recommended
	 * when categorizing large texts in order to increase performance.
	 *
	 * @param text
	 *            text to be analysed
	 * @param limit
	 *            number of leading characters to be analysed; values at or
	 *            beyond the text length analyse the whole text
	 * @return the category of the analysed prefix, or {@code "unknown"} if the
	 *         text is {@code null} or either the text or the limit is shorter
	 *         than {@value #UNKNOWN_LIMIT} characters
	 */
	public String categorize(String text, int limit) {
		if (text == null || text.length() < UNKNOWN_LIMIT) {
			return UNKNOWN_CATEGORY;
		}
		// A prefix shorter than UNKNOWN_LIMIT can only ever yield "unknown";
		// answering here also keeps a negative limit away from substring().
		if (limit < UNKNOWN_LIMIT) {
			return UNKNOWN_CATEGORY;
		}
		if (limit >= text.length()) {
			return this.categorize(text);
		}
		return this.categorize(text.substring(0, limit));
	}

	/**
	 * Categorizes a text but returns a map containing all categories and their
	 * distances to the text. If no FingerPrints are loaded and a configuration
	 * file is known, the categories are (re)loaded first.
	 *
	 * @param text
	 *            text to be categorized
	 * @return map with categories as keys and distances as values
	 */
	public Map<String, Integer> getCategoryDistances(String text) {
		// Reloading is only possible when a configuration file is known;
		// without this guard a categorizer built from an empty collection
		// would fail with a NullPointerException.
		if (confURL != null && this.fingerprints.isEmpty()) {
			loadCategories();
		}
		FingerPrint fp = new FingerPrint();
		fp.create(text);
		fp.categorize(fingerprints);
		return fp.getCategoryDistances();
	}
}