package at.knallgrau.textcat;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A FingerPrint maps so-called n-grams to their number of occurrences in the
 * corresponding text. It is able to categorize itself by comparing its
 * FingerPrint with the FingerPrints of a collection of categories. See
 * sdair-94-bc.pdf in the doc directory of the jar-file for more information.
 *
 * @author Thomas Hammerl
 */
public class FingerPrint {

	/** Encoding used when none is given or none can be derived from a file name. */
	private static final String DEFAULT_ENCODING = "UTF-8";

	/** Upper bound on how many of this FingerPrint's entries take part in a distance computation. */
	private static final int MAX_NGRAMS_COMPARED = 400;

	/** Captures the encoding part of a FingerPrint file name such as <code>english-utf8.lm</code>. */
	private static final Pattern FILE_PATTERN = Pattern.compile("-(.*)\\.lm$");

	/** Accepts only n-grams free of digits and of the characters ? ! - _ / (apart from the padding underscores). */
	private static final Pattern NGRAM_PATTERN = Pattern
			.compile("^_?[^0-9\\?!\\-_/]*_?$");

	/** N-gram to number of occurrences. */
	private Map<String, Integer> ngrams = new HashMap<String, Integer>();

	private String category = "unknown";

	/** Category name to distance, filled by {@link #categorize(Collection)}. */
	private Map<String, Integer> categoryDistances = new HashMap<String, Integer>();

	/**
	 * The entries of {@link #ngrams}, ordered by a NGramEntryComparator.
	 * {@link #getPosition(String)} and {@link #toString()} rely on this order.
	 */
	private SortedSet<Entry<String, Integer>> entries = new TreeSet<Entry<String, Integer>>(
			new NGramEntryComparator());

	/**
	 * Creates an empty FingerPrint; fill it with {@link #create(String)} or
	 * {@link #create(File)}.
	 */
	public FingerPrint() {
	}

	/**
	 * Creates a FingerPrint by reading the FingerPrint file referenced by the
	 * passed path.
	 *
	 * @param file
	 *            path to the FingerPrint file
	 * @throws FingerPrintFileException
	 *             if the file cannot be opened or read
	 */
	public FingerPrint(String file) throws FingerPrintFileException {
		this.loadFingerPrintFromFile(file);
	}

	/**
	 * Creates a FingerPrint by reading it from the passed InputStream, decoded
	 * as UTF-8. The stream is not closed.
	 *
	 * @param is
	 *            InputStream for reading the FingerPrint
	 * @throws FingerPrintFileException
	 *             if reading fails
	 */
	public FingerPrint(InputStream is) throws FingerPrintFileException {
		this.loadFingerPrintFromInputStream(is);
	}

	/**
	 * Creates a FingerPrint by reading it from the passed InputStream using
	 * the given encoding. The stream is not closed.
	 *
	 * @param is
	 *            InputStream for reading the FingerPrint
	 * @param encoding
	 *            name of the charset used to decode the stream
	 * @throws FingerPrintFileException
	 *             if the encoding is unsupported or reading fails
	 */
	public FingerPrint(InputStream is, String encoding)
			throws FingerPrintFileException {
		this.loadFingerPrintFromInputStream(is, encoding);
	}

	/**
	 * Creates a FingerPrint by analysing the content of the given file. The
	 * file is read with a {@link FileReader}, i.e. with its default charset.
	 * If reading fails with an IOException the stack trace is printed and the
	 * FingerPrint is left unchanged.
	 *
	 * @param file
	 *            file to be analysed
	 * @throws FileNotFoundException
	 *             thrown when the given file does not exist
	 */
	public void create(File file) throws FileNotFoundException {
		char[] buffer = new char[1024];
		StringBuilder text = new StringBuilder();
		FileReader reader = new FileReader(file);
		try {
			int read;
			while ((read = reader.read(buffer)) != -1) {
				text.append(buffer, 0, read);
			}
		} catch (IOException ioe) {
			ioe.printStackTrace();
			return;
		} finally {
			// always release the file handle, also when reading failed
			closeQuietly(reader);
		}
		this.create(text.toString());
	}

	/**
	 * Fills the FingerPrint with all n-grams of length 1 to 5 and their number
	 * of occurrences in the passed text. Any previous content is discarded.
	 *
	 * @param text
	 *            text to be analysed
	 */
	public void create(String text) {
		this.ngrams.clear();
		this.computeNGrams(1, 5, text);
		// every token is padded with "_" on both sides, so the count of the
		// single "_" is halved
		Integer blanksScore = this.ngrams.get("_");
		if (blanksScore != null) {
			this.ngrams.put("_", blanksScore / 2);
		}
		this.entries.clear();
		this.entries.addAll(this.ngrams.entrySet());
	}

	/**
	 * Adds all n-grams of the passed orders occurring in the given text to the
	 * FingerPrint. The text is split at whitespace and every token is padded
	 * with an underscore on both sides. For example:
	 *
	 * text = "text", startOrder = 2, maxOrder = 2
	 *
	 * adds the n-grams
	 *
	 * "_t", "te", "ex", "xt", "t_"
	 *
	 * all with a score (number of occurrences) of 1.
	 *
	 * @param startOrder
	 *            smallest n-gram length to collect
	 * @param maxOrder
	 *            largest n-gram length to collect
	 * @param text
	 *            text to be analysed
	 */
	private void computeNGrams(int startOrder, int maxOrder, String text) {
		String[] tokens = text.split("\\s");
		for (int order = startOrder; order <= maxOrder; ++order) {
			for (String token : tokens) {
				String padded = "_" + token + "_";
				int lastStart = padded.length() - order;
				for (int i = 0; i <= lastStart; i++) {
					String ngram = padded.substring(i, i + order);
					if (!NGRAM_PATTERN.matcher(ngram).find()) {
						continue;
					}
					Integer score = this.ngrams.get(ngram);
					this.ngrams.put(ngram, score == null ? 1 : score + 1);
				}
			}
		}
	}

	/**
	 * Categorizes the FingerPrint by computing the distance to each
	 * FingerPrint in the passed Collection. The category of the FingerPrint
	 * with the lowest distance is assigned to this FingerPrint; on equal
	 * distances the first one encountered wins. If the collection is empty the
	 * category is left unchanged.
	 *
	 * @param categories
	 *            the FingerPrints to compare against
	 * @return the map of category name to distance; this is the same map as
	 *         returned by {@link #getCategoryDistances()} and it is not
	 *         cleared between calls
	 */
	public Map<String, Integer> categorize(Collection<FingerPrint> categories) {
		int minDistance = Integer.MAX_VALUE;
		for (FingerPrint fp : categories) {
			int distance = this.computeDistanceTo(fp);
			this.getCategoryDistances().put(fp.getCategory(), distance);
			if (distance < minDistance) {
				minDistance = distance;
				this.category = fp.getCategory();
			}
		}
		return this.getCategoryDistances();
	}

	/**
	 * @return the distances recorded by {@link #categorize(Collection)},
	 *         keyed by category name
	 */
	public Map<String, Integer> getCategoryDistances() {
		return this.categoryDistances;
	}

	/**
	 * Computes and returns the distance of this FingerPrint to the FingerPrint
	 * passed to the method. Only the first {@value #MAX_NGRAMS_COMPARED}
	 * entries of this FingerPrint are considered. An n-gram missing in the
	 * category costs the category's number of n-grams, otherwise the absolute
	 * difference of the two positions.
	 *
	 * @param category
	 *            the FingerPrint to be compared to this one
	 * @return the distance of the passed FingerPrint to this FingerPrint
	 */
	private int computeDistanceTo(FingerPrint category) {
		int distance = 0;
		int count = 0;
		for (Entry<String, Integer> entry : this.entries) {
			if (++count > MAX_NGRAMS_COMPARED) {
				break;
			}
			String ngram = entry.getKey();
			if (category.containsNgram(ngram)) {
				distance += Math.abs(this.getPosition(ngram)
						- category.getPosition(ngram));
			} else {
				distance += category.numNgrams();
			}
		}
		return distance;
	}

	/**
	 * @param ngram
	 *            the n-gram to look up
	 * @return true if this FingerPrint contains the passed n-gram
	 */
	public boolean containsNgram(String ngram) {
		return this.ngrams.containsKey(ngram);
	}

	/**
	 * @return the number of distinct n-grams in this FingerPrint
	 */
	public int numNgrams() {
		return this.ngrams.size();
	}

	/**
	 * Reads a FingerPrint from the passed InputStream, decoded as UTF-8.
	 *
	 * @param is
	 *            InputStream to be read
	 * @throws FingerPrintFileException
	 *             if reading fails
	 */
	private void loadFingerPrintFromInputStream(InputStream is)
			throws FingerPrintFileException {
		this.loadFingerPrintFromInputStream(is, DEFAULT_ENCODING);
	}

	/**
	 * Reads a FingerPrint from the passed InputStream. Every non-empty line is
	 * split at whitespace; the first field is taken as the n-gram and the
	 * second as its count. Lines with fewer than two fields are skipped. The
	 * stream belongs to the caller and is not closed here.
	 *
	 * @param is
	 *            InputStream to be read
	 * @param encoding
	 *            name of the charset used to decode the stream
	 * @throws FingerPrintFileException
	 *             if the encoding is unsupported or reading fails
	 */
	private void loadFingerPrintFromInputStream(InputStream is, String encoding)
			throws FingerPrintFileException {
		this.entries.clear();
		MyProperties properties = new MyProperties();
		try {
			BufferedReader reader = new BufferedReader(new InputStreamReader(
					is, encoding));
			String line;
			while ((line = reader.readLine()) != null) {
				if (line.equals("")) {
					continue;
				}
				String[] property = line.split("\\s+");
				if (property.length >= 2) {
					properties.put(property[0], property[1]);
				}
			}
			for (Entry<String, String> entry : properties.entrySet()) {
				this.ngrams.put(entry.getKey(), Integer.parseInt(entry
						.getValue()));
			}
			this.entries.addAll(this.ngrams.entrySet());
		} catch (UnsupportedEncodingException e) {
			throw new FingerPrintFileException(e);
		} catch (IOException e) {
			throw new FingerPrintFileException(e);
		}
	}

	/**
	 * Reads a FingerPrint from the file referenced by the passed path. If the
	 * file name has the form <code>&lt;name&gt;-&lt;encoding&gt;.lm</code> and
	 * the encoding is a charset supported by this JVM, the file is decoded
	 * with it; otherwise UTF-8 is used. Nothing is read if the path denotes a
	 * directory.
	 *
	 * @param file
	 *            FingerPrint file to be read
	 * @throws FingerPrintFileException
	 *             if the file cannot be opened or read
	 */
	private void loadFingerPrintFromFile(String file)
			throws FingerPrintFileException {
		File fpFile = new File(file);
		if (fpFile.isDirectory()) {
			return;
		}
		String encoding = encodingFromFileName(fpFile.getName());
		FileInputStream fis;
		try {
			fis = new FileInputStream(fpFile);
		} catch (FileNotFoundException e) {
			throw new FingerPrintFileException(e);
		}
		try {
			this.loadFingerPrintFromInputStream(fis,
					encoding != null ? encoding : DEFAULT_ENCODING);
		} finally {
			// the stream was opened here, so it is released here
			closeQuietly(fis);
		}
	}

	/**
	 * Extracts the encoding from a FingerPrint file name.
	 *
	 * @param fileName
	 *            file name without directory part
	 * @return the charset name found between the first '-' and the ".lm"
	 *         suffix, or null if there is none or it is not a supported
	 *         charset
	 */
	private static String encodingFromFileName(String fileName) {
		// find(), not matches(): the pattern is only anchored at the end and
		// the file name normally starts with the category name
		Matcher matcher = FILE_PATTERN.matcher(fileName);
		if (!matcher.find()) {
			return null;
		}
		String candidate = matcher.group(1);
		try {
			return Charset.isSupported(candidate) ? candidate : null;
		} catch (IllegalArgumentException e) {
			// not a legal charset name (e.g. empty): treat as "no encoding given"
			return null;
		}
	}

	/** Closes the passed resource; a failure is only reported on stderr. */
	private static void closeQuietly(Closeable resource) {
		try {
			resource.close();
		} catch (IOException ioe) {
			ioe.printStackTrace();
		}
	}

	/**
	 * Gets the position of the passed n-gram in the FingerPrint. Positions
	 * start at 1 and advance whenever the count changes while walking the
	 * sorted entries, so n-grams with the same count that follow each other
	 * share a position.
	 *
	 * @param key
	 *            the n-gram
	 * @return the position of the n-gram in the FingerPrint, or -1 if the
	 *         FingerPrint is empty or does not contain the n-gram
	 */
	public int getPosition(String key) {
		if (this.entries.isEmpty()) {
			return -1;
		}
		int pos = 1;
		int value = this.entries.first().getValue();
		for (Entry<String, Integer> entry : this.entries) {
			int current = entry.getValue();
			if (value != current) {
				value = current;
				pos++;
			}
			if (entry.getKey().equals(key)) {
				return pos;
			}
		}
		return -1;
	}

	/**
	 * Saves the FingerPrint to a file named
	 * <code>&lt;category&gt;-utf8.lm</code> in the execution path. Nothing is
	 * written if that file already exists. I/O errors are reported on stderr
	 * only.
	 */
	public void save() {
		File file = new File(this.getCategory() + "-utf8.lm");
		try {
			if (file.createNewFile()) {
				FileOutputStream fos = new FileOutputStream(file);
				try {
					fos.write(this.toString().getBytes("utf8"));
				} finally {
					fos.close();
				}
			}
		} catch (IOException ioe) {
			// covers FileNotFoundException as well
			ioe.printStackTrace();
		}
	}

	/**
	 * Returns the category of the FingerPrint or "unknown" if the FingerPrint
	 * wasn't categorized yet.
	 *
	 * @return the category of the FingerPrint
	 */
	public String getCategory() {
		return this.category;
	}

	/**
	 * Returns the FingerPrint as a String in the FingerPrint file format: one
	 * line per n-gram, consisting of the n-gram, a tab and its count.
	 *
	 * @return the serialized FingerPrint
	 */
	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder();
		for (Entry<String, Integer> entry : this.entries) {
			sb.append(entry.getKey()).append("\t").append(entry.getValue())
					.append("\n");
		}
		return sb.toString();
	}

	/**
	 * Sets the category of the FingerPrint.
	 *
	 * @param category
	 *            the category
	 */
	protected void setCategory(String category) {
		this.category = category;
	}
}