|
StringToNominalFilter |
|
1 /* 2 * Copyright (c) 1998-2001, The University of Sheffield. 3 * 4 * This file is part of GATE (see http://gate.ac.uk/), and is free 5 * software, licenced under the GNU Library General Public License, 6 * Version 2, June 1991 (in the distribution as file licence.html, 7 * and also available at http://gate.ac.uk/gate/licence.html). 8 * 9 * Valentin Tablan 03/04/2003 10 * 11 * $Id: StringToNominalFilter.java,v 1.2 2003/05/23 09:52:09 valyt Exp $ 12 * 13 */ 14 15 package gate.creole.ml.weka; 16 17 import java.util.*; 18 import java.io.Serializable; 19 20 import weka.core.*; 21 import weka.filters.*; 22 23 /** 24 * This filter converts one or more string attributes from the input dataset 25 * into nominal attributes. 26 */ 27 public class StringToNominalFilter extends Filter implements OptionHandler{ 28 /** 29 * Anonymous constructor. 30 */ 31 public StringToNominalFilter() { 32 } 33 34 /** 35 * Sets the format of the input instances. 36 * 37 * @param instanceInfo an Instances object containing the input 38 * instance structure (any instances contained in the object are 39 * ignored - only the structure is required). 40 * @return <tt>false</tt> as this filter needs to see all the instances 41 * before being able to convert the input. 42 * @exception UnsupportedAttributeTypeException if the selected attribute 43 * is not a string attribute. 44 */ 45 public boolean setInputFormat(Instances instanceInfo) 46 throws Exception { 47 super.setInputFormat(instanceInfo); 48 Iterator attIter = attributesData.iterator(); 49 while(attIter.hasNext()){ 50 AttributeData aData = (AttributeData)attIter.next(); 51 if (!instanceInfo.attribute(aData.index).isString()) { 52 throw new UnsupportedAttributeTypeException( 53 "Attribute at selcted index " + aData.index + 54 " is not of type string!"); 55 } 56 } 57 return false; 58 } 59 60 /** 61 * Input an instance for filtering. The instance is processed 62 * and made available for output immediately. 63 * 64 * @param instance the input instance. 65 * @return true if the filtered instance may now be 66 * collected with output(). 67 * @exception IllegalStateException if no input structure has been defined. 68 */ 69 public boolean input(Instance instance) { 70 if (getInputFormat() == null) { 71 throw new IllegalStateException("No input instance format defined"); 72 } 73 74 if (m_NewBatch) { 75 resetQueue(); 76 m_NewBatch = false; 77 } 78 79 bufferInput(instance); 80 return false; 81 } 82 83 /** 84 * Signifies that this batch of input to the filter is finished. If the 85 * filter requires all instances prior to filtering, output() may now 86 * be called to retrieve the filtered instances. 87 * 88 * @return true if there are instances pending output. 89 * @exception IllegalStateException if no input structure has been defined. 90 */ 91 public boolean batchFinished() { 92 if (getInputFormat() == null) { 93 throw new IllegalStateException("No input instance format defined"); 94 } 95 //do the maths 96 buildOutputFormat(); 97 98 // Convert pending input instances 99 for(int i = 0; i < getInputFormat().numInstances(); i++) { 100 push(processInstance(getInputFormat().instance(i))); 101 } 102 103 flushInput(); 104 m_NewBatch = true; 105 return (numPendingOutput() != 0); 106 } 107 108 109 /** 110 * Called after a batch of input has finished. Will perform all the necessary 111 * calculations to define the output format and build the data needed in order 112 * to convert the input instances into output. 113 */ 114 protected void buildOutputFormat(){ 115 //build the new instances value 116 //collect the frequency data for all the words into a map 117 //(String)word -> (Integer)Attribute Index -> (Double)Class value -> 118 //->(Integer)count 119 Map wordData = new HashMap(); 120 //for all input instances 121 for(int i = 0; i < getInputFormat().numInstances(); i++) { 122 Instance instance = getInputFormat().instance(i); 123 //for all attributes that need processing 124 Iterator attIter = attributesData.iterator(); 125 while(attIter.hasNext()){ 126 AttributeData aData = (AttributeData)attIter.next(); 127 String word = instance.stringValue(aData.index); 128 //get the map for this word 129 Map wMap = (Map)wordData.get(word); 130 if(wMap == null){ 131 wMap = new HashMap(); 132 wordData.put(word, wMap); 133 } 134 //get the map for word->attribute 135 Integer attIndex = new Integer(aData.index); 136 Map w_aMap = (Map)wMap.get(attIndex); 137 if(w_aMap == null){ 138 w_aMap = new HashMap(); 139 wMap.put(attIndex, w_aMap); 140 } 141 //get the count for word->attribute->class 142 Double classValue = new Double(instance.classValue()); 143 WordData w_a_cCount = (WordData)w_aMap.get(classValue); 144 //increment the count 145 if(w_a_cCount == null){ 146 w_a_cCount = new WordData(word, aData.index, classValue, 1); 147 w_aMap.put(classValue, w_a_cCount); 148 }else{ 149 w_a_cCount.inc(); 150 } 151 } 152 } 153 // Compute new attributes 154 Instances newData; 155 FastVector newAtts, newVals; 156 //start with a copy of the initial dataset header 157 newAtts = new FastVector(getInputFormat().numAttributes()); 158 for (int i = 0; i < getInputFormat().numAttributes(); i++) { 159 Attribute att = getInputFormat().attribute(i); 160 newAtts.addElement(att.copy()); 161 } 162 163 //replace the filtered attributes 164 Iterator attIter = attributesData.iterator(); 165 while(attIter.hasNext()){ 166 AttributeData aData = (AttributeData)attIter.next(); 167 FastVector values = new FastVector(aData.maxCount); 168 if(aData.method.equalsIgnoreCase(FREQUENCY)){ 169 List wordFreqs = new ArrayList(wordData.size()); 170 Iterator entryIter = wordData.entrySet().iterator(); 171 while(entryIter.hasNext()){ 172 Map.Entry entry = (Map.Entry)entryIter.next(); 173 String word = (String)entry.getKey(); 174 Map w_aMap = (Map)((Map)entry.getValue()).get(new Integer(aData.index)); 175 int count = addLeaves(w_aMap); 176 wordFreqs.add(new WordCount(word, count)); 177 } 178 int start = 0; 179 if(wordFreqs.size() > aData.maxCount){ 180 Collections.sort(wordFreqs); 181 start = wordFreqs.size() - aData.maxCount; 182 } 183 for(int i = wordFreqs.size() -1; i >= start; i--){ 184 values.addElement(((WordCount)wordFreqs.get(i)).word); 185 } 186 System.out.println("Values count" + values.size()); 187 }else if(aData.method.equalsIgnoreCase(TFIDF)){ 188 int classCount = getInputFormat().classAttribute().numValues(); 189 List wordTFIDFValues = new ArrayList(wordData.size()); 190 Iterator entryIter = wordData.entrySet().iterator(); 191 while(entryIter.hasNext()){ 192 Map.Entry entry = (Map.Entry)entryIter.next(); 193 String word = (String)entry.getKey(); 194 Map w_aMap = (Map)((Map)entry.getValue()).get(new Integer(aData.index)); 195 if(w_aMap == null || w_aMap.isEmpty()) continue; 196 int count = addLeaves(w_aMap); 197 int classFreq = w_aMap.size(); 198 double tfidf = count * Math.log(classCount/classFreq); 199 wordTFIDFValues.add(new WordCount(word, count, tfidf)); 200 } 201 int start = 0; 202 if(wordTFIDFValues.size() > aData.maxCount){ 203 Collections.sort(wordTFIDFValues, new Comparator(){ 204 public int compare(Object o1, Object o2){ 205 double value = ((WordCount)o1).tfidf - ((WordCount)o2).tfidf; 206 if(value > Utils.SMALL) return 1; 207 else if(value < -Utils.SMALL) return -1; 208 else return 0; 209 } 210 }); 211 start = wordTFIDFValues.size() - aData.maxCount; 212 } 213 for(int i = wordTFIDFValues.size() -1; i >= start; i--){ 214 values.addElement(((WordCount)wordTFIDFValues.get(i)).word); 215 } 216 217 } 218 Attribute oldAttr = (Attribute)newAtts.elementAt(aData.index); 219 Attribute newAttribute = new Attribute(oldAttr.name(), values); 220 System.out.println("Atribute \"" + newAttribute.name() + "\":" + values.size()); 221 newAtts.setElementAt(newAttribute, aData.index); 222 } 223 224 // Construct new header 225 newData = new Instances(getInputFormat().relationName(), newAtts, 0); 226 newData.setClassIndex(getInputFormat().classIndex()); 227 setOutputFormat(newData); 228 } 229 230 public static void main(String[] args){ 231 try{ 232 StringToNominalFilter filter = new StringToNominalFilter(); 233 filter.setOptions(new String[]{"-A", "10,200,TFIDF", 234 "-A", "11,200,TFIDF", 235 "-A", "12,200,TFIDF"}); 236 Instances input = new Instances( 237 new java.io.FileReader("D:\\tmp\\ML-Weka\\Strings\\MUC7dataset.arff")); 238 input.setClassIndex(18); 239 filter.setInputFormat(input); 240 for(int i = 0; i < input.numInstances(); i++){ 241 filter.input(input.instance(i)); 242 } 243 filter.batchFinished(); 244 Instances output = filter.getOutputFormat(); 245 Instance instance = filter.output(); 246 while(instance != null){ 247 output.add(instance); 248 instance = filter.output(); 249 } 250 java.io.FileWriter fw = new java.io.FileWriter("D:\\tmp\\ML-Weka\\Strings\\MUC7dataset.filtered.arff"); 251 fw.write(output.toString()); 252 fw.flush(); 253 fw.close(); 254 }catch(Exception e){ 255 e.printStackTrace(); 256 } 257 } 258 259 protected int addLeaves(Map map){ 260 int res = 0; 261 Iterator valuesIter = map.values().iterator(); 262 while(valuesIter.hasNext()){ 263 Object value = valuesIter.next(); 264 if(value instanceof WordData) res += ((WordData)value).count; 265 else if(value instanceof Map) res += addLeaves((Map)value); 266 } 267 return res; 268 } 269 270 /** 271 * Once the output format is defined this method can be used to covert 272 * input instances into output instances. 273 * @param inputInstance 274 * @return the coverted output instance. 275 */ 276 protected Instance processInstance(Instance inputInstance){ 277 Instance newInstance = new Instance(getOutputFormat().numAttributes()); 278 newInstance.setDataset(getOutputFormat()); 279 for(int i = 0; i < getOutputFormat().numAttributes(); i++){ 280 if(inputInstance.isMissing(i)) newInstance.setMissing(i); 281 else{ 282 if(isString(i)){ 283 String value = inputInstance.stringValue(i); 284 if(getOutputFormat().attribute(i).indexOfValue(value) == -1){ 285 newInstance.setMissing(i); 286 }else{ 287 newInstance.setValue(i, value); 288 } 289 }else{ 290 newInstance.setValue(i, inputInstance.value(i)); 291 } 292 } 293 } 294 return newInstance; 295 } 296 297 /** 298 * Checks whether the aqttribute at a particular index in the input dataset 299 * is string. 300 * @param index 301 * @return 302 */ 303 protected boolean isString(int index){ 304 int[] stringIndices = getInputStringIndex(); 305 for(int i = 0; i < stringIndices.length; i++) 306 if(stringIndices[i] == index) return true; 307 return false; 308 } 309 310 public Enumeration listOptions() { 311 return optionsDesc.elements(); 312 } 313 314 public void setOptions(String[] options) throws java.lang.Exception { 315 this.options = options; 316 parseOptions(); 317 Iterator itr = attributesData.iterator(); 318 while(itr.hasNext()){ 319 AttributeData aData = (AttributeData)itr.next(); 320 System.out.println("Attribute " + aData.index + " " + aData.maxCount + " " + aData.method); 321 } 322 } 323 324 public String[] getOptions() { 325 return options; 326 } 327 328 /** 329 * Parses the set of options supplied to this filter 330 */ 331 protected void parseOptions() throws Exception{ 332 attributesData = new ArrayList(); 333 String option = Utils.getOption('A', options); 334 System.out.print("Option " + option); 335 while(option != null && option.length() > 0){ 336 StringTokenizer strTok = new StringTokenizer(option, ",", false); 337 int index = Integer.parseInt(strTok.nextToken()); 338 System.out.print(": " + index); 339 int maxCnt = Integer.parseInt(strTok.nextToken()); 340 System.out.print(": " + maxCnt); 341 //check if we got a method 342 String method = null; 343 if(strTok.hasMoreTokens()){ 344 method = strTok.nextToken(); 345 if(!method.equalsIgnoreCase(FREQUENCY) && 346 !method.equalsIgnoreCase(TFIDF)){ 347 throw new Exception("Unknown filtering method: " + method); 348 } 349 } 350 attributesData.add(new AttributeData(index, maxCnt, method)); 351 //get the next "-A" option 352 option = Utils.getOption('A', options); 353 } 354 } 355 356 /** 357 * Stores data about one attribute to be converted. 358 */ 359 protected static class AttributeData implements Serializable{ 360 public AttributeData(int index, int count, String method){ 361 this.index = index; 362 this.maxCount = count; 363 this.method = method; 364 } 365 366 int index; 367 int maxCount; 368 String method; 369 } 370 371 protected static class WordData{ 372 public WordData(String word, int attrIndex, Double classValue, int count){ 373 this.word = word; 374 this.attributeIndex = attrIndex; 375 this.classValue = classValue; 376 this.count = count; 377 } 378 379 public void inc(){ 380 count ++; 381 } 382 String word; 383 int attributeIndex; 384 Double classValue; 385 int count; 386 } 387 388 protected static class WordCount implements Comparable{ 389 public WordCount(String word, int count){ 390 this.word = word; 391 this.count = count; 392 tfidf = -1; 393 } 394 395 public WordCount(String word, int count, double tfidf){ 396 this.word = word; 397 this.count = count; 398 this.tfidf = tfidf; 399 } 400 401 public int compareTo(Object other){ 402 return count - ((WordCount)other).count; 403 } 404 405 String word; 406 int count; 407 double tfidf; 408 } 409 410 /** 411 * The options set on this filter. 412 */ 413 private String[] options; 414 415 protected List attributesData; 416 /** 417 * The description for the options accepted by this filter 418 */ 419 protected static Vector optionsDesc; 420 421 /** 422 * Constant for conversion method. 423 */ 424 public static final String FREQUENCY = "FREQ"; 425 426 /** 427 * Constant for conversion method. 428 */ 429 public static final String TFIDF = "TFIDF"; 430 /** 431 * Static initialiser: creates the description for this filter's options. 432 */ 433 static{ 434 optionsDesc = new Vector(1); 435 Option option = new Option( 436 "Selects one attribute for conversion. " + 437 "The optional <method> argument can be one of FREQ or TFIDF " + 438 "(the default is FREQ). " + 439 "This option can be repeated for as many attributes as necessary.", 440 "A", 1, "-A <index>,<max count>[,<method>] ..."); 441 optionsDesc.add(option); 442 } 443 }
|
StringToNominalFilter |
|