1   /*
2    *  Copyright (c) 1998-2001, The University of Sheffield.
3    *
4    *  This file is part of GATE (see http://gate.ac.uk/), and is free
5    *  software, licenced under the GNU Library General Public License,
6    *  Version 2, June 1991 (in the distribution as file licence.html,
7    *  and also available at http://gate.ac.uk/gate/licence.html).
8    *
9    *  Valentin Tablan 03/04/2003
10   *
11   *  $Id: StringToNominalFilter.java,v 1.2 2003/05/23 09:52:09 valyt Exp $
12   *
13   */
14  
15  package gate.creole.ml.weka;
16  
17  import java.util.*;
18  import java.io.Serializable;
19  
20  import weka.core.*;
21  import weka.filters.*;
22  
23  /**
24   * This filter converts one or more string attributes from the input dataset
25   * into nominal attributes.
26   */
27  public class StringToNominalFilter extends Filter implements OptionHandler{
28    /**
29     * Anonymous constructor.
30     */
31    public StringToNominalFilter() {
32    }
33  
34    /**
35     * Sets the format of the input instances.
36     *
37     * @param instanceInfo an Instances object containing the input
38     * instance structure (any instances contained in the object are
39     * ignored - only the structure is required).
40     * @return <tt>false</tt> as this filter needs to see all the instances
41     * before being able to convert the input.
42     * @exception UnsupportedAttributeTypeException if the selected attribute
43     * is not a string attribute.
44     */
45    public boolean setInputFormat(Instances instanceInfo)
46         throws Exception {
47      super.setInputFormat(instanceInfo);
48      Iterator attIter = attributesData.iterator();
49      while(attIter.hasNext()){
50        AttributeData aData = (AttributeData)attIter.next();
51        if (!instanceInfo.attribute(aData.index).isString()) {
52          throw new UnsupportedAttributeTypeException(
53            "Attribute at selcted index " + aData.index +
54            " is not of type string!");
55        }
56      }
57      return false;
58    }
59  
60    /**
61     * Input an instance for filtering. The instance is processed
62     * and made available for output immediately.
63     *
64     * @param instance the input instance.
65     * @return true if the filtered instance may now be
66     * collected with output().
67     * @exception IllegalStateException if no input structure has been defined.
68     */
69    public boolean input(Instance instance) {
70      if (getInputFormat() == null) {
71        throw new IllegalStateException("No input instance format defined");
72      }
73  
74      if (m_NewBatch) {
75        resetQueue();
76        m_NewBatch = false;
77      }
78  
79      bufferInput(instance);
80      return false;
81    }
82  
83    /**
84     * Signifies that this batch of input to the filter is finished. If the
85     * filter requires all instances prior to filtering, output() may now
86     * be called to retrieve the filtered instances.
87     *
88     * @return true if there are instances pending output.
89     * @exception IllegalStateException if no input structure has been defined.
90     */
91    public boolean batchFinished() {
92      if (getInputFormat() == null) {
93        throw new IllegalStateException("No input instance format defined");
94      }
95      //do the maths
96      buildOutputFormat();
97  
98      // Convert pending input instances
99      for(int i = 0; i < getInputFormat().numInstances(); i++) {
100       push(processInstance(getInputFormat().instance(i)));
101     }
102 
103     flushInput();
104     m_NewBatch = true;
105     return (numPendingOutput() != 0);
106   }
107 
108 
109   /**
110    * Called after a batch of input has finished. Will perform all the necessary
111    * calculations to define the output format and build the data needed in order
112    * to convert the input instances into output.
113    */
114   protected void buildOutputFormat(){
115     //build the new instances value
116     //collect the frequency data for all the words into a map
117     //(String)word -> (Integer)Attribute Index -> (Double)Class value ->
118     //->(Integer)count
119     Map wordData = new HashMap();
120     //for all input instances
121     for(int i = 0; i < getInputFormat().numInstances(); i++) {
122       Instance instance = getInputFormat().instance(i);
123       //for all attributes that need processing
124       Iterator attIter = attributesData.iterator();
125       while(attIter.hasNext()){
126         AttributeData aData = (AttributeData)attIter.next();
127         String word = instance.stringValue(aData.index);
128         //get the map for this word
129         Map wMap = (Map)wordData.get(word);
130         if(wMap == null){
131           wMap = new HashMap();
132           wordData.put(word, wMap);
133         }
134         //get the map for word->attribute
135         Integer attIndex = new Integer(aData.index);
136         Map w_aMap = (Map)wMap.get(attIndex);
137         if(w_aMap == null){
138           w_aMap = new HashMap();
139           wMap.put(attIndex, w_aMap);
140         }
141         //get the count for word->attribute->class
142         Double classValue = new Double(instance.classValue());
143         WordData w_a_cCount = (WordData)w_aMap.get(classValue);
144         //increment the count
145         if(w_a_cCount == null){
146           w_a_cCount = new WordData(word, aData.index, classValue, 1);
147           w_aMap.put(classValue, w_a_cCount);
148         }else{
149           w_a_cCount.inc();
150         }
151       }
152     }
153     // Compute new attributes
154     Instances newData;
155     FastVector newAtts, newVals;
156     //start with a copy of the initial dataset header
157     newAtts = new FastVector(getInputFormat().numAttributes());
158     for (int i = 0; i < getInputFormat().numAttributes(); i++) {
159       Attribute att = getInputFormat().attribute(i);
160       newAtts.addElement(att.copy());
161     }
162 
163     //replace the filtered attributes
164     Iterator attIter = attributesData.iterator();
165     while(attIter.hasNext()){
166       AttributeData aData = (AttributeData)attIter.next();
167       FastVector values = new FastVector(aData.maxCount);
168       if(aData.method.equalsIgnoreCase(FREQUENCY)){
169         List wordFreqs = new ArrayList(wordData.size());
170         Iterator entryIter = wordData.entrySet().iterator();
171         while(entryIter.hasNext()){
172           Map.Entry entry = (Map.Entry)entryIter.next();
173           String word = (String)entry.getKey();
174           Map w_aMap = (Map)((Map)entry.getValue()).get(new Integer(aData.index));
175           int count = addLeaves(w_aMap);
176           wordFreqs.add(new WordCount(word, count));
177         }
178         int start = 0;
179         if(wordFreqs.size() > aData.maxCount){
180           Collections.sort(wordFreqs);
181           start = wordFreqs.size() - aData.maxCount;
182         }
183         for(int i = wordFreqs.size() -1; i >= start; i--){
184           values.addElement(((WordCount)wordFreqs.get(i)).word);
185         }
186 System.out.println("Values count" + values.size());
187       }else if(aData.method.equalsIgnoreCase(TFIDF)){
188         int classCount = getInputFormat().classAttribute().numValues();
189         List wordTFIDFValues = new ArrayList(wordData.size());
190         Iterator entryIter = wordData.entrySet().iterator();
191         while(entryIter.hasNext()){
192           Map.Entry entry = (Map.Entry)entryIter.next();
193           String word = (String)entry.getKey();
194           Map w_aMap = (Map)((Map)entry.getValue()).get(new Integer(aData.index));
195           if(w_aMap == null || w_aMap.isEmpty()) continue;
196           int count = addLeaves(w_aMap);
197           int classFreq = w_aMap.size();
198           double tfidf = count * Math.log(classCount/classFreq);
199           wordTFIDFValues.add(new WordCount(word, count, tfidf));
200         }
201         int start = 0;
202         if(wordTFIDFValues.size() > aData.maxCount){
203           Collections.sort(wordTFIDFValues, new Comparator(){
204             public int compare(Object o1, Object o2){
205               double value = ((WordCount)o1).tfidf - ((WordCount)o2).tfidf;
206               if(value > Utils.SMALL) return 1;
207               else if(value < -Utils.SMALL) return -1;
208               else return 0;
209             }
210           });
211           start = wordTFIDFValues.size() - aData.maxCount;
212         }
213         for(int i = wordTFIDFValues.size() -1; i >= start; i--){
214           values.addElement(((WordCount)wordTFIDFValues.get(i)).word);
215         }
216 
217       }
218       Attribute oldAttr = (Attribute)newAtts.elementAt(aData.index);
219       Attribute newAttribute = new Attribute(oldAttr.name(), values);
220 System.out.println("Atribute \"" + newAttribute.name() + "\":" + values.size());
221       newAtts.setElementAt(newAttribute, aData.index);
222     }
223 
224     // Construct new header
225     newData = new Instances(getInputFormat().relationName(), newAtts, 0);
226     newData.setClassIndex(getInputFormat().classIndex());
227     setOutputFormat(newData);
228   }
229 
230   public static void main(String[] args){
231     try{
232       StringToNominalFilter filter = new StringToNominalFilter();
233       filter.setOptions(new String[]{"-A", "10,200,TFIDF",
234                                      "-A", "11,200,TFIDF",
235                                      "-A", "12,200,TFIDF"});
236       Instances input = new Instances(
237         new java.io.FileReader("D:\\tmp\\ML-Weka\\Strings\\MUC7dataset.arff"));
238       input.setClassIndex(18);
239       filter.setInputFormat(input);
240       for(int i = 0; i < input.numInstances(); i++){
241         filter.input(input.instance(i));
242       }
243       filter.batchFinished();
244       Instances output = filter.getOutputFormat();
245       Instance instance = filter.output();
246       while(instance != null){
247         output.add(instance);
248         instance = filter.output();
249       }
250       java.io.FileWriter fw = new java.io.FileWriter("D:\\tmp\\ML-Weka\\Strings\\MUC7dataset.filtered.arff");
251       fw.write(output.toString());
252       fw.flush();
253       fw.close();
254     }catch(Exception e){
255       e.printStackTrace();
256     }
257   }
258 
259   protected int addLeaves(Map map){
260     int res = 0;
261     Iterator valuesIter = map.values().iterator();
262     while(valuesIter.hasNext()){
263       Object value = valuesIter.next();
264       if(value instanceof WordData) res += ((WordData)value).count;
265       else if(value instanceof Map) res += addLeaves((Map)value);
266     }
267     return res;
268   }
269 
270   /**
271    * Once the output format is defined this method can be used to covert
272    * input instances into output instances.
273    * @param inputInstance
274    * @return the coverted output instance.
275    */
276   protected Instance processInstance(Instance inputInstance){
277     Instance  newInstance = new Instance(getOutputFormat().numAttributes());
278     newInstance.setDataset(getOutputFormat());
279     for(int i = 0; i < getOutputFormat().numAttributes(); i++){
280       if(inputInstance.isMissing(i)) newInstance.setMissing(i);
281       else{
282         if(isString(i)){
283           String value = inputInstance.stringValue(i);
284           if(getOutputFormat().attribute(i).indexOfValue(value) == -1){
285             newInstance.setMissing(i);
286           }else{
287             newInstance.setValue(i, value);
288           }
289         }else{
290           newInstance.setValue(i, inputInstance.value(i));
291         }
292       }
293     }
294     return newInstance;
295   }
296 
297   /**
298    * Checks whether the aqttribute at a particular index in the input dataset
299    * is string.
300    * @param index
301    * @return
302    */
303   protected boolean isString(int index){
304     int[] stringIndices = getInputStringIndex();
305     for(int i = 0; i < stringIndices.length; i++)
306       if(stringIndices[i] == index) return true;
307     return false;
308   }
309 
310   public Enumeration listOptions() {
311     return optionsDesc.elements();
312   }
313 
314   public void setOptions(String[] options) throws java.lang.Exception {
315     this.options = options;
316     parseOptions();
317 Iterator itr = attributesData.iterator();
318 while(itr.hasNext()){
319   AttributeData aData = (AttributeData)itr.next();
320   System.out.println("Attribute " + aData.index + " " + aData.maxCount + " " + aData.method);
321 }
322   }
323 
324   public String[] getOptions() {
325     return options;
326   }
327 
328   /**
329    * Parses the set of options supplied to this filter
330    */
331   protected void parseOptions() throws Exception{
332     attributesData = new ArrayList();
333     String option = Utils.getOption('A', options);
334 System.out.print("Option " + option);
335     while(option != null && option.length() > 0){
336       StringTokenizer strTok = new StringTokenizer(option, ",", false);
337       int index = Integer.parseInt(strTok.nextToken());
338 System.out.print(": " + index);
339       int maxCnt = Integer.parseInt(strTok.nextToken());
340 System.out.print(": " + maxCnt);
341       //check if we got a method
342       String method = null;
343       if(strTok.hasMoreTokens()){
344         method = strTok.nextToken();
345         if(!method.equalsIgnoreCase(FREQUENCY) &&
346            !method.equalsIgnoreCase(TFIDF)){
347           throw new Exception("Unknown filtering method: " + method);
348         }
349       }
350       attributesData.add(new AttributeData(index, maxCnt, method));
351       //get the next "-A" option
352       option = Utils.getOption('A', options);
353     }
354   }
355 
356   /**
357    * Stores data about one attribute to be converted.
358    */
359   protected static class AttributeData implements Serializable{
360     public AttributeData(int index, int count, String method){
361       this.index = index;
362       this.maxCount = count;
363       this.method = method;
364     }
365 
366     int index;
367     int maxCount;
368     String method;
369   }
370 
371   protected static class WordData{
372     public WordData(String word, int attrIndex, Double classValue, int count){
373       this.word = word;
374       this.attributeIndex = attrIndex;
375       this.classValue = classValue;
376       this.count = count;
377     }
378 
379     public void inc(){
380       count ++;
381     }
382     String word;
383     int attributeIndex;
384     Double classValue;
385     int count;
386   }
387 
388   protected static class WordCount implements Comparable{
389     public WordCount(String word, int count){
390       this.word = word;
391       this.count = count;
392       tfidf = -1;
393     }
394 
395     public WordCount(String word, int count, double tfidf){
396       this.word = word;
397       this.count = count;
398       this.tfidf = tfidf;
399     }
400 
401     public int compareTo(Object other){
402       return count - ((WordCount)other).count;
403     }
404 
405     String word;
406     int count;
407     double tfidf;
408   }
409 
410   /**
411    * The options set on this filter.
412    */
413   private String[] options;
414 
415   protected List attributesData;
416   /**
417    * The description for the options accepted by this filter
418    */
419   protected static Vector optionsDesc;
420 
421   /**
422    * Constant for conversion method.
423    */
424   public static final String FREQUENCY = "FREQ";
425 
426   /**
427    * Constant for conversion method.
428    */
429   public static final String TFIDF = "TFIDF";
430   /**
431    * Static initialiser: creates the description for this filter's options.
432    */
433   static{
434     optionsDesc = new Vector(1);
435     Option option = new Option(
436     "Selects one attribute for conversion. " +
437     "The optional <method> argument can be one of FREQ or TFIDF " +
438     "(the default is FREQ). " +
439     "This option can be repeated for as many attributes as necessary.",
440     "A", 1, "-A <index>,<max count>[,<method>] ...");
441     optionsDesc.add(option);
442   }
443 }