1   /*
2    *  Copyright (c) 1998-2001, The University of Sheffield.
3    *
4    *  This file is part of GATE (see http://gate.ac.uk/), and is free
5    *  software, licenced under the GNU Library General Public License,
6    *  Version 2, June 1991 (in the distribution as file licence.html,
7    *  and also available at http://gate.ac.uk/gate/licence.html).
8    *
9    *  Valentin Tablan 28 May 2002
10   *
11   *  $Id: DataCollector.java,v 1.1 2002/06/27 17:12:32 valyt Exp $
12   */
13  package gate.ml;
14  
15  import java.util.*;
16  import javax.xml.parsers.*;
17  import java.net.*;
18  
19  import gate.*;
20  import gate.util.*;
21  import gate.creole.*;
22  
23  import weka.core.*;
24  /**
25   * Collects training data from a corpus.
26   * It iterates through the offsets in the annotation set and uses an instance
27   * detector to find instances and a set of attribute detectors to find the
28   * associated attributes.
29   */
30  public class DataCollector extends AbstractLanguageAnalyser {
31    /**
32     * Default constructor.
33     */
34    public DataCollector() {
35      attributeDetectors = new ArrayList();
36      inputTypes = new ArrayList();
37    }
38  
39    /**
40     * Gets the annotations that start at a given offset.
41     * Returns null or an empty list if none found.
42     * @param offset a Long value
43     * @return a Set value.
44     */
45    public Set getStartingAnnotations(Long offset){
46      AnnotationsLists existingAnnotations = (AnnotationsLists)
47                                             annotationsByOffset.get(offset);
48      if(existingAnnotations == null) return null;
49      else return existingAnnotations.startingAnnotations;
50    }
51  
52    /**
53     * Gets the annotations that end at a given offset.
54     * Returns null or an empty list if none found.
55     * @param offset a Long value
56     * @return a Set value.
57     */
58    public Set getEndingAnnotations(Long offset){
59      AnnotationsLists existingAnnotations = (AnnotationsLists)
60                                             annotationsByOffset.get(offset);
61      if(existingAnnotations == null) return null;
62      else return existingAnnotations.endingAnnotations;
63    }
64  
65    /**
66     * Gets the next offset for a given offset using the natural ordering.
67     * @param offset a Long value
68     * @return a Long value.
69     */
70    public Long nextOffset(Long offset){
71      if(annotationsByOffset == null ||
72         annotationsByOffset.isEmpty()) return null;
73      SortedMap tailMap = annotationsByOffset.tailMap(
74                          new Long(offset.longValue() + 1));
75      return (Long)((tailMap == null || tailMap.isEmpty()) ? null :
76                                                             tailMap.firstKey());
77    }
78  
79    /**
80     * Gets the next offset for a given offset using the natural ordering.
81     * @param offset a Long value
82     * @return a Long value.
83     */
84    public Long previousOffset(Long offset){
85      if(annotationsByOffset == null ||
86         annotationsByOffset.isEmpty()) return null;
87      SortedMap headMap = annotationsByOffset.subMap(
88                            annotationsByOffset.firstKey(), offset);
89      return (Long)((headMap == null || headMap.isEmpty()) ? null :
90                                                             headMap.lastKey());
91    }
92  
93    public void execute() throws ExecutionException{
94      //check the input
95      if(document == null)
96        throw new ExecutionException("No document to process!");
97      if(annotationSetName == null ||
98         annotationSetName.equals("")) annotationSet = document.getAnnotations();
99      else annotationSet = document.getAnnotations(annotationSetName);
100 
101 
102     fireStatusChanged("Extracting data from " + document.getName() + "...");
103 
104     //get all the relevant offsets
105     annotationsByOffset = new TreeMap();
106 
107     Iterator annIter = annotationSet.iterator();
108     while(annIter.hasNext()){
109       Annotation annotation = (Annotation)annIter.next();
110       Long startOffset = annotation.getStartNode().getOffset();
111       AnnotationsLists existingAnnotations = (AnnotationsLists)
112                                               annotationsByOffset.
113                                               get(startOffset);
114       if(existingAnnotations == null){
115         existingAnnotations = new AnnotationsLists();
116         annotationsByOffset.put(startOffset, existingAnnotations);
117       }
118       existingAnnotations.startingAnnotations.add(annotation);
119 
120       Long endOffset = annotation.getEndNode().getOffset();
121       existingAnnotations = (AnnotationsLists)annotationsByOffset.
122                                               get(endOffset);
123       if(existingAnnotations == null){
124         existingAnnotations = new AnnotationsLists();
125         annotationsByOffset.put(endOffset, existingAnnotations);
126       }
127       existingAnnotations.endingAnnotations.add(annotation);
128     }
129 
130     //parse through all the offsets
131     Iterator offsetsIter = annotationsByOffset.keySet().iterator();
132     while(offsetsIter.hasNext()){
133       fireDataAdvance((Long) offsetsIter.next());
134     }
135   }//public void execute() throws ExecutionException{
136 
137   /**
138    * Adds a new instance to the dataset being constructed.
139    * @param instance the instance value to be added.
140    */
141   public void addInstance(Instance instance){
142     dataSet.add(instance);
143   }
144 
145   public static void main(String[] args) {
146   }
147 
148 
149   public void setConfigFileURL(URL configFileURL) {
150     this.configFileURL = configFileURL;
151   }
152 
153   public URL getConfigFileURL() {
154     return configFileURL;
155   }
156 
157   public Resource init(){
158     readConfigFile();
159     //prepare the dataset
160     FastVector attributes = new FastVector();
161     Iterator attIter = attributeDetectors.iterator();
162     while(attIter.hasNext()){
163       attributes.addElement(((AttributeDetector)attIter.next()).getAttribute());
164     }
165     //add the attribute for the class
166     attributes.addElement(instanceDetector.getClassAttribute());
167     dataSet = new Instances(getName() + " Dataset", attributes, 0);
168 
169     return this;
170   }
171 
172   /**
173    * Reads the configuration file and populates internal data with values.
174    */
175   protected void readConfigFile(){
176     //hardcoded for now
177     AnnotationDetector annotationDetector = new AnnotationDetector();
178     annotationDetector.setAnnotationTypes("Date,Person,Location,Organization,Money");
179     setInstanceDetector(annotationDetector);
180     //Add attributes now
181 
182     //annotation length (in tokens)
183     addAttributeDetector(new AnnotationLengthExtractor());
184     //POS category and orthography for the first 7 tokens
185     for(int i = 1; i <= 7; i++){
186       POSCategoryExtractor posExtractor = new POSCategoryExtractor();
187       posExtractor.setPosition(i);
188       //look in the right context too
189 //      posExtractor.setIgnoreRightContext(false);
190       addAttributeDetector(posExtractor);
191 
192       TokenOrthographyExtractor orthExtractor = new TokenOrthographyExtractor();
193       orthExtractor.setPosition(i);
194       addAttributeDetector(orthExtractor);
195     }
196 
197 
198     //POS category and orthography for 3 tokens left context
199     for(int i = -1; i >= -3; i--){
200       POSCategoryExtractor posExtractor = new POSCategoryExtractor();
201       posExtractor.setPosition(i);
202       addAttributeDetector(posExtractor);
203 
204       TokenOrthographyExtractor orthExtractor = new TokenOrthographyExtractor();
205       orthExtractor.setPosition(i);
206       addAttributeDetector(orthExtractor);
207     }
208 
209     //Lookup type and position for the first 3 lookups
210     LookupDetector lookupDetector = new LookupDetector();
211     //type - 1
212     addAttributeDetector(lookupDetector);
213     //position - 1
214     addAttributeDetector(lookupDetector);
215     //type - 2
216     addAttributeDetector(lookupDetector);
217     //position - 2
218     addAttributeDetector(lookupDetector);
219     //type -3
220     addAttributeDetector(lookupDetector);
221     //position - 3
222     addAttributeDetector(lookupDetector);
223   }
224 
225   public void setState(int state) {
226     this.state = state;
227   }
228 
229   public int getState() {
230     return state;
231   }
232   public synchronized void removeDataListener(DataListener l) {
233     if (dataListeners != null && dataListeners.contains(l)) {
234       Vector v = (Vector) dataListeners.clone();
235       v.removeElement(l);
236       dataListeners = v;
237     }
238   }
239   public synchronized void addDataListener(DataListener l) {
240     Vector v = dataListeners == null ? new Vector(2) : (Vector) dataListeners.clone();
241     if (!v.contains(l)) {
242       v.addElement(l);
243       dataListeners = v;
244       l.setDataCollector(this);
245     }
246   }
247 
248   /**
249    * URL to the file containing the configuration.
250    */
251   protected URL configFileURL;
252 
253   /**
254    * The types of annotation to be considered. Annotations of types not
255    * contained here will be ignored.
256    */
257   List inputTypes;
258 
259   protected AnnotationSet annotationSet;
260   public Instances getDataSet(){
261     return dataSet;
262   }
263 
264   protected Instances dataSet;
265 
266   protected InstanceDetector instanceDetector;
267 
268   /**
269    * Stores the annotations from the input annotation set by offset (starting
270    * and ending). Maps from Long (offset) to {@link AnnotationsLists}.
271    */
272   protected SortedMap annotationsByOffset;
273 
274   /**
275    * A structure that stores the annotations relevant for an offset: a list of
276    * annotations that start at the offset and a list of annotations that end at
277    * the offset.
278    */
279   protected static class AnnotationsLists{
280     public AnnotationsLists(){
281       startingAnnotations = new HashSet();
282       endingAnnotations = new HashSet();
283     }
284 
285     public Set startingAnnotations;
286     public Set endingAnnotations;
287   }
288 
289   List attributeDetectors;
290 
291   /**
292    * The state of the data collector. Can be one of {@link BEFORE},
293    * {@link INSIDE} or {@link AFTER} according to the relation between the
294    * current location in the document and the instance being constructed.
295    * The value of the state is controlled by the instance detector.
296    */
297   protected int state;
298 
299   private transient Vector dataListeners;
300   private String annotationSetName;
301 
302   protected void fireDataAdvance(Long e) {
303     if (dataListeners != null) {
304       Vector listeners = dataListeners;
305       int count = listeners.size();
306       for (int i = 0; i < count; i++) {
307         ((DataListener) listeners.elementAt(i)).dataAdvance(e);
308       }
309     }
310   }
311 
312   public void addAttributeDetector(AttributeDetector attrDetector){
313     attributeDetectors.add(attrDetector);
314     attrDetector.setDataCollector(this);
315   }
316 
317   public List getAttributeDetectors(){
318     return attributeDetectors;
319   }
320 
321   public InstanceDetector getInstanceDetector() {
322     return instanceDetector;
323   }
324 
325   public void setInstanceDetector(InstanceDetector instanceDetector) {
326     if(instanceDetector != null) removeDataListener(instanceDetector);
327     this.instanceDetector = instanceDetector;
328     addDataListener(instanceDetector);
329   }
330   public void setAnnotationSetName(String annotationSetName) {
331     this.annotationSetName = annotationSetName;
332   }
333   public String getAnnotationSetName() {
334     return annotationSetName;
335   }
336 }