1   package gate.creole;
2   
3   import gate.creole.gazetteer.*;
4   import java.util.*;
5   import gate.*;
6   import gate.creole.*;
7   import gate.util.*;
8   import java.io.*;
9   
10  public class GazetteerListsCollector extends AbstractLanguageAnalyser {
11    private static String PERSON_ANNOT_NAME = "PER";
12  
13    public void execute() throws gate.creole.ExecutionException {
14      //reinitialise the stats
15      statsPerType = new HashMap();
16  
17      //check the input
18      if(document == null) {
19        throw new ExecutionException(
20          "No document to process!"
21        );
22      }
23  
24      if (gazetteer == null) {
25        throw new ExecutionException(
26          "No gazetteer set!"
27        );
28      }
29  
30      //if no annotation types given, then exit
31      if ((this.annotationTypes == null) || annotationTypes.isEmpty()) {
32        Out.prln("Gazetteer Lists Collector Warning: No annotation types given for processing");
33        return;
34      }
35  
36      // get the annotations from document
37      if ((markupSetName == null)|| (markupSetName.equals("")))
38        allAnnots = document.getAnnotations();
39      else
40        allAnnots = document.getAnnotations(markupSetName);
41  
42      //if none found, print warning and exit
43      if ((allAnnots == null) || allAnnots.isEmpty()) {
44        Out.prln("Gazetteer Lists Collector Warning: No annotations found for processing");
45        return;
46      }
47  
48      //collect the stats for each annotation type
49      for (int i = 0; i < annotationTypes.size(); i++) {
50        AnnotationSet annots = allAnnots.get((String) annotationTypes.get(i));
51        if (annots == null || annots.isEmpty())
52          continue;
53        statsPerType.put(annotationTypes.get(i), new HashMap());
54        collectLists(annots, (String) annotationTypes.get(i));
55      }
56  
57      //print out the stats in log files
58      printStats();
59  
60      //save the updated gazetteer lists now
61      Map theLists = gazetteer.getLinearDefinition().getListsByNode();
62      Iterator iter1 = theLists.keySet().iterator();
63      while (iter1.hasNext()) {
64        GazetteerList theList = (GazetteerList) theLists.get(iter1.next());
65        try {
66          if (theList.isModified())
67            theList.store();
68        } catch (ResourceInstantiationException ex) {
69          throw new GateRuntimeException(ex.getMessage());
70        }
71      }
72  
73    }
74  
75    public void setMarkupASName(String newMarkupASName) {
76      markupSetName = newMarkupASName;
77    }
78  
79    public String  getMarkupASName() {
80      return markupSetName;
81    }
82  
83    /** get the types of the annotation*/
84    public List getAnnotationTypes() {
85      return annotationTypes;
86    }//getAnnotationTypes
87  
88    /** set the types of the annotations*/
89    public void setAnnotationTypes(List newType) {
90      annotationTypes = newType;
91    }//setAnnotationTypes
92  
93    public Gazetteer getGazetteer() {
94      return gazetteer;
95    }
96  
97    public void setGazetteer(Gazetteer theGaz) {
98      gazetteer = theGaz;
99    }
100 
101   public void setTheLanguage(String language) {
102     theLanguage = language;
103   }
104 
105   public String  getTheLanguage() {
106     return theLanguage;
107   }
108 
109   protected void collectLists(AnnotationSet annots, String annotType) {
110     Iterator iter = annots.iterator();
111     String listName = "";
112     GazetteerList theList = null;
113     Iterator theListsIter =
114       gazetteer.getLinearDefinition().getListsByNode().values().iterator();
115     while (theListsIter.hasNext() && listName.equals("")) {
116       theList = (GazetteerList) theListsIter.next();
117       if (theList.getURL().toExternalForm().endsWith(annotType + ".lst"))
118         listName = theList.getURL().toExternalForm();
119     }
120     while (iter.hasNext()) {
121       Annotation annot = (Annotation) iter.next();
122       String text = "";
123       List strings = new ArrayList();
124       try {
125         text = document.getContent().getContent(
126           annot.getStartNode().getOffset(),
127           annot.getEndNode().getOffset()
128         ).toString();
129         //tokenise the text and save for the future if we need it
130         StringTokenizer tok = new StringTokenizer(text, "\n\r.|();-?!\t", false);
131         while (tok.hasMoreTokens())
132           strings.add(tok.nextToken());
133         //then replace the line breaks with spaces for the gazetteer
134         text = text.replace('\r', ' ');
135         text = text.replace('\n', ' ');
136         text = text.replace('\t', ' ');
137 
138       } catch (InvalidOffsetException ex) {
139         throw new GateRuntimeException(ex.getMessage());
140       }
141 
142       //collect stats for the string
143       if (((HashMap) statsPerType.get(annotType)).containsKey(text))
144         ((HashMap) statsPerType.get(annotType)).put(text,
145             new Integer(((Integer)
146               ((HashMap) statsPerType.get(annotType)).get(text)).intValue()+1));
147       else
148         ((HashMap) statsPerType.get(annotType)).put(text, new Integer(1));
149 
150       //also collect stats for the individual tokens in the name to identify the most
151       //frequent tokens across names
152       if (strings.size() > 1) {
153         for (int i=0; i < strings.size(); i++) {
154           String theString = (String) strings.get(i);
155           //collect stats for the string
156           if ( ( (HashMap) statsPerType.get(annotType)).containsKey(theString))
157             ( (HashMap) statsPerType.get(annotType)).put(theString,
158                 new Integer( ( (Integer)
159                               ( (HashMap) statsPerType.get(annotType)).get(
160                 theString)).intValue() + 1));
161           else
162             ( (HashMap) statsPerType.get(annotType)).put(theString,
163                 new Integer(1));
164         }
165       }
166 
167       //first we check whether the text is already in the gazetteer
168       Set lookupResult = gazetteer.lookup(text);
169       if (lookupResult != null && lookupResult.size() > 0)
170         continue;
171       //if not, then we add it
172       gazetteer.add(text,
173         new Lookup(listName, annotType, "inferred", theLanguage));
174 //      theList.add(text + document.getSourceUrl().toString());
175       theList.add(text);
176 
177 
178       //for persons we want also to add their individual names to the list
179       if (annotType.equals(PERSON_ANNOT_NAME) && strings.size() > 1) {
180         for (int i=0; i < strings.size(); i++) {
181           String theString = (String) strings.get(i);
182           Set lookupResult1 = gazetteer.lookup(theString);
183           if (lookupResult1 != null && lookupResult1.size() > 0)
184             continue;
185           if (theString.length() < 3)
186             continue;
187           gazetteer.add(theString,
188             new Lookup(listName, annotType, "inferred", theLanguage));
189           theList.add(theString);
190         }
191       }
192     }
193   }
194 
195   protected void printStats() {
196     try {
197       for (int i=0; i < annotationTypes.size(); i++) {
198         if (! statsPerType.containsKey(annotationTypes.get(i)))
199           continue;
200         BufferedWriter writer = new BufferedWriter(
201           new OutputStreamWriter(new FileOutputStream(
202            annotationTypes.get(i) + ".stats.lst"),
203           "UTF-8"));
204         HashMap stats = (HashMap) statsPerType.get(annotationTypes.get(i));
205         Iterator stringsIter = stats.keySet().iterator();
206         while (stringsIter.hasNext()) {
207           String string = (String) stringsIter.next();
208           writer.write(string);
209           writer.write("$");
210           writer.write( ((Integer)stats.get(string)).toString());
211           writer.newLine();
212         }
213         writer.close();
214       }
215   } catch(IOException ioe){
216       throw new RuntimeException(ioe.getMessage());
217   }//try
218 
219   }
220 
221   /**
222    * The idea is to have this method check if an item
223    * is already present in the gazetteer under this type,
224    * and if so, not to add it. It is not implemented for now.
225    */
226   protected boolean alreadyPresentInGazetteer(String token) {
227     return false;
228   }
229 
230   private String markupSetName = "";
231   private AnnotationSet allAnnots;
232   private List annotationTypes;
233   private Gazetteer gazetteer;
234   private String theLanguage = "";
235   private HashMap statsPerType = new HashMap();
236 }