|
GazetteerListsCollector |
|
1 package gate.creole; 2 3 import gate.creole.gazetteer.*; 4 import java.util.*; 5 import gate.*; 6 import gate.creole.*; 7 import gate.util.*; 8 import java.io.*; 9 10 public class GazetteerListsCollector extends AbstractLanguageAnalyser { 11 private static String PERSON_ANNOT_NAME = "PER"; 12 13 public void execute() throws gate.creole.ExecutionException { 14 //reinitialise the stats 15 statsPerType = new HashMap(); 16 17 //check the input 18 if(document == null) { 19 throw new ExecutionException( 20 "No document to process!" 21 ); 22 } 23 24 if (gazetteer == null) { 25 throw new ExecutionException( 26 "No gazetteer set!" 27 ); 28 } 29 30 //if no annotation types given, then exit 31 if ((this.annotationTypes == null) || annotationTypes.isEmpty()) { 32 Out.prln("Gazetteer Lists Collector Warning: No annotation types given for processing"); 33 return; 34 } 35 36 // get the annotations from document 37 if ((markupSetName == null)|| (markupSetName.equals(""))) 38 allAnnots = document.getAnnotations(); 39 else 40 allAnnots = document.getAnnotations(markupSetName); 41 42 //if none found, print warning and exit 43 if ((allAnnots == null) || allAnnots.isEmpty()) { 44 Out.prln("Gazetteer Lists Collector Warning: No annotations found for processing"); 45 return; 46 } 47 48 //collect the stats for each annotation type 49 for (int i = 0; i < annotationTypes.size(); i++) { 50 AnnotationSet annots = allAnnots.get((String) annotationTypes.get(i)); 51 if (annots == null || annots.isEmpty()) 52 continue; 53 statsPerType.put(annotationTypes.get(i), new HashMap()); 54 collectLists(annots, (String) annotationTypes.get(i)); 55 } 56 57 //print out the stats in log files 58 printStats(); 59 60 //save the updated gazetteer lists now 61 Map theLists = gazetteer.getLinearDefinition().getListsByNode(); 62 Iterator iter1 = theLists.keySet().iterator(); 63 while (iter1.hasNext()) { 64 GazetteerList theList = (GazetteerList) theLists.get(iter1.next()); 65 try { 66 if (theList.isModified()) 67 theList.store(); 68 } catch (ResourceInstantiationException ex) { 69 throw new GateRuntimeException(ex.getMessage()); 70 } 71 } 72 73 } 74 75 public void setMarkupASName(String newMarkupASName) { 76 markupSetName = newMarkupASName; 77 } 78 79 public String getMarkupASName() { 80 return markupSetName; 81 } 82 83 /** get the types of the annotation*/ 84 public List getAnnotationTypes() { 85 return annotationTypes; 86 }//getAnnotationTypes 87 88 /** set the types of the annotations*/ 89 public void setAnnotationTypes(List newType) { 90 annotationTypes = newType; 91 }//setAnnotationTypes 92 93 public Gazetteer getGazetteer() { 94 return gazetteer; 95 } 96 97 public void setGazetteer(Gazetteer theGaz) { 98 gazetteer = theGaz; 99 } 100 101 public void setTheLanguage(String language) { 102 theLanguage = language; 103 } 104 105 public String getTheLanguage() { 106 return theLanguage; 107 } 108 109 protected void collectLists(AnnotationSet annots, String annotType) { 110 Iterator iter = annots.iterator(); 111 String listName = ""; 112 GazetteerList theList = null; 113 Iterator theListsIter = 114 gazetteer.getLinearDefinition().getListsByNode().values().iterator(); 115 while (theListsIter.hasNext() && listName.equals("")) { 116 theList = (GazetteerList) theListsIter.next(); 117 if (theList.getURL().toExternalForm().endsWith(annotType + ".lst")) 118 listName = theList.getURL().toExternalForm(); 119 } 120 while (iter.hasNext()) { 121 Annotation annot = (Annotation) iter.next(); 122 String text = ""; 123 List strings = new ArrayList(); 124 try { 125 text = document.getContent().getContent( 126 annot.getStartNode().getOffset(), 127 annot.getEndNode().getOffset() 128 ).toString(); 129 //tokenise the text and save for the future if we need it 130 StringTokenizer tok = new StringTokenizer(text, "\n\r.|();-?!\t", false); 131 while (tok.hasMoreTokens()) 132 strings.add(tok.nextToken()); 133 //then replace the line breaks with spaces for the gazetteer 134 text = text.replace('\r', ' '); 135 text = text.replace('\n', ' '); 136 text = text.replace('\t', ' '); 137 138 } catch (InvalidOffsetException ex) { 139 throw new GateRuntimeException(ex.getMessage()); 140 } 141 142 //collect stats for the string 143 if (((HashMap) statsPerType.get(annotType)).containsKey(text)) 144 ((HashMap) statsPerType.get(annotType)).put(text, 145 new Integer(((Integer) 146 ((HashMap) statsPerType.get(annotType)).get(text)).intValue()+1)); 147 else 148 ((HashMap) statsPerType.get(annotType)).put(text, new Integer(1)); 149 150 //also collect stats for the individual tokens in the name to identify the most 151 //frequent tokens across names 152 if (strings.size() > 1) { 153 for (int i=0; i < strings.size(); i++) { 154 String theString = (String) strings.get(i); 155 //collect stats for the string 156 if ( ( (HashMap) statsPerType.get(annotType)).containsKey(theString)) 157 ( (HashMap) statsPerType.get(annotType)).put(theString, 158 new Integer( ( (Integer) 159 ( (HashMap) statsPerType.get(annotType)).get( 160 theString)).intValue() + 1)); 161 else 162 ( (HashMap) statsPerType.get(annotType)).put(theString, 163 new Integer(1)); 164 } 165 } 166 167 //first we check whether the text is already in the gazetteer 168 Set lookupResult = gazetteer.lookup(text); 169 if (lookupResult != null && lookupResult.size() > 0) 170 continue; 171 //if not, then we add it 172 gazetteer.add(text, 173 new Lookup(listName, annotType, "inferred", theLanguage)); 174 // theList.add(text + document.getSourceUrl().toString()); 175 theList.add(text); 176 177 178 //for persons we want also to add their individual names to the list 179 if (annotType.equals(PERSON_ANNOT_NAME) && strings.size() > 1) { 180 for (int i=0; i < strings.size(); i++) { 181 String theString = (String) strings.get(i); 182 Set lookupResult1 = gazetteer.lookup(theString); 183 if (lookupResult1 != null && lookupResult1.size() > 0) 184 continue; 185 if (theString.length() < 3) 186 continue; 187 gazetteer.add(theString, 188 new Lookup(listName, annotType, "inferred", theLanguage)); 189 theList.add(theString); 190 } 191 } 192 } 193 } 194 195 protected void printStats() { 196 try { 197 for (int i=0; i < annotationTypes.size(); i++) { 198 if (! statsPerType.containsKey(annotationTypes.get(i))) 199 continue; 200 BufferedWriter writer = new BufferedWriter( 201 new OutputStreamWriter(new FileOutputStream( 202 annotationTypes.get(i) + ".stats.lst"), 203 "UTF-8")); 204 HashMap stats = (HashMap) statsPerType.get(annotationTypes.get(i)); 205 Iterator stringsIter = stats.keySet().iterator(); 206 while (stringsIter.hasNext()) { 207 String string = (String) stringsIter.next(); 208 writer.write(string); 209 writer.write("$"); 210 writer.write( ((Integer)stats.get(string)).toString()); 211 writer.newLine(); 212 } 213 writer.close(); 214 } 215 } catch(IOException ioe){ 216 throw new RuntimeException(ioe.getMessage()); 217 }//try 218 219 } 220 221 /** 222 * The idea is to have this method check if an item 223 * is already present in the gazetteer under this type, 224 * and if so, not to add it. It is not implemented for now. 225 */ 226 protected boolean alreadyPresentInGazetteer(String token) { 227 return false; 228 } 229 230 private String markupSetName = ""; 231 private AnnotationSet allAnnots; 232 private List annotationTypes; 233 private Gazetteer gazetteer; 234 private String theLanguage = ""; 235 private HashMap statsPerType = new HashMap(); 236 }
|
GazetteerListsCollector |
|