|
LookupDetector |
|
package gate.ml;

import java.util.*;

import weka.core.*;


import gate.*;
import gate.util.*;
import gate.creole.ANNIEConstants;
/**
 * Detects lookup major and minor types and their location.
 * This attribute detector is used to detect both lookup types (a nominal
 * attribute) and their location (a numerical one).
 * A sequence of calls to {@link #getAttribute()} will return alternately
 * the two types of attributes, and a sequence of calls to
 * {@link #getAttributeValue(Object)} for the same annotation instance returns
 * the matching values in the same alternating order (type, position, type,
 * position, ...).
 * <p>
 * The detector keeps per-instance state between calls (see the protected
 * fields below), so the order in which it is queried matters.
 */
public class LookupDetector extends AbstractAttributeExtractor{

  /**
   * Value of {@link #lastLookupPosition} meaning that no position is waiting
   * to be returned, i.e. the next call must produce a lookup type.
   */
  protected static final int NO_PENDING_POSITION = -1;

  /**
   * Value of {@link #lastLookupPosition} meaning that the last scan found no
   * further lookup, so the position to report is <code>null</code>.
   */
  protected static final int NO_MORE_LOOKUPS = -2;

  public LookupDetector() {
  }

  /**
   * Builds the next attribute definition.
   * Even-numbered calls (0, 2, ...) produce a nominal attribute named
   * <code>Lookup-k</code> whose values are {@link #LOOKUP_TYPES}; odd-numbered
   * calls produce a numeric attribute named <code>Lookup-k (position)</code>.
   * <code>k</code> starts at 1 and grows by one after every pair of calls.
   *
   * @return the newly created attribute, never <code>null</code>.
   */
  public Attribute getAttribute() {
    Attribute attribute;
    String attributeNameBase = "Lookup-" + (attributesReturned / 2 + 1);
    if(attributesReturned % 2 == 0){
      //even value -> Lookup type
      FastVector values = new FastVector(LOOKUP_TYPES.length);
      for(int i = 0; i < LOOKUP_TYPES.length; i++){
        values.addElement(LOOKUP_TYPES[i]);
      }
      attribute = new Attribute(attributeNameBase, values);
    }else{
      //odd value -> lookup position
      attribute = new Attribute(attributeNameBase + " (position)");
    }
    attributesReturned++;
    return attribute;
  }


  /**
   * Returns the next value for the given annotation instance.
   * <p>
   * For one instance the calls alternate: the first call returns the type of
   * the first not-yet-returned Lookup found between the start and end offsets
   * of the instance, the second call returns the position recorded for that
   * Lookup, the third call returns the type of the following Lookup, and so
   * on. The position is the number of Token annotations seen during the scan
   * up to and including the offset at which the Lookup starts.
   * <p>
   * Passing a different object than on the previous call restarts the
   * sequence from the first Lookup.
   *
   * @param data the annotation instance under scrutiny; it is cast to
   * {@link Annotation}.
   * @return a <code>String</code> of the form <code>major</code> or
   * <code>major:minor</code> when a type is due and it is listed in
   * {@link #LOOKUP_TYPES}; a <code>Double</code> when a position is due;
   * <code>null</code> when the type is unknown (a warning is printed), when
   * there are no more Lookups, or when the position of a missing Lookup is
   * requested.
   */
  public Object getAttributeValue(Object data) {
    if(data == lastAnnotationInstance){
      if(lastLookupPosition != NO_PENDING_POSITION){
        //this is a second question for the same annotation instance and the
        //same lookup -> return lookup position
        Object returnValue = lastLookupPosition == NO_MORE_LOOKUPS ? null :
                             Double.valueOf((double)lastLookupPosition);
        lastLookupPosition = NO_PENDING_POSITION;
        return returnValue;
      }
    }else{
      //new annotation instance
      lookupsReturned = 0;
    }

    //if we reached this point we need to return the lookup type

    //the data is an annotation in this case.
    Annotation ann = (Annotation)data;
    Long endOffset = ann.getEndNode().getOffset();
    Long nextOffset = ann.getStartNode().getOffset();
    int skippedLookups = 0;
    int skippedTokens = 0;
    //dataCollector is not declared in this file; presumably inherited from
    //AbstractAttributeExtractor -- TODO confirm
    while(nextOffset != null &&
          nextOffset.compareTo(endOffset) < 0){
      //advance offset skipping all Lookups found until the one that needs
      //returning
      Set startingAnnots = dataCollector.getStartingAnnotations(nextOffset);
      if(startingAnnots != null && (!startingAnnots.isEmpty())){
        //first count skipped tokens, so that tokens starting at the same
        //offset as the lookup are included in its position
        skippedTokens += countAnnotationsOfType(
                startingAnnots, ANNIEConstants.TOKEN_ANNOTATION_TYPE);

        Iterator annIter = startingAnnots.iterator();
        while(annIter.hasNext()){
          Annotation annotation = (Annotation)annIter.next();
          if(!annotation.getType().equals(
                  ANNIEConstants.LOOKUP_ANNOTATION_TYPE)){
            continue;
          }
          skippedLookups++;
          if(skippedLookups != (lookupsReturned + 1)){
            //this lookup was already returned by an earlier call
            continue;
          }
          //the lookup we just skipped was never returned before
          //it needs to be returned now
          String lookupType = buildLookupType(annotation);

          //save the last annotation instance we examined
          lastAnnotationInstance = ann;
          //save the location for the last lookup found
          lastLookupPosition = skippedTokens;
          lookupsReturned++;
          if(LOOKUP_TYPES_LIST.contains(lookupType)){
            return lookupType;
          }
          Out.prln("Warning: unknown lookup type: " + lookupType);
          return null;
        }
      }
      nextOffset = dataCollector.nextOffset(nextOffset);
    }
    //no more lookups
    lastLookupPosition = NO_MORE_LOOKUPS;
    lastAnnotationInstance = ann;
    return null;
  }

  /**
   * Counts the annotations in the given set whose type equals the given one.
   *
   * @param annotations a set whose elements are cast to {@link Annotation}.
   * @param type the annotation type to count.
   * @return the number of matching annotations.
   */
  private static int countAnnotationsOfType(Set annotations, String type) {
    int count = 0;
    Iterator annIter = annotations.iterator();
    while(annIter.hasNext()){
      Annotation annotation = (Annotation)annIter.next();
      if(annotation.getType().equals(type)){
        count++;
      }
    }
    return count;
  }

  /**
   * Builds the label used for a Lookup annotation: its major type feature,
   * followed by <code>":"</code> and its minor type feature when the latter
   * is present.
   *
   * @param lookup the Lookup annotation to read the features from.
   * @return the label; if the major type feature is missing the result is
   * <code>null</code> (no minor type) or starts with the text
   * <code>"null"</code>, neither of which is a known type.
   */
  private static String buildLookupType(Annotation lookup) {
    String lookupType = (String)lookup.getFeatures().
                        get(ANNIEConstants.LOOKUP_MAJOR_TYPE_FEATURE_NAME);
    String minorType = (String)lookup.getFeatures().
                       get(ANNIEConstants.LOOKUP_MINOR_TYPE_FEATURE_NAME);
    if(minorType != null) lookupType += ":" + minorType;
    return lookupType;
  }


  /**
   * This attribute detector is used to detect both lookup types (a nominal
   * attribute) and their location (a numerical one).
   * A sequence of calls to {@link #getAttribute()} will return alternately
   * the two types of attributes.
   * This value is used to determine what attribute will be returned based on
   * its parity.
   */
  protected int attributesReturned = 0;

  /**
   * This attribute detector can be used repeatedly to get the values for more
   * than one lookup annotations inside the annotation instance under scrutiny.
   * This value will mark the number of lookups returned for the current target
   * entity in order to avoid returning the same value twice.
   */
  protected int lookupsReturned = 0;

  /**
   * The annotation instance passed to the most recent call of
   * {@link #getAttributeValue(Object)} that performed a scan; compared by
   * identity to tell a repeated question from a new instance.
   */
  protected Annotation lastAnnotationInstance = null;

  /**
   * The position recorded for the last lookup whose type was returned, or one
   * of {@link #NO_PENDING_POSITION} / {@link #NO_MORE_LOOKUPS}.
   */
  protected int lastLookupPosition = NO_PENDING_POSITION;

  /**
   * The labels accepted for the nominal lookup-type attribute, in the order
   * in which they are added to the attribute.
   * Each label appears once: the list previously repeated
   * <code>"organization"</code> and <code>"organization:company"</code>, and
   * duplicate labels make no sense for a nominal attribute (Weka 3.6+
   * presumably rejects them with an IllegalArgumentException -- confirm
   * against the Weka version on the classpath). Removing the repeats shifts
   * the index of the labels that followed them.
   * NOTE(review): "organization:departmen" looks truncated; it is kept as is
   * because the gazetteer definitions are not visible here -- confirm.
   */
  protected static final String[] LOOKUP_TYPES = new String[]{
    "sport", "stop", "organization", "location:city", "organization:company",
    "location:country_abbrev", "country_adj", "location:country",
    "currency_unit:pre_amount", "currency_unit:post_amount", "date_key",
    "date_unit", "date:day", "organization:departmen", "facility_key_ext",
    "facility_key", "facility:building", "date:festival", "govern_key",
    "organization:government", "greeting", "time:hour", "ident_key:pre",
    "jobtitle", "loc_general_key", "loc_key:post", "loc_key:pre",
    "location:relig", "date:month", "location:region", "cdg",
    "organization:newspaper", "number", "date:ordinal",
    "org_base", "org_key:cap", "org_key", "org_pre", "spur",
    "person_first:ambig", "person_ending", "person_first:female", "person_full",
    "person_first:male", "person_full:relig", "person_full:sci", "phone_prefix",
    "location:province", "location:racecourse", "spur_ident", "address:street",
    "surname:prefix", "organization:team", "time:ampm", "time_modifier",
    "time_unit", "time:zone", "title:female", "title:civilian", "title:male",
    "title:military", "title:police", "year"};

  /** Fixed-size list view over {@link #LOOKUP_TYPES}, used for membership tests. */
  protected static final List LOOKUP_TYPES_LIST = Arrays.asList(LOOKUP_TYPES);

}
|
LookupDetector |
|