1   /*
2    *  Copyright (c) 1998-2001, The University of Sheffield.
3    *
4    *  This file is part of GATE (see http://gate.ac.uk/), and is free
5    *  software, licenced under the GNU Library General Public License,
6    *  Version 2, June 1991 (in the distribution as file licence.html,
7    *  and also available at http://gate.ac.uk/gate/licence.html).
8    *
9    *  Valentin Tablan 28 May 2002
10   *
11   *  $Id: POSCategoryExtractor.java,v 1.1 2002/06/27 17:12:32 valyt Exp $
12   */
13  package gate.ml;
14  
15  import java.util.*;
16  
17  import weka.core.*;
18  
19  
20  import gate.*;
21  import gate.util.*;
22  import gate.creole.ANNIEConstants;
23  /**
24   * Extracts the POS category of the n-th word inside the instance.
25   */
26  public class POSCategoryExtractor extends AbstractAttributeExtractor {
27  
28    public Attribute getAttribute() {
29      List posCats = Arrays.asList(POS_CATEGORIES);
30      FastVector values = new FastVector(POS_CATEGORIES.length);
31      for(int i = 0; i < POS_CATEGORIES.length; i++)
32        values.addElement(POS_CATEGORIES[i]);
33      Attribute attribute = new Attribute("POS(" + position + ")", values);
34      return attribute;
35    }
36  
37    public Object getAttributeValue(Object data){
38      if(position > 0) return getInsidePOSValue(data);
39      else return getLeftContextPOS(data);
40    }
41  
42    /**
43     * This method will find POS category for tokens in the left context of the
44     * target annotation (where position is negative).
45     * @param data
46     * @return
47     */
48    protected Object getLeftContextPOS(Object data){
49      //the data is an annotation in this case.
50      Annotation ann = (Annotation)data;
51      Long previousOffset = dataCollector.previousOffset(
52                                          ann.getStartNode().getOffset());
53      //we start looking for Tokens going backwards from the annotation start.
54      int skippedTokens = 0;
55      while(previousOffset != null &&
56            skippedTokens < -position){
57        Set startingAnnots = dataCollector.getStartingAnnotations(previousOffset);
58        if(startingAnnots != null && (!startingAnnots.isEmpty())){
59          Iterator annIter = startingAnnots.iterator();
60          while(annIter.hasNext()){
61            Annotation annotation = (Annotation)annIter.next();
62            if(annotation.getType().equals(ANNIEConstants.TOKEN_ANNOTATION_TYPE)){
63              skippedTokens++;
64              if(skippedTokens == -position){
65                //the token we just skipped was the one we needed
66                if(annotation.getFeatures() != null){
67                  String pos = (String)annotation.getFeatures().
68                               get(ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME);
69                  if(posValues.contains(pos)) return pos;
70                  else{
71                    Out.prln("Warning: unknown POS category: " + pos);
72                  }
73                }
74                return null;
75              }
76            }
77          }
78        }
79        previousOffset = dataCollector.previousOffset(previousOffset);
80      }
81      //could not find the token
82      return null;
83    }
84  
85    /**
86     * This method will find the POS category for tokens covered by the instance
87     * annotation and tokens that are part of the right context.
88     * @param data the instance annotation
89     * @return the POS category as a string.
90     */
91    protected Object getInsidePOSValue(Object data){
92      //the data is an annotation in this case.
93      Annotation ann = (Annotation)data;
94      Long endOffset = ann.getEndNode().getOffset();
95      Long nextOffset = ann.getStartNode().getOffset();
96      int skippedTokens = 0;
97      while(nextOffset != null &&
98            ((!ignoreRightContext) || (nextOffset.compareTo(endOffset) < 0)) &&
99            skippedTokens < position){
100       //advance offset skipping all tokens found
101       Set startingAnnots = dataCollector.getStartingAnnotations(nextOffset);
102       if(startingAnnots != null && (!startingAnnots.isEmpty())){
103         Iterator annIter = startingAnnots.iterator();
104         while(annIter.hasNext()){
105           Annotation annotation = (Annotation)annIter.next();
106           if(annotation.getType().equals(ANNIEConstants.TOKEN_ANNOTATION_TYPE)){
107             skippedTokens++;
108             if(skippedTokens == position){
109               //the token we just skipped was the one we needed
110               if(annotation.getFeatures() != null){
111                 String pos = (String)annotation.getFeatures().
112                              get(ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME);
113                 if(posValues.contains(pos)) return pos;
114                 else{
115                   Out.prln("Warning: unknown POS category: " + pos);
116                 }
117               }
118               return null;
119             }
120           }
121         }
122       }
123       nextOffset = dataCollector.nextOffset(nextOffset);
124     }
125     //could not find the token
126     return null;
127   }
128 
129   /**
130    * Sets the (1-based) location of the word inside the instance that this
131    * extractor targets.
132    * Negative positions mean tokens in the right context.
133    * Position cannot be zero!
134    * @param position an int value.
135    */
136   public void setPosition(int position){
137     this.position = position;
138   }
139 
140   public void setIgnoreRightContext(boolean ignoreRightContext) {
141     this.ignoreRightContext = ignoreRightContext;
142   }
143 
144   public boolean isIgnoreRightContext() {
145     return ignoreRightContext;
146   }
147 
148   /**
149    * The 1-based position of the Token (for which the POS will gbe extracted)
150    * inside the instance annotation.
151    */
152   protected int position;
153 
154   /**
155    * Used internally for easy element-of tests
156    */
157   private List posValues = Arrays.asList(POS_CATEGORIES);
158 
159   static protected final String[] POS_CATEGORIES = new String[]
160         {"NN", "NNP", "NNPS", "NNS", "NP", "NPS", "JJ", "JJR", "JJS",
161          "JJSS", "RB", "RBR", "RBS", "VB", "VBD", "VBG", "VBN",
162          "VBP", "VBZ", "FW", "CD", "CC", "DT", "EX", "IN", "LS",
163          "MD", "PDT", "POS", "PP", "PRP", "PRP$", "PRPR$", "RP",
164          "TO", "UH", "WDT", "WP", "WP$", "WRB", "SYM", "\"", "#",
165          "$", "'", "(", ")", ",", "--", "-LRB-", ".", "''", ":" ,"::", "`"};
166 
167   private boolean ignoreRightContext = true;
168 }