1   /*
2    *  EntityDescriptor.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Valentin Tablan, July/2000
12   *
13   *  $Id: EntityDescriptor.java,v 1.5 2001/09/26 11:41:05 marin Exp $
14   */
15  
16  package gate.creole.nerc;
17  
18  import gate.*;
19  import gate.util.*;
20  
21  import java.io.Serializable;
22  
23  /** Represents a single named entity */
24  public class EntityDescriptor implements Serializable{
25  
26    /** Constructs a new entity descriptor */
27    public EntityDescriptor(String string, String category, int start, int end) {
28      this.string = normaliseString(string);
29      this.category = category;
30      offsets = new int[2];
31      offsets[0] = start;
32      offsets[1] = end;
33    }
34  
35    /** Constructs a new entity descriptor starting from a Gate annotation */
36    public EntityDescriptor(Document document, Annotation annotation) {
37      offsets = new int[2];
38      offsets[0] = annotation.getStartNode().getOffset().intValue();
39      offsets[1] = annotation.getEndNode().getOffset().intValue();
40      try{
41        string = normaliseString(document.getContent().getContent(
42                                      annotation.getStartNode().getOffset(),
43                                      annotation.getEndNode().getOffset()).
44                                      toString());
45      } catch(InvalidOffsetException ioe){
46        ioe.printStackTrace();
47      }
48      category = annotation.getType();
49    }
50  
51    /** Returns a normalised string for the entity. This is the string from the
52      * text document the entity was descovered in, with all whitespace sequences
53      * replaced by a single space character
54      */
55    public String getString(){
56      return string;
57    }
58  
59    /** Returns the category of the entity*/
60    public String getCategory(){
61      return category;
62    }
63  
64    /** Returns a pair of integers specifying the character offsets in the
65      * original file where the entity occured
66      */
67    public int[] getOffsets(){
68      return offsets;
69    }
70  
71    /** Returns a string giving the category, offsets and normalised string for
72      * the entity, with no newlines.
73      */
74    public String toString(){
75      return category + " " + offsets[0] + " " + offsets[1] + " " + string;
76    }
77  
78    String string;
79    String category;
80    int[] offsets;
81  
82    /** Normalises a string. That is removes all the leading and trailing
83      * whitespace characters and replaces all inner whitespace sequences with a
84      * single space character
85      */
86    protected String normaliseString(String text){
87  ///    String res = "";
88      StringBuffer res = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE);
89      if(text == null) return null;
90      int charIdx = 0;
91      boolean lastWasSpace = false;
92      //skip the leading spaces
93      while(charIdx < text.length() &&
94            Character.isWhitespace(text.charAt(charIdx))) charIdx++;
95      //parse the rest of the text
96      while(charIdx < text.length()){
97        if(Character.isWhitespace(text.charAt(charIdx))){
98          //reading spaces
99          lastWasSpace = true;
100       }else{
101         //reading non-spaces
102         if(lastWasSpace) ///res += " ";
103                 res.append(" ");
104 ///        res += text.charAt(charIdx);
105         res.append(text.charAt(charIdx));
106         lastWasSpace = false;
107       }
108       charIdx++;
109     }//while(charIdx < text.length())
110     return res.toString();
111   }
112 
113 }
114