/*
* Copyright (c) 1998-2005, The University of Sheffield.
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June 1991 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
*
* Valentin Tablan 19/11/2002
*
* $Id: DatasetDefintion.java 6974, v 1.0 2007-03-22 12:58:16 +0000 yaoyong $
*
*/
package gate.learning;
import gate.util.GateException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.jdom.Content;
import org.jdom.Element;
/**
 * Stores the data described in the DATASET element of the configuration file.
 * <p>
 * The constructor recognises two layouts. If the DATASET element has an
 * INSTANCE-ARG1 child the definition is treated as relation data
 * ({@link #RelationData}): the two relation arguments and the ATTRIBUTE_REL
 * features are read, while {@link #attributes} and {@link #ngrams} stay
 * <code>null</code>. Otherwise the definition is treated as
 * {@link #ChunkLearningData}: the ATTRIBUTE, ATTRIBUTELIST and NGRAM features
 * are read, while {@link #relAttributes} stays <code>null</code>.
 */
public class DataSetDefinition {
  /** List of ATTRIBUTE type features (null for relation data). */
  protected java.util.List<Attribute> attributes;
  /** ATTRIBUTE for the class label; null if none was marked as class. */
  protected Attribute classAttribute = null;
  /** Instance type. */
  protected String instanceType;
  /** Index of the class attribute in the attribute list. */
  protected int classIndex;
  /** List of Ngram type features (null for relation data). */
  protected java.util.List ngrams;
  /**
   * Kind of data set. The constructor sets it to {@link #RelationData} or
   * {@link #ChunkLearningData}.
   */
  public short dataType;
  /** Chunking learning data type. */
  public final static short ChunkLearningData = 1;
  /** Text classification data type. */
  public final static short ClassificationData = 2;
  /** Relation learning data type. */
  public final static short RelationData = 3;
  /** The arrays and variables for fast computations. */
  ArraysDataSetDefinition arrs;
  // The variables for relation extraction
  /** The feature in instance for the first argument of relation. */
  String arg1Feat;
  /** The feature in instance for the second argument of relation. */
  String arg2Feat;
  /** The first argument of relation; null if FEATURES-ARG1 is absent. */
  ArgOfRelation arg1 = null;
  /** The second argument of relation; null if FEATURES-ARG2 is absent. */
  ArgOfRelation arg2 = null;
  /** List of ATTRIBUTE_REL type features (null for non-relation data). */
  protected java.util.List relAttributes;
  /** Whether one common window size is imposed on all ATTRIBUTELIST features. */
  public boolean isSameWinSize = false;
  /** The common window size to the left of the instance. */
  public int windowSizeLeft = 1;
  /** The common window size to the right of the instance. */
  public int windowSizeRight = 1;
  /** Value type of the ngram feature, 1--binary, 2--tf, 3--tf*idf. */
  public int valueTypeNgram = 3;

  /**
   * Builds a DataSetDefinition from the DATASET XML element of the
   * configuration file.
   * <p>
   * Note that when a WINDOWSIZE element is present, the RANGE child of every
   * ATTRIBUTELIST element of <code>domElement</code> is created or overwritten
   * in place so that it reflects the common window.
   *
   * @param domElement the DATASET element.
   * @throws GateException if the element is not a DATASET element, if a
   *           required child element is missing, if a numeric setting is not a
   *           valid integer, or if more than one class attribute is declared.
   */
  public DataSetDefinition(Element domElement) throws GateException {
    if(!domElement.getName().equals("DATASET")) {
      throw new GateException("Dataset defintion element is \""
              + domElement.getName() + "\" instead of \"DATASET\"!");
    }
    instanceType = requiredChildText(domElement, "INSTANCE-TYPE", "");
    readCommonWindowSize(domElement);
    // The value type of the ngram features keeps its default unless specified.
    Element valueTypeElement = domElement.getChild("ValueTypeNgram");
    if(valueTypeElement != null) {
      valueTypeNgram = parseIntSetting(valueTypeElement.getTextTrim(),
              "element \"ValueTypeNgram\"");
    }
    // The presence of INSTANCE-ARG1 marks a relation extraction data set.
    Element arg1Element = domElement.getChild("INSTANCE-ARG1");
    if(arg1Element != null) {
      dataType = RelationData;
      arg1Feat = arg1Element.getTextTrim();
      readRelationDefinition(domElement);
    } else {
      dataType = ChunkLearningData;
      readInstanceDefinition(domElement);
    }
    if(LogService.minVerbosityLevel > 1) {
      System.out.println("*** dataType=" + dataType + " classType="
              + arrs.classType + " classFeat=" + arrs.classFeature);
    }
  }

  /** Reads the optional WINDOWSIZE element that imposes a common window. */
  private void readCommonWindowSize(Element domElement) throws GateException {
    Element windowElement = domElement.getChild("WINDOWSIZE");
    isSameWinSize = windowElement != null;
    if(!isSameWinSize) {
      return;
    }
    String left = windowElement.getAttributeValue("windowSizeLeft");
    if(left != null) {
      windowSizeLeft = parseIntSetting(left,
              "attribute \"windowSizeLeft\" of \"WINDOWSIZE\"");
    }
    String right = windowElement.getAttributeValue("windowSizeRight");
    if(right != null) {
      windowSizeRight = parseIntSetting(right,
              "attribute \"windowSizeRight\" of \"WINDOWSIZE\"");
    }
  }

  /** Reads the relation specific part of the DATASET element. */
  private void readRelationDefinition(Element domElement) throws GateException {
    arg2Feat = requiredChildText(domElement, "INSTANCE-ARG2", "");
    // The feature blocks of the two arguments are optional.
    Element featuresArg1 = domElement.getChild("FEATURES-ARG1");
    if(featuresArg1 != null) {
      arg1 = readArgument(featuresArg1, "FEATURES-ARG1");
    }
    Element featuresArg2 = domElement.getChild("FEATURES-ARG2");
    if(featuresArg2 != null) {
      arg2 = readArgument(featuresArg2, "FEATURES-ARG2");
    }
    // find the relation attributes
    relAttributes = new ArrayList();
    for(Object child : domElement.getChildren("ATTRIBUTE_REL")) {
      AttributeRelation relAttribute = new AttributeRelation((Element)child);
      if(relAttribute.isClass()) {
        // The list size before adding is the index the attribute will get.
        registerClassAttribute(relAttribute, "RelAttribute",
                relAttributes.size());
      }
      relAttributes.add(relAttribute);
    }
    arrs = new ArraysDataSetDefinition();
    arrs.putTypeAndFeatIntoArray(relAttributes);
    // get the args for the relation attribute terms
    arrs.obtainArgs(relAttributes);
  }

  /**
   * Reads one FEATURES-ARG1 / FEATURES-ARG2 element.
   *
   * @param featuresElement the element holding the argument description.
   * @param featuresName the element name, used in error messages only.
   */
  private ArgOfRelation readArgument(Element featuresElement,
          String featuresName) throws GateException {
    Element argElement = featuresElement.getChild("ARG");
    if(argElement == null) {
      throw new GateException("Required element \"ARG\" in \"" + featuresName
              + "\" not present!");
    }
    String location = " in \"ARG\" of \"" + featuresName + "\"";
    ArgOfRelation arg = new ArgOfRelation();
    arg.type = requiredChildText(argElement, "TYPE", location);
    arg.feat = requiredChildText(argElement, "FEATURE", location);
    // Find the attribute features of the argument
    obtainArgumentFeatures(featuresElement, arg);
    // Put the type and feat of data types into some arrays for fast
    // computation
    arg.arrs = new ArraysDataSetDefinition();
    arg.arrs.putTypeAndFeatIntoArray(arg.attributes);
    arg.arrs.numNgrams = arg.ngrams.size();
    // Get the maximal position span
    arg.maxTotalPosition = obtainMaxTotalPosisiont(arg);
    return arg;
  }

  /** Reads the ATTRIBUTE, ATTRIBUTELIST and NGRAM features of the data set. */
  private void readInstanceDefinition(Element domElement) throws GateException {
    // find the attributes
    attributes = new ArrayList<Attribute>();
    for(Object child : domElement.getChildren("ATTRIBUTE")) {
      Attribute attribute = new Attribute((Element)child);
      if(attribute.isClass()) {
        // The list size before adding is the index the attribute will get.
        registerClassAttribute(attribute, "Attribute", attributes.size());
      }
      attributes.add(attribute);
    }
    for(Object child : domElement.getChildren("ATTRIBUTELIST")) {
      Element listElement = (Element)child;
      if(isSameWinSize) {
        applyCommonWindow(listElement);
      }
      List<Attribute> attributelist = Attribute.parseSerie(listElement);
      attributes.addAll(attributelist);
    }
    if(classAttribute == null) {
      System.out.println("!! Warning: No class attribute defined! You CANNOT learn, but it's OK for producing the feature files.");
    }
    // find the Ngrams
    ngrams = new ArrayList();
    for(Object child : domElement.getChildren("NGRAM")) {
      ngrams.add(new Ngram((Element)child));
    }
    arrs = new ArraysDataSetDefinition();
    arrs.putTypeAndFeatIntoArray(attributes);
    arrs.numNgrams = ngrams.size();
  }

  /**
   * Forces the RANGE of an ATTRIBUTELIST element to the common window,
   * creating the RANGE child if the element does not have one.
   */
  private void applyCommonWindow(Element listElement) {
    Element rangeElement = listElement.getChild("RANGE");
    if(rangeElement == null) {
      rangeElement = new Element("RANGE");
      listElement.addContent(rangeElement);
    }
    rangeElement.setAttribute("from", Integer.toString(-windowSizeLeft));
    rangeElement.setAttribute("to", Integer.toString(windowSizeRight));
  }

  /**
   * Records the class attribute and its index, refusing a second one.
   *
   * @param candidate the attribute marked as class.
   * @param label how the attribute kind is called in the error message.
   * @param index position of the attribute in its list.
   */
  private void registerClassAttribute(Attribute candidate, String label,
          int index) throws GateException {
    if(classAttribute != null) {
      throw new GateException(
              label + " \""
                      + candidate.getName()
                      + "\" marked as class attribute but the class is already known to be\""
                      + classAttribute.getName() + "\"!");
    }
    classAttribute = candidate;
    classIndex = index;
  }

  /**
   * Returns the trimmed text of a required child element.
   *
   * @param parent the element that must contain the child.
   * @param childName name of the required child.
   * @param location text inserted after the child name in the error message,
   *          e.g. <code>" in \"ARG\" of \"FEATURES-ARG1\""</code>; may be
   *          empty.
   * @throws GateException if the child is not present.
   */
  private static String requiredChildText(Element parent, String childName,
          String location) throws GateException {
    Element child = parent.getChild(childName);
    if(child == null) {
      throw new GateException("Required element \"" + childName + "\""
              + location + " not present!");
    }
    return child.getTextTrim();
  }

  /**
   * Parses an integer setting, reporting a malformed value as a configuration
   * error instead of letting the NumberFormatException escape.
   */
  private static int parseIntSetting(String value, String description)
          throws GateException {
    try {
      return Integer.parseInt(value);
    } catch(NumberFormatException nfe) {
      GateException ge = new GateException("Invalid integer value \"" + value
              + "\" for " + description + "!");
      ge.initCause(nfe);
      throw ge;
    }
  }

  /**
   * Computes the span of positions used by the attributes and ngrams of an
   * argument: the largest positive position minus the smallest negative
   * position (both default to 0).
   */
  int obtainMaxTotalPosisiont(ArgOfRelation arg1) {
    int maxP = 0;
    int maxN = 0;
    for(int i = 0; i < arg1.attributes.size(); ++i) {
      int position = ((Attribute)arg1.attributes.get(i)).position;
      maxP = Math.max(maxP, position);
      maxN = Math.min(maxN, position);
    }
    for(int i = 0; i < arg1.ngrams.size(); ++i) {
      int position = ((Ngram)arg1.ngrams.get(i)).position;
      maxP = Math.max(maxP, position);
      maxN = Math.min(maxN, position);
    }
    // Minus for maxN because it's a negative number.
    return maxP - maxN;
  }

  /**
   * Obtains the ATTRIBUTEs, ATTRIBUTELISTs and NGRAMs of one argument and
   * stores them in the given argument object.
   *
   * @return the number of attributes collected.
   */
  private int obtainArgumentFeatures(Element domElement, ArgOfRelation argRel)
          throws GateException {
    argRel.attributes = new ArrayList();
    for(Object child : domElement.getChildren("ATTRIBUTE")) {
      argRel.attributes.add(new Attribute((Element)child));
    }
    for(Object child : domElement.getChildren("ATTRIBUTELIST")) {
      List attributelist = Attribute.parseSerie((Element)child);
      argRel.attributes.addAll(attributelist);
    }
    // find the Ngrams
    argRel.ngrams = new ArrayList();
    for(Object child : domElement.getChildren("NGRAM")) {
      argRel.ngrams.add(new Ngram((Element)child));
    }
    return argRel.attributes.size();
  }

  /**
   * Returns a multi-line description of the definition. Lists that were not
   * populated for the current data type are skipped, so the method is safe for
   * relation data as well.
   */
  @Override
  public String toString() {
    StringBuilder res = new StringBuilder();
    res.append("Instance type: ").append(instanceType).append("\n");
    if(attributes != null) {
      for(Object attribute : attributes) {
        res.append("Attribute:").append(attribute).append("\n");
      }
    }
    if(relAttributes != null) {
      for(Object relAttribute : relAttributes) {
        res.append("RelAttribute:").append(relAttribute).append("\n");
      }
    }
    res.append("Ngrams\n");
    if(ngrams != null) {
      for(Object ngram : ngrams) {
        res.append("Ngram:").append(ngram).append("\n");
      }
    }
    return res.toString();
  }

  /** Returns the ATTRIBUTE features; null for relation data. */
  public java.util.List getAttributes() {
    return attributes;
  }

  /** Returns the class attribute; null if none was defined. */
  public Attribute getClassAttribute() {
    return classAttribute;
  }

  /** Returns the instance type. */
  public String getInstanceType() {
    return instanceType;
  }

  /** Returns the index of the class attribute in its attribute list. */
  public int getClassIndex() {
    return classIndex;
  }

  /** Returns the NGRAM features; null for relation data. */
  public java.util.List getNgrams() {
    return ngrams;
  }

  /** Sets the class attribute. */
  public void setClassAttribute(Attribute classAttribute) {
    this.classAttribute = classAttribute;
  }

  /** Sets the index of the class attribute. */
  public void setClassIndex(int classIndex) {
    this.classIndex = classIndex;
  }

  /** Sets the instance type. */
  public void setInstanceType(String instanceType) {
    this.instanceType = instanceType;
  }
}