/*
 * NLPFeaturesOfDoc.java
 *
 * Yaoyong Li 22/03/2007
 *
 * $Id: NLPFeaturesOfDoc.java, v 1.0 2007-03-22 12:58:16 +0000 yaoyong $
 */
package gate.learning;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import gate.Annotation;
import gate.AnnotationSet;
import gate.FeatureMap;
import gate.util.OffsetComparator;
import gate.learning.Ngram;

/**
 * Obtain the NLP (linguistic) features from the GATE annotations of one
 * document.
 */
public class NLPFeaturesOfDoc {
  /** One component stores all the features for one instance. */
  StringBuffer[] featuresInLine;
  /** Feature names. */
  StringBuffer featuresName;
  /** Document id. */
  private String docId = null;
  /** Number of instances in the document. */
  int numInstances = 0;
  /** Total number of GATE types of NLP features. */
  int totalnumTypes = 0;
  /** Number of features counted for each instance. */
  int[] featuresCounted;
  /** Stores the class name(s) for each instance. */
  String[] classNames;

  /** Constructor with no parameters. */
  public NLPFeaturesOfDoc() {
  }

  /**
   * Constructor; obtains NLP features from GATE annotations for each instance
   * in the document.
   */
  public NLPFeaturesOfDoc(AnnotationSet annotations, String instanceType,
    String docName) {
    // Number of instances (tokens) in the document
    numInstances = annotations.get(instanceType).size();
    featuresInLine = new StringBuffer[numInstances];
    featuresName = new StringBuffer();
    totalnumTypes = 0;
    featuresCounted = new int[numInstances];
    classNames = new String[numInstances];
    docId = docName;
  }

  /**
   * Entry method for getting the NLP features according to the specifications
   * in the dataset definition files.
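   * <p>
   * A minimal usage sketch (an illustration, not code from this plugin; it
   * assumes {@code dsd} has already been loaded from the dataset definition
   * file and {@code annotations} is the annotation set of the document
   * {@code doc}):
   * <pre>
   * NLPFeaturesOfDoc nlpDoc = new NLPFeaturesOfDoc(annotations,
   *   dsd.getInstanceType(), doc.getName());
   * nlpDoc.obtainDocNLPFeatures(annotations, dsd);
   * </pre>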
   */
  public void obtainDocNLPFeatures(AnnotationSet annotations,
    DataSetDefinition dsd) {
    if(dsd.dataType == DataSetDefinition.RelationData) {
      // get the features for the relation data
      int initialPosition = 0;
      if(dsd.arg1 != null) {
        ArgOfRelation arg = dsd.arg1;
        // match arg1 instances against the relation's first-argument feature
        boolean[][] isArgInRel = matchArgInstanceWithInst(annotations, dsd
          .getInstanceType(), arg.type, dsd.arg1Feat, arg.feat);
        if(arg.arrs.numTypes > 0)
          gatedoc2NLPFeaturesArg(annotations, arg.type,
            arg.arrs.typesInDataSetDef, arg.arrs.featuresInDataSetDef,
            arg.arrs.namesInDataSetDef, arg.arrs.featurePosition, isArgInRel,
            initialPosition);
        if(arg.arrs.numNgrams > 0)
          gatedoc2NgramFeaturesArg(annotations, arg.type, arg.ngrams,
            isArgInRel, initialPosition);
        initialPosition += arg.maxTotalPosition + 1;
      }
      if(dsd.arg2 != null) {
        ArgOfRelation arg = dsd.arg2;
        // match arg2 instances against the relation's second-argument feature
        boolean[][] isArgInRel = matchArgInstanceWithInst(annotations, dsd
          .getInstanceType(), arg.type, dsd.arg2Feat, arg.feat);
        if(arg.arrs.numTypes > 0)
          gatedoc2NLPFeaturesArg(annotations, arg.type,
            arg.arrs.typesInDataSetDef, arg.arrs.featuresInDataSetDef,
            arg.arrs.namesInDataSetDef, arg.arrs.featurePosition, isArgInRel,
            initialPosition);
        if(arg.arrs.numNgrams > 0)
          gatedoc2NgramFeaturesArg(annotations, arg.type, arg.ngrams,
            isArgInRel, initialPosition);
      }
      if(dsd.relAttributes != null)
        gatedoc2NLPFeaturesRel(annotations, dsd.getInstanceType(), dsd.arg1Feat,
          dsd.arg2Feat, dsd.arrs.typesInDataSetDef,
          dsd.arrs.featuresInDataSetDef, dsd.arrs.namesInDataSetDef,
          dsd.arrs.arg1s, dsd.arrs.arg2s, dsd.arrs.featurePosition);
      // get the label from the class attribute
      gatedoc2LabelsCompleteRel(annotations, dsd.getInstanceType(),
        dsd.arg1Feat, dsd.arg2Feat, dsd.arrs.classType, dsd.arrs.classFeature,
        dsd.arrs.classArg1, dsd.arrs.classArg2);
    } else {
      // get the NLP features from the attributes
      if(dsd.arrs.numTypes > 0)
        gatedoc2NLPFeatures(annotations, dsd.getInstanceType(),
          dsd.arrs.typesInDataSetDef, dsd.arrs.featuresInDataSetDef,
          dsd.arrs.namesInDataSetDef, dsd.arrs.featurePosition);
      if(dsd.arrs.numNgrams > 0)
        gatedoc2NgramFeatures(annotations, dsd.getInstanceType(), dsd
          .getNgrams());
      // get the label from the class attribute
      gatedoc2LabelsComplete(annotations, dsd.getInstanceType(),
        dsd.arrs.classType, dsd.arrs.classFeature);
    }
  }

  /** Get the N-gram features from the GATE document. */
  public void gatedoc2NgramFeatures(AnnotationSet annotations,
    String instanceType, java.util.List ngrams) {
    AnnotationSet anns = annotations.get(instanceType);
    ArrayList annotationArray = (anns == null || anns.isEmpty()) ?
new ArrayList() : new ArrayList(anns); Collections.sort(annotationArray, new OffsetComparator()); if(numInstances != annotationArray.size()) { System.out.println("!!Warning: the number of instances " + new Integer(numInstances) + " in the document " + docId + " is not right!!!"); return; } int numNgrams = ngrams.size(); // For each ngram for(int i1 = 0; i1 < numNgrams; ++i1) { Ngram ngram = (Ngram)ngrams.get(i1); String nameOfNgram = ngram.getName(); int ngramPosition = ngram.position; String positionStr = obtainPositionStr(ngramPosition); featuresName.append(nameOfNgram + ConstantParameters.ITEMSEPARATOR); int consNum= ngram.getConsnum(); String [] typeGateNgram = new String[consNum]; String [] featureGateNgram = new String[consNum]; for(int j=0; j<consNum; ++j) { typeGateNgram[j] = (ngram.getTypessGate())[j]; featureGateNgram[j] = (ngram.getFeaturesGate())[j]; } AnnotationSet [] annsArray = new AnnotationSet[consNum]; for(int j=0; j<consNum; ++j) { annsArray[j] = (AnnotationSet)annotations.get(typeGateNgram[j]); } for(int i = 0; i < numInstances; ++i) { Annotation annToken = (Annotation)annotationArray.get(i); Long tokenStartOffset = annToken.getStartNode().getOffset(); Long tokenEndOffset = annToken.getEndNode().getOffset(); //AnnotationSet annsNgramType = annotations.get(typeGateNgram, // tokenStartOffset, tokenEndOffset); AnnotationSet annsNgramType = annsArray[0].get(tokenStartOffset, tokenEndOffset); String[] features; features = obtainNgramFeatures(annsNgramType, featureGateNgram[0]); int numFeats = features.length; int number = ngram.getNumber(); if(numFeats>=number) { //if the instance has enough number of features for the defined ngram for(int j = 1; j < consNum; j++) { String[] features1; if(typeGateNgram[j].equals(typeGateNgram[0])) features1 = obtainNgramFeatures(annsNgramType, featureGateNgram[j]); else features1 = obtainNgramFeaturesFromDifferentType(annsNgramType, annsArray[j].get(tokenStartOffset, tokenEndOffset), featureGateNgram[j]); for(int j1 = 0; j1 < features.length; ++j1) features[j1] = features[j1] + "_" + features1[j1]; } // get the ngram features StringBuffer[] featuresNgram = new StringBuffer[numFeats - number + 1]; for(int j = 0; j < featuresNgram.length; ++j) featuresNgram[j] = new StringBuffer(); for(int j = 0; j < number; ++j) { for(int j1 = j; j1 < numFeats - number + 1 + j; ++j1) { featuresNgram[j1 - j].append(features[j1] + NLPFeaturesList.SYMBOLNGARM); } } Hashtable ngramTerms = new Hashtable(); for(int j = 0; j < featuresNgram.length; ++j) if(!ngramTerms.containsKey(featuresNgram[j].toString())) ngramTerms.put(featuresNgram[j].toString(), "1"); else ngramTerms.put(featuresNgram[j].toString(), new Integer((new Integer(ngramTerms .get(featuresNgram[j].toString()).toString())).intValue() + 1)); List keys = new ArrayList(ngramTerms.keySet()); Collections.sort(keys); Iterator iterator = keys.iterator(); if(featuresInLine[i] == null) featuresInLine[i] = new StringBuffer(); while(iterator.hasNext()) { Object key = iterator.next(); if(ngramPosition != 0) this.featuresInLine[i].append(obtainFeatureName(nameOfNgram, key .toString() + NLPFeaturesList.SYMBOLNGARM + ngramTerms.get(key).toString()) + positionStr + ConstantParameters.ITEMSEPARATOR); else this.featuresInLine[i].append(obtainFeatureName(nameOfNgram, key .toString() + NLPFeaturesList.SYMBOLNGARM + ngramTerms.get(key).toString()) + ConstantParameters.ITEMSEPARATOR); ++featuresCounted[i]; } }//if the number of features is not less than the n of the n-gram }// end of the loop on instances } // end of the 
loop on number of ngrams } /** * Obtain the string for the position, which is attached at the end of the nlp * feature. */ String obtainPositionStr(int ngramPosition) { return "[" + (new Integer(ngramPosition)).toString() + "]"; } /** Obtain the N-gram features from an annotation set. */ private String[] obtainNgramFeatures(AnnotationSet annsNgramType, String gateFeature) { int num = annsNgramType.size(); String[] feats = new String[num]; ArrayList annotationArray = (annsNgramType == null || annsNgramType .isEmpty()) ? new ArrayList() : new ArrayList(annsNgramType); Collections.sort(annotationArray, new OffsetComparator()); for(int i = 0; i < num; ++i) { feats[i] = (String)((Annotation)annotationArray.get(i)).getFeatures() .get(gateFeature); if(feats[i]==null) feats[i] = ConstantParameters.NAMENONFEATURE; feats[i] = feats[i].trim().replaceAll(ConstantParameters.ITEMSEPARATOR, ConstantParameters.ITEMSEPREPLACEMENT); } return feats; } /** * Obtain the N-gram features from an annotation set for the Annotation type * which is different from the instance's type. */ private String[] obtainNgramFeaturesFromDifferentType( AnnotationSet annsNgramType, AnnotationSet annsCurrent, String gateFeature) { int num = annsNgramType.size(); String[] feats = new String[num]; ArrayList annotationArray = (annsNgramType == null || annsNgramType .isEmpty()) ? new ArrayList() : new ArrayList(annsNgramType); Collections.sort(annotationArray, new OffsetComparator()); for(int i = 0; i < num; ++i) { feats[i] = obtainAnnotationForTypeAndFeature(annsCurrent, gateFeature, ((Annotation)(annotationArray.get(i))).getStartNode().getOffset(), ((Annotation)(annotationArray.get(i))).getEndNode().getOffset()); if(feats[i] != null) feats[i] = feats[i].trim().replaceAll(ConstantParameters.ITEMSEPARATOR, ConstantParameters.ITEMSEPREPLACEMENT); } return feats; } /** Get the labels of each instance in the document. */ public void gatedoc2LabelsComplete(AnnotationSet annotations, String instanceType, String classType, String classFeature) { AnnotationSet anns = annotations.get(instanceType); ArrayList annotationArray = (anns == null || anns.isEmpty()) ? 
new ArrayList() : new ArrayList(anns); Collections.sort(annotationArray, new OffsetComparator()); if(numInstances != annotationArray.size()) { System.out.println("!!Warning: the number of instances " + new Integer(numInstances) + " in the document " + docId + " is not right!!!"); return; } // For each of entity AnnotationSet annsEntity = annotations.get(classType); for(Object obj : annsEntity) { Annotation annEntity = (Annotation)obj; if(annEntity.getFeatures().get(classFeature) == null) continue; String featName = annEntity.getFeatures().get(classFeature).toString(); featName = featName.trim(); featName = featName.replaceAll(ConstantParameters.SUFFIXSTARTTOKEN, ConstantParameters.SUFFIXSTARTTOKEN + "_"); featName = featName.replaceAll(ConstantParameters.ITEMSEPARATOR, "_"); //Get the multilabel from one instance String [] featNameArray = featName.split(ConstantParameters.MULTILABELSEPARATOR); boolean isStart = true; for(int i = 0; i < numInstances; ++i) { Annotation annToken = (Annotation)annotationArray.get(i); if(annToken.overlaps(annEntity)) { String featName0 = ""; if(isStart) { for(int j=0; j<featNameArray.length; ++j) { if(j>0) featName0 += ConstantParameters.ITEMSEPARATOR; featName0 += featNameArray[j]+ConstantParameters.SUFFIXSTARTTOKEN; } isStart = false; } else for(int j=0; j<featNameArray.length; ++j) { if(j>0) featName0 += ConstantParameters.ITEMSEPARATOR; featName0 += featNameArray[j]; } if(featName0.length() > 0) { if(this.classNames[i] != null) this.classNames[i] += ConstantParameters.ITEMSEPARATOR + featName0; else this.classNames[i] = featName0; } } } } } /** Get the Attribute feature for each instance of the document. */ public void gatedoc2NLPFeatures(AnnotationSet annotations, String instanceType, String[] typesGate, String[] featuresGate, String[] namesGate, int[] featurePosition) { int numTypes = typesGate.length; this.totalnumTypes += numTypes; for(int i = 0; i < numTypes; ++i) { this.featuresName.append(namesGate[i] + ConstantParameters.ITEMSEPARATOR); } String[] positionArrStr = new String[numTypes]; for(int i = 0; i < numTypes; ++i) { if(featurePosition[i] != 0) positionArrStr[i] = obtainPositionStr(featurePosition[i]); } AnnotationSet anns = annotations.get(instanceType); ArrayList<Annotation>annotationArray = (anns == null || anns.isEmpty()) ? 
      new ArrayList<Annotation>() : new ArrayList<Annotation>(anns);
    Collections.sort(annotationArray, new OffsetComparator());
    String[] features = new String[numTypes];
    int numInstances0 = annotationArray.size();
    AnnotationSet[] annsArray = new AnnotationSet[numTypes];
    for(int j = 0; j < numTypes; ++j) {
      annsArray[j] = (AnnotationSet)annotations.get(typesGate[j]);
    }
    for(int i = 0; i < numInstances0; ++i) { // for class
      Annotation annToken;
      for(int j = 0; j < numTypes; j++) {
        // for each attribute in different positions, get the token in
        // the corresponding position
        if(featurePosition[j] == 0)
          annToken = (Annotation)annotationArray.get(i);
        else if((featurePosition[j] < 0 && i + featurePosition[j] >= 0)
          || (featurePosition[j] > 0 && i + featurePosition[j] < numInstances0))
          annToken = (Annotation)annotationArray.get(i + featurePosition[j]);
        else continue;
        if(typesGate[j].equals(instanceType)) {
          features[j] = (String)annToken.getFeatures().get(featuresGate[j]);
        } else { // if not belongs to token
          Long tokenStartOffset = annToken.getStartNode().getOffset();
          Long tokenEndOffset = annToken.getEndNode().getOffset();
          features[j] = obtainAnnotationForTypeAndFeature(annsArray[j],
            featuresGate[j], tokenStartOffset, tokenEndOffset);
        }
        // put the name into the feature name
        if(features[j] != null) {
          features[j] = features[j].trim().replaceAll(
            ConstantParameters.ITEMSEPARATOR,
            ConstantParameters.ITEMSEPREPLACEMENT);
          features[j] = obtainFeatureName(namesGate[j], features[j]);
        }
      }// end of the loop on the types
      int numCounted = 0;
      if(featuresInLine[i] == null) featuresInLine[i] = new StringBuffer();
      for(int j = 0; j < numTypes; ++j) {
        if(features[j] != null) {
          ++numCounted;
          if(featurePosition[j] != 0)
            this.featuresInLine[i].append(features[j] + positionArrStr[j]
              + ConstantParameters.ITEMSEPARATOR);
          else this.featuresInLine[i].append(features[j]
            + ConstantParameters.ITEMSEPARATOR);
        } else {
          if(featurePosition[j] != 0)
            this.featuresInLine[i].append(ConstantParameters.NAMENONFEATURE
              + positionArrStr[j] + ConstantParameters.ITEMSEPARATOR);
          else this.featuresInLine[i].append(ConstantParameters.NAMENONFEATURE
            + ConstantParameters.ITEMSEPARATOR);
        }
      }
      // record how many features were found for this instance
      featuresCounted[i] += numCounted;
    }// end of the loop on instances
  }

  /** Get the N-gram features from the GATE document. */
  public void gatedoc2NgramFeaturesArg(AnnotationSet annotations,
    String instanceType, java.util.List ngrams, boolean[][] isArgInRel,
    int initialPosition) {
    AnnotationSet anns = annotations.get(instanceType);
    ArrayList<Annotation> annotationArray = (anns == null || anns.isEmpty()) ?
new ArrayList<Annotation>() : new ArrayList<Annotation>(anns); Collections.sort(annotationArray, new OffsetComparator()); int numInstances0 = annotationArray.size(); int numNgrams = ngrams.size(); // For each ngram for(int i1 = 0; i1 < numNgrams; ++i1) { Ngram ngram = (Ngram)ngrams.get(i1); String nameOfNgram = ngram.getName(); int ngramPosition = ngram.position; if(ngramPosition>=0) ngramPosition += initialPosition; else ngramPosition -= initialPosition; String positionStr = obtainPositionStr(ngramPosition); featuresName.append(nameOfNgram + ConstantParameters.ITEMSEPARATOR); int consNum= ngram.getConsnum(); //String typeGateNgram = (ngram.getTypessGate())[0]; String [] typeGateNgram = new String[consNum]; String [] featureGateNgram = new String[consNum]; for(int j=0; j<consNum; ++j) { typeGateNgram[j] = (ngram.getTypessGate())[j]; featureGateNgram[j] = (ngram.getFeaturesGate())[j]; } AnnotationSet [] annsArray = new AnnotationSet[consNum]; for(int j=0; j<consNum; ++j) { annsArray[j] = (AnnotationSet)annotations.get(typeGateNgram[j]); } for(int i = 0; i < numInstances0; ++i) { Annotation annToken = annotationArray.get(i); Long tokenStartOffset = annToken.getStartNode().getOffset(); Long tokenEndOffset = annToken.getEndNode().getOffset(); AnnotationSet annsNgramType = annsArray[0].get(tokenStartOffset, tokenEndOffset); String[] features = obtainNgramFeatures(annsNgramType, featureGateNgram[0]); int numFeats = features.length; int number = ngram.getNumber(); if(numFeats>=number) { for(int j = 1; j < consNum; j++) { String[] features1; if(typeGateNgram[j].equals(typeGateNgram[0])) features1 = obtainNgramFeatures(annsNgramType, featureGateNgram[j]); else features1 = obtainNgramFeaturesFromDifferentType(annsNgramType, annsArray[j].get(tokenStartOffset, tokenEndOffset), featureGateNgram[j]); for(int j1 = 0; j1 < features.length; ++j1) features[j1] = features[j1] + "_" + features1[j1]; } // get the ngram features StringBuffer[] featuresNgram = new StringBuffer[numFeats - number + 1]; for(int j = 0; j < featuresNgram.length; ++j) featuresNgram[j] = new StringBuffer(); for(int j = 0; j < number; ++j) { for(int j1 = j; j1 < numFeats - number + 1 + j; ++j1) { featuresNgram[j1 - j].append(features[j1] + NLPFeaturesList.SYMBOLNGARM); } } Hashtable<String,Integer>ngramTerms = new Hashtable<String,Integer>(); for(int j = 0; j < featuresNgram.length; ++j) if(!ngramTerms.containsKey(featuresNgram[j].toString())) ngramTerms.put(featuresNgram[j].toString(), new Integer(1)); else ngramTerms.put(featuresNgram[j].toString(), new Integer(ngramTerms .get(featuresNgram[j].toString()).intValue() + 1)); List<String>keys = new ArrayList<String>(ngramTerms.keySet()); Collections.sort(keys); //Iterator iterator = keys.iterator(); //while(iterator.hasNext()) { for(int iK=0; iK<keys.size(); ++iK) { //Object key = iterator.next(); String key = keys.get(iK); //For each relation data with the current one as its argument for(int ii = 0; ii < numInstances; ++ii) { if(isArgInRel[i][ii]) { if(featuresInLine[ii] == null) featuresInLine[ii] = new StringBuffer(); if(ngramPosition != 0) this.featuresInLine[ii].append(obtainFeatureName(nameOfNgram, key + NLPFeaturesList.SYMBOLNGARM + ngramTerms.get(key)) + positionStr + ConstantParameters.ITEMSEPARATOR); else this.featuresInLine[ii].append(obtainFeatureName(nameOfNgram, key + NLPFeaturesList.SYMBOLNGARM + ngramTerms.get(key)) + ConstantParameters.ITEMSEPARATOR); ++featuresCounted[ii]; } }//for each instance } } }// end of the loop on instances } // end of the loop on number of 
ngrams } /** Get the NLP feature for the argument feature of relation data. */ public void gatedoc2NLPFeaturesArg(AnnotationSet annotations, String instanceType, String[] typesGate, String[] featuresGate, String[] namesGate, int[] featurePosition, boolean[][] isArgInRel, int initialPosition) { int numTypes = typesGate.length; this.totalnumTypes += numTypes; for(int i = 0; i < numTypes; ++i) { this.featuresName.append(namesGate[i] + ConstantParameters.ITEMSEPARATOR); } int [] positionNum = new int[numTypes]; String[] positionArrStr = new String[numTypes]; for(int i = 0; i < numTypes; ++i) { if(featurePosition[i]>=0) positionNum[i] = featurePosition[i] + initialPosition; else positionNum[i] = featurePosition[i] - initialPosition; if(positionNum[i] != 0) positionArrStr[i] = obtainPositionStr(featurePosition[i]); } AnnotationSet anns = annotations.get(instanceType); ArrayList annotationArray = (anns == null || anns.isEmpty()) ? new ArrayList() : new ArrayList(anns); Collections.sort(annotationArray, new OffsetComparator()); String[] features = new String[numTypes]; int numInstances0 = annotationArray.size(); AnnotationSet [] annsArray = new AnnotationSet[numTypes]; for(int j=0; j<numTypes; ++j) { annsArray[j] = (AnnotationSet)annotations .get(typesGate[j]); } for(int i = 0; i < numInstances0; ++i) { // for class Annotation annToken; for(int j = 0; j < numTypes; j++) { // for each attribute in different positions, get the token in // the corresponding position if(featurePosition[j] == 0) annToken = (Annotation)annotationArray.get(i); else if((featurePosition[j] < 0 && i + featurePosition[j] >= 0) || (featurePosition[j] > 0 && i + featurePosition[j] < numInstances0)) annToken = (Annotation)annotationArray.get(i + featurePosition[j]); else continue; if(typesGate[j].equals(instanceType)) { features[j] = (String)annToken.getFeatures().get(featuresGate[j]);// types[i]; // //(String)annToken.getFeatures().get(attr.getFeature()); } else { // if not belongs to token Long tokenStartOffset = annToken.getStartNode().getOffset(); Long tokenEndOffset = annToken.getEndNode().getOffset(); features[j] = obtainAnnotationForTypeAndFeature(annsArray[j], featuresGate[j], tokenStartOffset, tokenEndOffset); } // put the name into the feature name if(features[j] != null) { features[j] = features[j].trim().replaceAll( ConstantParameters.ITEMSEPARATOR, ConstantParameters.ITEMSEPREPLACEMENT); features[j] = obtainFeatureName(namesGate[j], features[j]); } }// end of the loop on the types // For each relation data with the current one as its argument for(int ii = 0; ii < numInstances; ++ii) { if(isArgInRel[i][ii]) { int numCounted = 0; if(featuresInLine[ii] == null) featuresInLine[ii] = new StringBuffer(); for(int j = 0; j < numTypes; ++j) { if(features[j] instanceof String) { ++numCounted; if(positionNum[j]!=0) this.featuresInLine[ii].append(features[j] + positionArrStr[j]+ ConstantParameters.ITEMSEPARATOR); else this.featuresInLine[ii].append(features[j] + ConstantParameters.ITEMSEPARATOR); } else if(positionNum[j]!=0) this.featuresInLine[ii].append(ConstantParameters.NAMENONFEATURE + positionArrStr[j]+ConstantParameters.ITEMSEPARATOR); else this.featuresInLine[ii].append(ConstantParameters.NAMENONFEATURE + ConstantParameters.ITEMSEPARATOR); } featuresCounted[ii] += numCounted; } } }// end of the loop on instances } /** Match the argument instance with the relation instance. 
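   * <p>
   * The returned matrix records which argument annotation fills which relation
   * annotation (both lists sorted by offset): {@code result[i][j]} is
   * {@code true} iff the value of the feature {@code argF} on the i-th
   * annotation of type {@code instanceType} equals the value of the feature
   * {@code relArgF} on the j-th annotation of type {@code relInstanceType}.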
   */
  boolean[][] matchArgInstanceWithInst(AnnotationSet annotations,
    String relInstanceType, String instanceType, String relArgF, String argF) {
    // Get the instance array
    AnnotationSet anns = annotations.get(instanceType);
    ArrayList annotationArray = (anns == null || anns.isEmpty())
      ? new ArrayList()
      : new ArrayList(anns);
    Collections.sort(annotationArray, new OffsetComparator());
    // Get the relation instance array
    AnnotationSet relAnns = annotations.get(relInstanceType);
    ArrayList relAnnotationArray = (relAnns == null || relAnns.isEmpty())
      ? new ArrayList()
      : new ArrayList(relAnns);
    Collections.sort(relAnnotationArray, new OffsetComparator());
    // Assign the match
    boolean[][] isArgInRel = new boolean[annotationArray.size()][relAnnotationArray
      .size()];
    for(int i = 0; i < annotationArray.size(); ++i) {
      Annotation ann = (Annotation)annotationArray.get(i);
      String argV = ann.getFeatures().get(argF).toString();
      for(int ii = 0; ii < relAnnotationArray.size(); ++ii) {
        String argRelV = ((Annotation)relAnnotationArray.get(ii)).getFeatures()
          .get(relArgF).toString();
        if(argV.equals(argRelV)) isArgInRel[i][ii] = true;
        else isArgInRel[i][ii] = false;
      }
    }
    return isArgInRel;
  }

  /** Get the annotation of a type different from the instance type. */
  String obtainAnnotationForTypeAndFeature(AnnotationSet singleAnnSet,
    String gateFeature, Long tokenStartOffset, Long tokenEndOffset) {
    if(singleAnnSet instanceof AnnotationSet) {
      AnnotationSet coverAnnSet = (AnnotationSet)singleAnnSet.get(
        tokenStartOffset, tokenEndOffset);
      Iterator overlappingIterator = coverAnnSet.iterator();
      if(overlappingIterator.hasNext()) {
        Annotation superannotation = (Annotation)overlappingIterator.next();
        return (String)superannotation.getFeatures().get(gateFeature);
      }
    }
    return null;
  }

  /**
   * Get the annotation of a type different from the instance type, for
   * relation learning.
   */
  String obtainAnnotationForTypeAndFeatureRel(String arg1V, String arg2V,
    AnnotationSet singleAnnSet, String gateFeature, String arg1F, String arg2F) {
    if(singleAnnSet instanceof AnnotationSet) {
      Iterator overlappingIterator = singleAnnSet.iterator();
      if(overlappingIterator.hasNext()) {
        Annotation superannotation = (Annotation)overlappingIterator.next();
        FeatureMap feat0 = superannotation.getFeatures();
        if(arg1V.equals(feat0.get(arg1F)) && arg2V.equals(feat0.get(arg2F))) {
          String feat = feat0.get(gateFeature).toString();
          return feat;
        }
      }
    }
    return null;
  }

  /**
   * Get the Attribute-Rel features from annotations for relation learning.
   */
  public void gatedoc2NLPFeaturesRel(AnnotationSet annotations,
    String instanceType, String arg1Inst, String arg2Inst, String[] typesGate,
    String[] featuresGate, String[] namesGate, String[] arg1s, String[] arg2s,
    int[] featurePosition) {
    int numTypes = typesGate.length;
    this.totalnumTypes += numTypes;
    for(int i = 0; i < numTypes; ++i) {
      this.featuresName.append(namesGate[i] + ConstantParameters.ITEMSEPARATOR);
    }
    String[] strPosition = new String[numTypes];
    for(int i = 0; i < numTypes; ++i) {
      if(featurePosition[i] != 0)
        strPosition[i] = obtainPositionStr(featurePosition[i]);
    }
    AnnotationSet anns = annotations.get(instanceType);
    ArrayList annotationArray = (anns == null || anns.isEmpty()) ?
new ArrayList() : new ArrayList(anns); Collections.sort(annotationArray, new OffsetComparator()); if(numInstances != annotationArray.size()) { System.out.println("!!Warning: the number of instances " + new Integer(numInstances) + " in the document " + docId + " is not right!!!"); return; } AnnotationSet [] annsArray = new AnnotationSet[numTypes]; for(int j=0; j<numTypes; ++j) { annsArray[j] = (AnnotationSet)annotations .get(typesGate[j]); } String[] features = new String[numTypes]; for(int i = 0; i < numInstances; ++i) { // for class Annotation annToken; for(int j = 0; j < numTypes; j++) { // for each attribute in different positions, get the token in // the corresponding position if(featurePosition[j] == 0) annToken = (Annotation)annotationArray.get(i); else if((featurePosition[j] < 0 && i + featurePosition[j] >= 0) || (featurePosition[j] > 0 && i + featurePosition[j] < numInstances)) annToken = (Annotation)annotationArray.get(i + featurePosition[j]); else continue; FeatureMap feat = annToken.getFeatures(); String arg1Value = feat.get(arg1s[j]).toString(); String arg2Value = feat.get(arg2s[j]).toString(); if(typesGate[j].equals(instanceType)) { if(arg1Value.equals(feat.get(arg1Inst)) && arg2Value.equals(feat.get(arg2Inst))) features[j] = feat.get(featuresGate[j]).toString();// types[i]; // //(String)annToken.getFeatures().get(attr.getFeature()); } else { // if not belongs to token features[j] = obtainAnnotationForTypeAndFeatureRel(arg1Value, arg2Value, annsArray[j], featuresGate[j], arg1s[j], arg2s[j]); } // put the name into the feature name if(features[j] != null) { features[j] = features[j].trim().replaceAll( ConstantParameters.ITEMSEPARATOR, ConstantParameters.ITEMSEPREPLACEMENT); features[j] = obtainFeatureName(namesGate[j], features[j]); } }// end of the loop on the types int numCounted = 0; if(featuresInLine[i] == null) featuresInLine[i] = new StringBuffer(); for(int j = 0; j < numTypes; ++j) if(features[j] instanceof String) { ++numCounted; if(featurePosition[j]!=0) this.featuresInLine[i].append(features[j] +strPosition[j]+ConstantParameters.ITEMSEPARATOR); else this.featuresInLine[i].append(features[j] + ConstantParameters.ITEMSEPARATOR); } else if(featurePosition[j]!=0) this.featuresInLine[i].append(ConstantParameters.NAMENONFEATURE +strPosition[j] + ConstantParameters.ITEMSEPARATOR); else this.featuresInLine[i].append(ConstantParameters.NAMENONFEATURE + ConstantParameters.ITEMSEPARATOR); featuresCounted[i] = numCounted; }// end of the loop on instances } /** Get the label for the relation learning. */ public void gatedoc2LabelsCompleteRel(AnnotationSet annotations, String instanceType, String arg1Inst, String arg2Inst, String classType, String classFeature, String arg1C, String arg2C) { AnnotationSet anns = annotations.get(instanceType); ArrayList annotationArray = (anns == null || anns.isEmpty()) ? 
      new ArrayList() : new ArrayList(anns);
    Collections.sort(annotationArray, new OffsetComparator());
    if(numInstances != annotationArray.size()) {
      System.out.println("!!Warning: the number of instances "
        + new Integer(numInstances) + " in the document " + docId
        + " is not right!!!");
      return;
    }
    // For each entity
    AnnotationSet annsEntity = annotations.get(classType);
    for(Object obj : annsEntity) {
      Annotation annEntity = (Annotation)obj;
      if(annEntity.getFeatures().get(classFeature) == null) continue;
      String featName = annEntity.getFeatures().get(classFeature).toString();
      featName = featName.trim();
      featName = featName.replaceAll(ConstantParameters.SUFFIXSTARTTOKEN,
        ConstantParameters.SUFFIXSTARTTOKEN + "_");
      // Get the values of the entity args
      String arg1CV = annEntity.getFeatures().get(arg1C).toString();
      String arg2CV = annEntity.getFeatures().get(arg2C).toString();
      boolean isStart = true;
      for(int i = 0; i < numInstances; ++i) {
        Annotation annToken = (Annotation)annotationArray.get(i);
        FeatureMap feats = annToken.getFeatures();
        if(arg1CV.equals(feats.get(arg1Inst))
          && arg2CV.equals(feats.get(arg2Inst))) {
          String featName0 = featName;
          if(isStart) {
            featName0 += ConstantParameters.SUFFIXSTARTTOKEN;
            isStart = false;
          }
          if(featName0.length() > 0) {
            if(this.classNames[i] instanceof String)
              this.classNames[i] += ConstantParameters.ITEMSEPARATOR
                + featName0;
            else this.classNames[i] = featName0;
          }
        }
      }
    }
  }

  /** Write the NLP data into a file. */
  public void writeNLPFeaturesToFile(BufferedWriter out, String docId,
    int docIndex, int[] featurePosition) {
    if(LogService.minVerbosityLevel > 1)
      System.out.println("number=" + new Integer(numInstances));
    try {
      if(docIndex == 0) {
        StringBuffer sline = new StringBuffer("Class(es)");
        String[] featNs = this.featuresName.toString().split(
          ConstantParameters.ITEMSEPARATOR);
        for(int i = 0; i < featNs.length; ++i)
          if(featurePosition.length > i)
            sline.append(ConstantParameters.ITEMSEPARATOR + featNs[i] + "("
              + featurePosition[i] + ")");
          else sline.append(ConstantParameters.ITEMSEPARATOR + featNs[i]);
        out.write(sline.toString());
        out.newLine();
      }
      out.write(new Integer(docIndex) + ConstantParameters.ITEMSEPARATOR
        + docId + ConstantParameters.ITEMSEPARATOR + new Integer(numInstances));
      out.newLine();
      for(int i = 0; i < numInstances; ++i) {
        if(classNames[i] instanceof String) {
          int num = classNames[i].split(ConstantParameters.ITEMSEPARATOR).length;
          out.write(num + ConstantParameters.ITEMSEPARATOR + classNames[i]
            + ConstantParameters.ITEMSEPARATOR
            + this.featuresInLine[i].toString().trim());
        } else out.write("0" + ConstantParameters.ITEMSEPARATOR
          + this.featuresInLine[i].toString().trim());
        out.newLine();
      }
    } catch(IOException e) {
      System.out.println("Error occurred in writing the NLP data to a file!");
    }
  }

  /**
   * Read the NLP data of one document from the NLP feature file.
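   * <p>
   * The record layout expected here mirrors what
   * {@link #writeNLPFeaturesToFile} produces, with
   * {@code ConstantParameters.ITEMSEPARATOR} separating the items:
   * <pre>
   * docIndex  docId  numInstances
   * numLabels  label_1 ... label_numLabels  feature_1  feature_2 ...
   * ...        (one such line per instance; numLabels is 0 if unlabelled)
   * </pre>
   * The header line written before the first document (starting with
   * "Class(es)") is not consumed by this method, so the caller is expected to
   * position the reader past it.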
   */
  public void readNLPFeaturesFromFile(BufferedReader in) {
    try {
      String[] lineItems = in.readLine().split(ConstantParameters.ITEMSEPARATOR);
      numInstances = Integer.parseInt(lineItems[2]);
      docId = lineItems[1];
      featuresInLine = new StringBuffer[numInstances];
      classNames = new String[numInstances];
      int num;
      for(int i = 0; i < numInstances; ++i) {
        String[] lineItems1 = in.readLine().split(
          ConstantParameters.ITEMSEPARATOR);
        num = Integer.parseInt(lineItems1[0]);
        if(num > 0) {
          StringBuffer classNs = new StringBuffer();
          for(int j = 1; j < num; ++j)
            classNs.append(lineItems1[j] + ConstantParameters.ITEMSEPARATOR);
          classNs.append(lineItems1[num]);
          classNames[i] = classNs.toString();
        }
        featuresInLine[i] = new StringBuffer();
        if(num + 1 < lineItems1.length)
          featuresInLine[i].append(lineItems1[num + 1]);
        for(int j = num + 2; j < lineItems1.length; ++j)
          featuresInLine[i].append(ConstantParameters.ITEMSEPARATOR
            + lineItems1[j]);
      }
    } catch(IOException e) {
      System.out
        .println("**Error occurred in reading the NLP data from file for converting to FVs!");
    }
  }

  public void setDocId(String docId) {
    this.docId = docId;
  }

  public String getDocId() {
    return this.docId;
  }

  /** Put the type and feature together. */
  static String obtainFeatureName(String type, String feat) {
    return ConstantParameters.ITEMSEPREPLACEMENT + type
      + ConstantParameters.ITEMSEPREPLACEMENT + feat;
  }
}
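/*
 * A minimal round-trip sketch for the two file methods above (an illustration
 * only, exception handling omitted; it assumes the hypothetical variables
 * nlpDoc, dsd, docIndex and featureFile have been set up by the caller):
 *
 *   BufferedWriter out = new BufferedWriter(new FileWriter(featureFile));
 *   nlpDoc.writeNLPFeaturesToFile(out, nlpDoc.getDocId(), docIndex,
 *     dsd.arrs.featurePosition);
 *   out.close();
 *
 *   BufferedReader in = new BufferedReader(new FileReader(featureFile));
 *   if(docIndex == 0) in.readLine(); // skip the "Class(es)" header line
 *   NLPFeaturesOfDoc nlpDocCopy = new NLPFeaturesOfDoc();
 *   nlpDocCopy.readNLPFeaturesFromFile(in);
 *   in.close();
 */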