/*
* ChunkLengthStats.java
*
* Yaoyong Li 22/03/2007
*
* $Id: ChunkLengthStats.java, v 1.0 2007-03-22 12:58:16 +0000 yaoyong $
*/
package gate.learning;
import gate.Annotation;
import gate.AnnotationSet;
import gate.learning.DocFeatureVectors.LongCompactor;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
/**
* Store the length statistics of chunks belonging to one type which will be used
* by the post-processing procedure.
*/
public class ChunkLengthStats {
  /**
   * Index refers to the length of a chunk, and the value is the number of
   * chunks with that length in the training data.
   */
  public int[] lenStats;

  /** Maximal length of a chunk considered (exclusive upper bound on the index). */
  public final static int maxLen = 200;

  /**
   * Constructor. Allocates the count array with the pre-defined length
   * {@link #maxLen}; all counts start at zero.
   */
  public ChunkLengthStats() {
    lenStats = new int[maxLen];
  }

  /**
   * Read the chunk length statistics from the specified file.
   * <p>
   * The file is read as UTF-8. Each record starts with a header line whose
   * first two items (separated by ConstantParameters.ITEMSEPARATOR) are the
   * label and the number of entries that follow; each entry line holds a chunk
   * length and the count for that length.
   *
   * @param parentDir directory containing the statistics file
   * @param filename name of the statistics file inside parentDir
   * @return a map from label (Integer) to its ChunkLengthStats; empty if the
   *         file does not exist, and possibly partial if reading failed (the
   *         failure is reported on System.err)
   * @throws NumberFormatException if a label, length or count is not an integer
   */
  static public HashMap loadChunkLenStats(File parentDir, String filename) {
    HashMap chunkLenHash = new HashMap();
    File statsFile = new File(parentDir, filename);
    if(!statsFile.exists()) {
      if(LogService.minVerbosityLevel > 0)
        System.out
          .println("No chunk length statistics list file in initialisation phrase.");
      return chunkLenHash;
    }
    try {
      BufferedReader in = new BufferedReader(new InputStreamReader(
        new FileInputStream(statsFile), "UTF-8"));
      try {
        String line;
        while((line = in.readLine()) != null) {
          // String.trim() returns a new string; the result must be kept.
          line = line.trim();
          // Tolerate blank lines (e.g. a trailing newline at the end of file).
          if(line.length() == 0) continue;
          String[] items = line.split(ConstantParameters.ITEMSEPARATOR);
          int label = Integer.parseInt(items[0]);
          int num = Integer.parseInt(items[1]);
          ChunkLengthStats chunkLens = (ChunkLengthStats)chunkLenHash.get(label);
          if(chunkLens == null) chunkLens = new ChunkLengthStats();
          for(int i = 0; i < num; ++i) {
            String entry = in.readLine();
            // A truncated file would otherwise cause a NullPointerException.
            if(entry == null)
              throw new IOException("unexpected end of file: label " + label
                + " declares " + num + " entries but only " + i
                + " were found");
            items = entry.trim().split(ConstantParameters.ITEMSEPARATOR);
            int len = Integer.parseInt(items[0]);
            int count = Integer.parseInt(items[1]);
            // Lengths outside the array are ignored, as in
            // updateChunkLensStats, instead of raising an index error.
            if(len >= 0 && len < maxLen) chunkLens.lenStats[len] = count;
          }
          chunkLenHash.put(label, chunkLens);
        }
      } finally {
        // Release the file handle even if parsing or reading failed.
        in.close();
      }
    } catch(IOException e) {
      // Best effort: report the problem and return what was read so far.
      System.err.println("Error reading chunk length statistics from "
        + statsFile.getPath() + ": " + e.getMessage());
    }
    return chunkLenHash;
  }

  /**
   * Write the chunk length statistics into a file, encoded as UTF-8.
   * <p>
   * For every label (sorted with LongCompactor) a header line
   * "label, number of non-zero lengths, #label_and_number" is written,
   * followed by one "length, count" line for each length with a count greater
   * than zero. Items are separated by ConstantParameters.ITEMSEPARATOR. An
   * I/O failure is reported on System.err and not propagated.
   *
   * @param parentDir directory in which the file is written
   * @param filename name of the file inside parentDir
   * @param chunkLenHash map from label to ChunkLengthStats to be written
   */
  static public void writeChunkLensStatsIntoFile(File parentDir,
    String filename, HashMap chunkLenHash) {
    File statsFile = new File(parentDir, filename);
    try {
      BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
        new FileOutputStream(statsFile), "UTF-8"));
      try {
        ArrayList labelSet = new ArrayList(chunkLenHash.keySet());
        Collections.sort(labelSet, new LongCompactor());
        for(int i = 0; i < labelSet.size(); ++i) {
          Object labelKey = labelSet.get(i);
          ChunkLengthStats chunkLens = (ChunkLengthStats)chunkLenHash
            .get(labelKey);
          // The header announces how many entry lines follow for this label.
          int num = 0;
          for(int j = 0; j < ChunkLengthStats.maxLen; ++j)
            if(chunkLens.lenStats[j] > 0) ++num;
          out.append(Integer.parseInt(labelKey.toString())
            + ConstantParameters.ITEMSEPARATOR + num
            + ConstantParameters.ITEMSEPARATOR + "#label_and_number");
          out.newLine();
          for(int j = 0; j < ChunkLengthStats.maxLen; ++j)
            if(chunkLens.lenStats[j] > 0) {
              out.append(j + ConstantParameters.ITEMSEPARATOR
                + chunkLens.lenStats[j]);
              out.newLine();
            }
        }
        out.flush();
      } finally {
        // Close (and thereby flush) the writer even if writing failed.
        out.close();
      }
    } catch(IOException e) {
      // Best effort: report the problem instead of silently losing data.
      System.err.println("Error writing chunk length statistics to "
        + statsFile.getPath() + ": " + e.getMessage());
    }
  }

  /**
   * Update the chunk length statistics from the annotations according to the
   * data set definition.
   * <p>
   * For every class annotation whose class feature is set and whose feature
   * value is a known label, the chunk length is the number of instance-type
   * annotations overlapping it. If that length is below {@link #maxLen}, the
   * count for the (label, length) pair is incremented.
   *
   * @param annotations annotation set holding both class and instance
   *          annotations
   * @param dsd data set definition giving the class annotation type, the
   *          class feature name and the instance annotation type
   * @param chunkLenHash map from label (Integer) to ChunkLengthStats, updated
   *          in place
   * @param label2Id mapping from label name to label id
   */
  public static void updateChunkLensStats(AnnotationSet annotations,
    DataSetDefinition dsd, HashMap chunkLenHash, Label2Id label2Id) {
    AnnotationSet annsC = annotations.get(dsd.classAttribute.getType());
    String classFeat = dsd.classAttribute.getFeature();
    AnnotationSet annsI = annotations.get(dsd.instanceType);
    // For each annotation of class
    for(Object obj : annsC) {
      Annotation annC = (Annotation)obj;
      Object featValue = annC.getFeatures().get(classFeat);
      if(featValue == null) continue;
      // Get the label
      String feat = featValue.toString();
      if(!label2Id.label2Id.containsKey(feat)) continue;
      int label = Integer.parseInt(label2Id.label2Id.get(feat).toString());
      // Chunk length = number of instance annotations overlapping the chunk
      int num = 0;
      for(Object objI : annsI)
        if(annC.overlaps((Annotation)objI)) ++num;
      // Chunks too long for the statistics array are not counted.
      if(num >= ChunkLengthStats.maxLen) continue;
      // Update the chunk length statistics
      ChunkLengthStats chunkLen = (ChunkLengthStats)chunkLenHash.get(label);
      if(chunkLen == null) {
        chunkLen = new ChunkLengthStats();
        chunkLenHash.put(label, chunkLen);
      }
      chunkLen.lenStats[num] += 1;
    }
  }
}