Log in Help
Print
Homereleasesgate-5.1-beta2-build3402-ALLpluginsLearningsrcgatelearning 〉 ChunkLengthStats.java
 
/*
 *  ChunkLengthStats.java
 * 
 *  Yaoyong Li 22/03/2007
 *
 *  $Id: ChunkLengthStats.java, v 1.0 2007-03-22 12:58:16 +0000 yaoyong $
 */
package gate.learning;

import gate.Annotation;
import gate.AnnotationSet;
import gate.learning.DocFeatureVectors.LongCompactor;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;

/**
 * Stores the length statistics of chunks belonging to one type, which are used
 * by the post-processing procedure.
 * <p>
 * The static helpers work on a map from an integer label to the
 * {@code ChunkLengthStats} collected for that label.
 */
public class ChunkLengthStats {
  /**
   * The index is the length of a chunk and the value is the number of chunks
   * with that length in the training data.
   */
  public int[] lenStats;
  /** Maximal length of a chunk considered; valid lengths are 0..maxLen-1. */
  public final static int maxLen = 200;

  /**
   * Constructor. Allocates the counter array with the pre-defined length
   * {@link #maxLen}; all counters start at zero.
   */
  public ChunkLengthStats() {
    lenStats = new int[maxLen];
  }

  /**
   * Read the chunk length statistics from the specified file.
   * <p>
   * The expected layout is the one produced by
   * {@link #writeChunkLensStatsIntoFile(File, String, HashMap)}: a header line
   * holding the label and the number of entries, followed by that many lines
   * each holding a chunk length and its count, all separated by
   * {@code ConstantParameters.ITEMSEPARATOR}.
   * <p>
   * Reading is best-effort: an I/O failure or a truncated file is reported on
   * {@code System.err} and whatever was read so far is returned. Entries whose
   * length falls outside {@code 0..maxLen-1} are ignored.
   *
   * @param parentDir directory containing the statistics file
   * @param filename name of the statistics file inside {@code parentDir}
   * @return a map from label (Integer) to {@code ChunkLengthStats}; empty if
   *         the file does not exist
   */
  static public HashMap loadChunkLenStats(File parentDir, String filename) {
    HashMap chunkLenHash = new HashMap();
    File statsFile = new File(parentDir, filename);
    if(!statsFile.exists()) {
      if(LogService.minVerbosityLevel > 0)
        System.out
          .println("No chunk length statistics list file in initialisation phrase.");
      return chunkLenHash;
    }
    BufferedReader in = null;
    try {
      in = new BufferedReader(new InputStreamReader(new FileInputStream(
        statsFile), "UTF-8"));
      String line;
      while((line = in.readLine()) != null) {
        // String.trim() returns a new string; the result must be kept.
        line = line.trim();
        // Tolerate blank lines (e.g. a trailing newline) between label blocks.
        if(line.length() == 0) continue;
        String[] items = line.split(ConstantParameters.ITEMSEPARATOR);
        int label = Integer.parseInt(items[0]);
        int num = Integer.parseInt(items[1]);
        ChunkLengthStats chunkLens = (ChunkLengthStats)chunkLenHash.get(label);
        if(chunkLens == null) chunkLens = new ChunkLengthStats();
        boolean truncated = false;
        for(int i = 0; i < num; ++i) {
          String entry = in.readLine();
          if(entry == null) {
            // The header promised more entries than the file contains.
            truncated = true;
            break;
          }
          items = entry.trim().split(ConstantParameters.ITEMSEPARATOR);
          int len = Integer.parseInt(items[0]);
          // Guard the array access against lengths the array cannot hold.
          if(len >= 0 && len < maxLen)
            chunkLens.lenStats[len] = Integer.parseInt(items[1]);
        }
        chunkLenHash.put(label, chunkLens);
        if(truncated) {
          System.err.println("Warning: chunk length statistics file "
            + statsFile + " is truncated in the entries of label " + label);
          break;
        }
      }
    } catch(IOException e) {
      System.err.println("Warning: failed to read chunk length statistics from "
        + statsFile + ": " + e);
    } finally {
      closeAndReport(in, statsFile);
    }
    return chunkLenHash;
  }

  /**
   * Write the chunk length statistics into a file.
   * <p>
   * Labels are written in the order defined by {@code LongCompactor}. For each
   * label a header line (label, number of non-zero entries and the marker
   * {@code #label_and_number}) is followed by one line per non-zero counter
   * holding the chunk length and its count. An I/O failure is reported on
   * {@code System.err}; the stream is closed in every case.
   *
   * @param parentDir directory in which the file is created
   * @param filename name of the file inside {@code parentDir}
   * @param chunkLenHash map from label to {@code ChunkLengthStats}
   */
  static public void writeChunkLensStatsIntoFile(File parentDir,
    String filename, HashMap chunkLenHash) {
    File statsFile = new File(parentDir, filename);
    BufferedWriter out = null;
    try {
      out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(
        statsFile), "UTF-8"));
      ArrayList labelSet = new ArrayList(chunkLenHash.keySet());
      Collections.sort(labelSet, new LongCompactor());
      for(int i = 0; i < labelSet.size(); ++i) {
        Object obj = labelSet.get(i);
        ChunkLengthStats chunkLens = (ChunkLengthStats)chunkLenHash.get(obj);
        // The header needs the number of entry lines that will follow it.
        int num = 0;
        for(int j = 0; j < ChunkLengthStats.maxLen; ++j)
          if(chunkLens.lenStats[j] > 0) ++num;
        out.append(Integer.parseInt(obj.toString())
          + ConstantParameters.ITEMSEPARATOR + num
          + ConstantParameters.ITEMSEPARATOR + "#label_and_number");
        out.newLine();
        for(int j = 0; j < ChunkLengthStats.maxLen; ++j)
          if(chunkLens.lenStats[j] > 0) {
            out.append(j + ConstantParameters.ITEMSEPARATOR
              + chunkLens.lenStats[j]);
            out.newLine();
          }
      }
      out.flush();
    } catch(IOException e) {
      System.err.println("Warning: failed to write chunk length statistics to "
        + statsFile + ": " + e);
    } finally {
      closeAndReport(out, statsFile);
    }
  }

  /**
   * Update the chunk length statistics from the annotations according to the
   * data set definition.
   * <p>
   * For every class annotation that carries the class feature and whose
   * feature value is known to {@code label2Id}, the number of overlapping
   * instance annotations is taken as the chunk length, and the counter for
   * that length is incremented under the annotation's label. Chunks with
   * {@link #maxLen} or more overlapping instances are not counted.
   *
   * @param annotations annotation set holding both class and instance
   *          annotations
   * @param dsd data set definition giving the class annotation type, the
   *          class feature name and the instance type
   * @param chunkLenHash map from label to {@code ChunkLengthStats}; updated
   *          in place
   * @param label2Id mapping from label name to label id
   */
  public static void updateChunkLensStats(AnnotationSet annotations,
    DataSetDefinition dsd, HashMap chunkLenHash, Label2Id label2Id) {
    AnnotationSet annsC = annotations.get(dsd.classAttribute.getType());
    String classFeat = dsd.classAttribute.getFeature();
    AnnotationSet annsI = annotations.get(dsd.instanceType);
    // For each annotation of class
    for(Object obj : annsC) {
      Annotation annC = (Annotation)obj;
      Object featValue = annC.getFeatures().get(classFeat);
      if(featValue == null) continue;
      // Get the label
      String feat = featValue.toString();
      if(!label2Id.label2Id.containsKey(feat)) continue;
      int label = Integer.parseInt(label2Id.label2Id.get(feat).toString());
      // Chunk length = number of instance annotations overlapping the chunk
      int num = 0;
      for(Object objI : annsI)
        if(annC.overlaps((Annotation)objI)) ++num;
      if(num >= ChunkLengthStats.maxLen) continue;
      // Update the chunk length statistics
      ChunkLengthStats chunkLen = (ChunkLengthStats)chunkLenHash.get(label);
      if(chunkLen == null) {
        chunkLen = new ChunkLengthStats();
        chunkLenHash.put(label, chunkLen);
      }
      chunkLen.lenStats[num] += 1;
    }
  }

  /**
   * Close a stream if it was opened, reporting (not propagating) a failure.
   * A failed close of a writer can mean buffered data was lost, so it is
   * reported rather than ignored.
   */
  private static void closeAndReport(java.io.Closeable resource, File file) {
    if(resource == null) return;
    try {
      resource.close();
    } catch(IOException e) {
      System.err.println("Warning: failed to close " + file + ": " + e);
    }
  }
}