/* Copyright (C) 2004 Univ. of Pennsylvania
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package edu.upenn.cis.taggers.gene;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Vector;
import edu.umass.cs.mallet.base.fst.Transducer;
import edu.umass.cs.mallet.base.types.Instance;
import edu.umass.cs.mallet.base.types.InstanceList;
import edu.umass.cs.mallet.base.types.Sequence;
import edu.upenn.cis.taggers.Tag;
import edu.upenn.cis.taggers.TagList;
/**
* This class should never be called directly; it exists as an auxiliary to GeneTagger.
* @author Ryan McDonald <a href="mailto:ryantm@cis.upenn.edu">ryantm@cis.upenn.edu</a>
* @author Kevin Lerman <a href="mailto:klerman@seas.upenn.edu">Kevin Lerman</a>
*/
public class GeneSegmentationOutput
{
    /** Predicted label that opens a new gene tag. */
    private static final String BEGIN_GENE = "B-GENE";

    /** Predicted label that closes any gene tag currently open. */
    private static final String OUTSIDE = "O";

    /**
     * Tags <code>passedIn</code> without any character-offset information.
     * Equivalent to calling
     * {@link #tag(String, Transducer, InstanceList, Vector)} with <code>null</code> spans.
     *
     * @param passedIn    newline-separated token lines to tag
     * @param crf         the model to use
     * @param testingdata instance list whose last instance is decoded
     * @return the gene tags found
     * @throws IOException declared for interface compatibility
     */
    public TagList tag(String passedIn, Transducer crf, InstanceList testingdata) throws IOException {
        return tag(passedIn, crf, testingdata, null);
    }

    /**
     * Converts the model's per-token labels into a list of gene tags.
     *
     * <p><code>data</code> is split on newlines into token lines; the first
     * space-separated field of each line is the token text. The last instance of
     * <code>instanceData</code> is decoded with <code>crf</code> and the resulting
     * label sequence is indexed by token position, so it must contain at least as
     * many labels as there are token lines. A <code>B-GENE</code> label starts a
     * new tag (closing any open one), an <code>O</code> label closes the open tag,
     * and any other label extends the open tag if there is one and is otherwise
     * ignored.
     *
     * @param data         newline-separated token lines, one token per line
     * @param crf          the model to use
     * @param instanceData instance list whose last instance is decoded; must not be empty
     * @param spans        optional list of <code>int[]</code> entries holding
     *                     <code>{start, end}</code> for the token at the same index;
     *                     may be <code>null</code>, shorter than the token list, or
     *                     contain <code>null</code> entries. A token without a span
     *                     reports <code>-1</code> for both offsets.
     * @return the gene tags found, in the order they were closed
     * @throws IOException declared for interface compatibility
     */
    public TagList tag(String data, Transducer crf, InstanceList instanceData, Vector spans) throws IOException {
        String[] tokens = data.split("\n");

        // Only the last instance in the list is decoded.
        Instance instance = instanceData.getInstance(instanceData.size() - 1);
        Sequence input = (Sequence) instance.getData();
        Sequence predOutput = crf.viterbiPath(input).output();

        TagList tagList = new TagList();
        // Non-null exactly while a gene tag is open.
        Tag currentTag = null;

        // Convert from BIO labels to tags.
        for (int j = 0; j < tokens.length; j++) {
            // A line consisting only of spaces splits to an empty array; treat
            // it as an empty token instead of failing on features[0].
            String[] features = tokens[j].split(" ");
            String segment = features.length > 0 ? features[0] : "";

            // Offsets are reset for every token so that a token without a span
            // never inherits the offsets of the token before it.
            int start = -1;
            int end = -1;
            if (spans != null && j < spans.size() && spans.get(j) != null) {
                int[] span = (int[]) spans.get(j);
                start = span[0];
                end = span[1];
            }

            Object label = predOutput.get(j);
            if (BEGIN_GENE.equals(label)) {
                // Two adjacent genes: close the previous one before opening the next.
                if (currentTag != null) {
                    tagList.addTag(currentTag);
                }
                currentTag = new Tag("gene");
            } else if (OUTSIDE.equals(label)) {
                if (currentTag != null) {
                    tagList.addTag(currentTag);
                    currentTag = null;
                }
            }
            // Any other label just continues the open gene, if there is one.

            if (currentTag != null) {
                currentTag.addSegment(segment, j);
                currentTag.updateOffset(start, end);
            }
        }

        // A gene that runs to the last token is still open here.
        if (currentTag != null) {
            tagList.addTag(currentTag);
        }
        return tagList;
    }
}