/* Copyright (C) 2004 Univ. of Pennsylvania
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package edu.upenn.cis.taggers.frontend;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import edu.upenn.cis.taggers.LoadModelException;
import edu.upenn.cis.taggers.Tagger;
import edu.upenn.cis.taggers.gene.GeneTagger;
import edu.upenn.cis.taggers.malignancy.MalignancyTagger;
import edu.upenn.cis.taggers.variation.VariationTagger;
/**
* Facilitates tagging of a batch of MEDLINE articles
* @author Kevin Lerman <a href="mailto:klerman@seas.upenn.edu">klerman@seas.upenn.edu</a>
* @author Ryan McDonald <a href="mailto:ryantm@cis.upenn.edu">ryantm@cis.upenn.edu</a>
* */
public class BatchTagger{

    /** Command-line synopsis printed when the arguments are malformed. */
    private static final String USAGE_HELP = "Usage: BatchTagger <tagger> <model> <input> <output mask>";

    /** Lists the tagger keywords accepted as the first argument. */
    private static final String TAGGER_HELP = "Taggers supported:\nvar -- Variation Tagger\ngene -- Gene Tagger\nmal -- Malignancy Tagger";

    /** Marker that starts the first field line of every article. */
    private static final String ARTICLE_START = "PMID";

    /** Number of leading characters of a field line that hold the field name. */
    private static final int FIELD_NAME_WIDTH = 4;

    /** Index at which a field's value starts (the name column plus a two-character separator). */
    private static final int FIELD_VALUE_START = 6;

    /**
    * Loads the specified tagger with the specified model, and passes the input file through it to be tagged.
    * Outputs MEDLINE to <code>&lt;output mask&gt;.txt</code> and HTML to <code>&lt;output mask&gt;.html</code>.
    * If the first line of the input does not start with "PMID", the whole input is treated as the
    * abstract of a single placeholder article. Problems with the arguments or the input are reported
    * on the console and the method returns without tagging anything.
    * Usage: BatchTagger [tagger] [model] [input] [output mask]
    * @param args the tagger keyword, the model, the input file and the output mask, in that order
    * @throws IOException if reading the input or writing either output file fails
    * @throws ClassNotFoundException declared for the project tagger/article classes used here
    * */
    public static void main(String[] args) throws IOException, ClassNotFoundException{
        if(args.length!=4){
            printUsage();
            return;
        }

        File source = new File(args[2]);
        if(!source.exists() || !source.canRead()) {
            System.err.println("Can't read source information: "+source.getAbsolutePath());
            return;
        }

        // A mask without a directory part is resolved against the current directory.
        String outputMask = args[3].indexOf('/') < 0 ? "./"+args[3] : args[3];
        int lastSlash = outputMask.lastIndexOf('/');
        // lastSlash==0 means the mask sits directly under the root directory; an empty
        // path would never "exist", so name the root explicitly in that case.
        File outputDir = new File(lastSlash==0 ? "/" : outputMask.substring(0,lastSlash));
        if(!outputDir.exists() || !outputDir.canWrite()) {
            System.err.println("Can't write to ouput location: "+outputMask+" ("+outputDir.getAbsolutePath()+")");
            return;
        }

        Tagger tagger;
        try {
            tagger = createTagger(args[0],args[1]);
        } catch(LoadModelException lme) {
            lme.printStackTrace();
            System.err.println("Exception Loading Model: "+lme.getMessage());
            return;
        }
        if(tagger==null){
            printUsage();
            return;
        }

        // The reader is opened only once everything else is known to be usable, and is
        // always closed again, whatever happens while tagging.
        BufferedReader in = new BufferedReader(new FileReader(source));
        try {
            tagInput(in,tagger,outputMask);
        } finally {
            in.close();
        }
    }

    /** Prints the usage synopsis followed by the list of supported taggers. */
    private static void printUsage(){
        System.out.println(USAGE_HELP+'\n'+TAGGER_HELP);
    }

    /**
    * Creates the tagger selected by a keyword, compared without regard to case.
    * @param kind the tagger keyword: "var", "gene" or "mal"
    * @param model the model argument handed to the tagger's constructor
    * @return the new tagger, or <code>null</code> if the keyword is not recognised
    * */
    private static Tagger createTagger(String kind, String model) throws LoadModelException, IOException, ClassNotFoundException{
        if(kind.equalsIgnoreCase("var")) return new VariationTagger(model);
        if(kind.equalsIgnoreCase("gene")) return new GeneTagger(model);
        if(kind.equalsIgnoreCase("mal")) return new MalignancyTagger(model);
        return null;
    }

    /**
    * Validates the first input line, opens both output files, and tags the input as either a
    * batch of articles or one block of plain text. Both writers are closed before returning,
    * including when tagging fails part-way through.
    * @param in reader positioned at the start of the input
    * @param tagger the tagger applied to every article
    * @param outputMask path prefix to which ".html" and ".txt" are appended
    * */
    private static void tagInput(BufferedReader in, Tagger tagger, String outputMask) throws IOException, ClassNotFoundException{
        String firstLine = in.readLine();
        if(firstLine==null){
            // readLine() reports end of stream as null, so an empty file has no first line.
            System.out.println("Invalid input file: The input file is empty");
            return;
        }
        if(firstLine.length()<FIELD_NAME_WIDTH){
            System.out.println("Invalid input file: No blank lines permitted before the first article");
            return;
        }

        BufferedWriter outHTML = new BufferedWriter(new FileWriter(outputMask+".html"));
        try {
            outHTML.write(tagger.htmlHeader());
            BufferedWriter outMEDL = new BufferedWriter(new FileWriter(outputMask+".txt"));
            try {
                if(firstLine.startsWith(ARTICLE_START)){
                    tagArticles(firstLine,in,tagger,outMEDL,outHTML);
                } else {
                    tagPlainText(firstLine,in,tagger,outMEDL,outHTML);
                }
            } finally {
                outMEDL.close();
            }
        } finally {
            outHTML.close();
        }
    }

    /**
    * Treats the whole input as the abstract of one placeholder article and outputs it.
    * Input lines are joined with a single space so that the last word of one line does not
    * run into the first word of the next.
    * @param firstLine the line already read from <code>in</code>
    * @param in reader supplying the remaining lines
    * @param tagger the tagger handed to the article
    * @param outMEDL The MEDLINE file's BufferedWriter
    * @param outHTML the HTML file's BufferedWriter
    * */
    private static void tagPlainText(String firstLine, BufferedReader in, Tagger tagger, BufferedWriter outMEDL, BufferedWriter outHTML) throws IOException, ClassNotFoundException{
        StringBuffer text = new StringBuffer(firstLine);
        for(String line = in.readLine(); line!=null; line = in.readLine()){
            text.append(' ').append(line);
        }
        Article a = new Article(tagger);
        a.add("PMID","000000000000");
        a.add("TI","Test Article");
        a.add("AB",text.toString());
        a.add("AU","Test");
        a.add("DP","Test");
        a.add("PT","Test");
        a.add("SO","Test");
        outputArticle(a,outMEDL,outHTML);
    }

    /**
    * Reads field lines, groups them into articles, and outputs each article once it is complete.
    * A line starting with a space continues the previous field's value; any other non-empty line
    * starts a new field, and a line starting with "PMID" additionally starts a new article.
    * Empty lines are skipped, and fields whose value is empty are not added to the article.
    * Every 25th article number is printed as a progress indicator.
    * @param firstLine the line already read from <code>in</code>; the caller guarantees it starts with "PMID"
    * @param in reader supplying the remaining lines
    * @param tagger the tagger handed to every article
    * @param outMEDL The MEDLINE file's BufferedWriter
    * @param outHTML the HTML file's BufferedWriter
    * */
    private static void tagArticles(String firstLine, BufferedReader in, Tagger tagger, BufferedWriter outMEDL, BufferedWriter outHTML) throws IOException, ClassNotFoundException{
        Article art = null;
        String field = "";
        String value = "";
        int artNum = 0;

        for(String line = firstLine; line!=null; line = in.readLine()){
            if(line.length()==0) continue;

            if(line.charAt(0)==' '){ //Continuation of the current field
                value += " "+line.trim();
                continue;
            }

            //A new field begins: store the one collected so far. On the very first line
            //nothing is pending (value is empty), so art is never dereferenced while null.
            if(value.length()>0){
                art.add(field,value);
                value = "";
            }

            if(line.startsWith(ARTICLE_START)){ //This is a new article
                artNum++;
                if(artNum%25==0) System.out.println(artNum);
                if(art!=null){
                    outputArticle(art,outMEDL,outHTML);
                }
                art = new Article(tagger);
            }

            //Lines too short to hold a full name column or any value are tolerated
            //instead of failing with a StringIndexOutOfBoundsException.
            field = line.substring(0,Math.min(FIELD_NAME_WIDTH,line.length())).trim();
            value = line.length()>FIELD_VALUE_START ? line.substring(FIELD_VALUE_START).trim() : "";
        }

        //Store the last field and output the last article
        if(value.length()>0){
            art.add(field,value);
        }
        if(art!=null){
            outputArticle(art,outMEDL,outHTML);
        }
    }

    /**
    * Tags the "AB" and "TI" fields and outputs this article in both MEDLINE and HTML format.
    * Each rendering is followed by a newline, and both writers are flushed afterwards.
    * @param art The article to process
    * @param outMEDL The MEDLINE file's BufferedWriter
    * @param outHTML the HTML file's BufferedWriter
    * @throws IOException if writing to or flushing either writer fails
    * */
    private static void outputArticle(Article art, BufferedWriter outMEDL, BufferedWriter outHTML) throws IOException{
        art.tag("AB");
        art.tag("TI");
        outMEDL.write(art.toString(Article.MEDLINE)+"\n");
        outHTML.write(art.toString(Article.HTML)+"\n");
        outMEDL.flush();
        outHTML.flush();
    }
}