Log in Help
Print
Home / gate / plugins / Tagger_PennBio / src / edu / upenn / cis / taggers / frontend 〉 BatchTagger.java
 
/* Copyright (C) 2004 Univ. of Pennsylvania
 This software is provided under the terms of the Common Public License,
 version 1.0, as published by http://www.opensource.org.  For further
 information, see the file `LICENSE' included with this distribution. */

package edu.upenn.cis.taggers.frontend;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;

import edu.upenn.cis.taggers.LoadModelException;
import edu.upenn.cis.taggers.Tagger;
import edu.upenn.cis.taggers.gene.GeneTagger;
import edu.upenn.cis.taggers.malignancy.MalignancyTagger;
import edu.upenn.cis.taggers.variation.VariationTagger;

/**
 * Facilitates tagging of a batch of MEDLINE articles
 * @author Kevin Lerman <a href="mailto:klerman@seas.upenn.edu">klerman@seas.upenn.edu</a>
 * @author Ryan McDonald <a href="mailto:ryantm@cis.upenn.edu">ryantm@cis.upenn.edu</a>
 * */
public class BatchTagger{

  /** Command-line synopsis printed when the arguments are unusable. */
  private static final String USAGE_HELP = "Usage: BatchTagger <tagger> <model> <input> <output mask>";

  /** List of the tagger keywords accepted as the first argument. */
  private static final String TAGGER_HELP = "Taggers supported:\nvar -- Variation Tagger\ngene -- Gene Tagger\nmal -- Malignancy Tagger";

  /**
   * Loads the specified tagger with the specified model, and passes the input file through it to be tagged.
   * Outputs MEDLINE and HTML to the output mask.txt and .html.
   * Usage: BatchTagger [tagger] [model] [input] [output mask]
   *
   * <p>If the first input line starts with <code>PMID</code> the file is parsed as a sequence of
   * MEDLINE records; otherwise the whole file is treated as the abstract of a single placeholder article.
   * Problems with the arguments or the input are reported on the console and the method returns
   * without throwing.
   *
   * @param args tagger keyword (var, gene or mal), model path, input file, output mask
   * @throws IOException if reading the input or writing either output file fails
   * @throws ClassNotFoundException declared for the tagger and article code this method calls
   * */
  public static void main(String[] args) throws IOException, ClassNotFoundException{
    if(args.length!=4){
      System.out.println(USAGE_HELP+'\n'+TAGGER_HELP);
      return;
    }
    File source = new File(args[2]);
    if(!source.exists() || !source.canRead()) {
      System.err.println("Can't read source information: "+source.getAbsolutePath());
      return;
    }
    // Work on a local copy so the caller's argument array is not modified.
    String outputMask = args[3];
    if(outputMask.indexOf("/") < 0) {
      outputMask = "./"+outputMask;
    }
    // A mask directly under the root ("/name") has its only slash at index 0; the directory is
    // then "/" rather than the empty string, which would never exist.
    int lastSlash = outputMask.lastIndexOf("/");
    File output = new File(lastSlash==0 ? "/" : outputMask.substring(0,lastSlash));
    if(!output.exists() || !output.canWrite()) {
      System.err.println("Can't write to ouput location: "+outputMask+" ("+output.getAbsolutePath()+")");
      return;
    }
    // Load the tagger before opening any file so a failure here leaves nothing to clean up.
    Tagger tagger = loadTagger(args[0], args[1]);
    if(tagger==null) {
      return;
    }
    BufferedReader in = new BufferedReader(new FileReader(source));
    try {
      String firstLine = in.readLine();
      if(firstLine==null){ // readLine() returns null at end of stream, i.e. the file is empty
        System.out.println("Invalid input file: the file is empty");
        return;
      }
      if(firstLine.length()<4){
        System.out.println("Invalid input file: No blank lines permitted before the first article");
        return;
      }
      BufferedWriter outHTML = new BufferedWriter(new FileWriter(outputMask+".html"));
      try {
        outHTML.write(tagger.htmlHeader());
        BufferedWriter outMEDL = new BufferedWriter(new FileWriter(outputMask+".txt"));
        try {
          if(firstLine.startsWith("PMID")) {
            tagMedline(firstLine, in, tagger, outMEDL, outHTML);
          } else {
            //These aren't articles, it's just a block of text.
            tagPlainText(firstLine, in, tagger, outMEDL, outHTML);
          }
        } finally {
          outMEDL.close();
        }
      } finally {
        outHTML.close();
      }
    } finally {
      in.close();
    }
  }

  /**
   * Creates the tagger selected by the keyword, reporting any problem on the console.
   * @param kind The tagger keyword: var, gene or mal (case-insensitive)
   * @param model The model argument handed to the tagger's constructor
   * @return the tagger, or null if the keyword is unknown or the model could not be loaded
   * */
  private static Tagger loadTagger(String kind, String model) throws IOException, ClassNotFoundException{
    try {
      if(kind.equalsIgnoreCase("var")) return new VariationTagger(model);
      if(kind.equalsIgnoreCase("gene")) return new GeneTagger(model);
      if(kind.equalsIgnoreCase("mal")) return new MalignancyTagger(model);
    } catch(LoadModelException lme) {
      lme.printStackTrace();
      System.err.println("Exception Loading Model: "+lme.getMessage());
      return null;
    }
    System.out.println(USAGE_HELP+'\n'+TAGGER_HELP);
    return null;
  }

  /**
   * Treats the whole input as the abstract of one placeholder article and outputs it.
   * @param firstLine The line already read from the input
   * @param in The reader positioned after firstLine
   * @param tagger The tagger the article is created with
   * @param outMEDL The MEDLINE file's BufferedWriter
   * @param outHTML the HTML file's BufferedWriter
   * */
  private static void tagPlainText(String firstLine, BufferedReader in, Tagger tagger,
      BufferedWriter outMEDL, BufferedWriter outHTML) throws IOException, ClassNotFoundException{
    StringBuilder text = new StringBuilder(firstLine);
    for(String line = in.readLine(); line!=null; line = in.readLine()){
      // readLine() strips the line terminator; put a space back so the last word of one line
      // is not fused with the first word of the next.
      text.append(' ').append(line);
    }
    Article a = new Article(tagger);
    a.add("PMID","000000000000");
    a.add("TI","Test Article");
    a.add("AB",text.toString());
    a.add("AU","Test");
    a.add("DP","Test");
    a.add("PT","Test");
    a.add("SO","Test");
    outputArticle(a,outMEDL,outHTML);
  }

  /**
   * Parses MEDLINE-style records and outputs each article as soon as the next one begins.
   * A line starting with a space continues the current field; any other non-empty line starts a
   * new field, and a line starting with PMID additionally starts a new article.
   * Fields whose value is empty are not added to the article.
   * @param firstLine The line already read from the input (starts with PMID)
   * @param in The reader positioned after firstLine
   * @param tagger The tagger each article is created with
   * @param outMEDL The MEDLINE file's BufferedWriter
   * @param outHTML the HTML file's BufferedWriter
   * */
  private static void tagMedline(String firstLine, BufferedReader in, Tagger tagger,
      BufferedWriter outMEDL, BufferedWriter outHTML) throws IOException, ClassNotFoundException{
    Article art = null;
    String field = "";
    String value = "";
    int artNum = 0;
    for(String line = firstLine; line!=null; line = in.readLine()){
      if(line.length()==0) {
        continue; // blank lines carry no information
      }
      if(line.charAt(0)==' '){ //This is not a new field
        value += " "+line.trim();
        continue;
      }
      //This is a new field: store the one collected so far in the article it belongs to
      if(art!=null && value.length()>0) {
        art.add(field,value);
      }
      if(line.startsWith("PMID")){ //This is a new article
        artNum++;
        if(artNum%25==0) System.out.println(artNum); // progress indicator
        if(art!=null){
          outputArticle(art,outMEDL,outHTML);
        }
        art = new Article(tagger);
      }
      // The field name is taken from the first four columns and the value from column 6 on.
      // Lines too short to hold a value yield an empty value instead of an index error.
      field = line.substring(0,Math.min(4,line.length())).trim();
      value = line.length()>6 ? line.substring(6).trim() : "";
    }
    //This is stuff to process the last line/article/etc
    if(art!=null){
      if(value.length()>0) {
        art.add(field,value);
      }
      outputArticle(art,outMEDL,outHTML);
    }
  }

  /**
   * Tags the right fields and outputs this article in both MEDLINE and HTML format.
   * Both writers are flushed after the article has been written.
   * @param art The article to process
   * @param outMEDL The MEDLINE file's BufferedWriter
   * @param outHTML the HTML file's BufferedWriter
   * */
  private static void outputArticle(Article art, BufferedWriter outMEDL, BufferedWriter outHTML) throws IOException{
      art.tag("AB");
      art.tag("TI");
      outMEDL.write(art.toString(Article.MEDLINE)+"\n");
      outHTML.write(art.toString(Article.HTML)+"\n");
      outMEDL.flush();
      outHTML.flush();
  }

}