// [Web viewer chrome: Log in | Help | Print]
// Home / gate / plugins / Tagger_PennBio / src / edu / upenn / cis / taggers / frontend 〉 Article.java
 
/* Copyright (C) 2004 Univ. of Pennsylvania
    This software is provided under the terms of the Common Public License,
    version 1.0, as published by http://www.opensource.org.  For further
    information, see the file `LICENSE' included with this distribution. */

package edu.upenn.cis.taggers.frontend;
import java.util.*;
import java.io.*;
import edu.upenn.cis.taggers.*;

/**
 * An article is a collection of field names and values, in the order they were received.
 * Methods exist to tag fields and output in MEDLINE and HTML formats.
 *
 * <p>Field names and values are stored in parallel lists, so entry <i>n</i> of one
 * list always belongs to entry <i>n</i> of the others.  The value list may be
 * rewritten by {@link #tag(String)}; the original values are kept separately so
 * the MEDLINE output can reproduce the untouched text.
 *
 * <p>Null field names and null values are not supported: lookups call
 * <code>equals</code> on stored field names and the MEDLINE output trims every value.
 *
 * @author Ryan McDonald <a href="mailto:ryantm@cis.upenn.edu">ryantm@cis.upenn.edu</a>
 * @author Kevin Lerman <a href="mailto:klerman@seas.upenn.edu">Kevin Lerman</a>
 * */
public class Article {
  /** Format selector for {@link #toString(boolean)}: produce a MEDLINE entry. */
  public final static boolean MEDLINE=true;
  /** Format selector for {@link #toString(boolean)}: produce HTML. */
  public final static boolean HTML=false;

  /** MEDLINE field names are padded with spaces to this width before the "- " separator. */
  private static final int FIELD_WIDTH = 4;

  // Parallel tables driving the HTML output: which fields are shown (in this
  // order), the label printed before each, whether it is bold, and whether its
  // value carries tagger XML that must be converted to HTML markup.
  private static final String[] HTML_FIELDS = {"PMID","TI","AB","AU","DP","PT","SO"};
  private static final String[] HTML_LABELS = {"PMID: ","","","Author: ","Published: ","Type: ","Reference: "};
  private static final boolean[] HTML_BOLD = {false,true,false,false,false,false,false};
  private static final boolean[] HTML_PROCESS = {false,true,true,false,false,false,false};

  private final Tagger tagger;
  // Raw (pre-generics) lists are kept on purpose: the file is written against
  // an older Java level and nothing else in it relies on generics.
  private final ArrayList fields = new ArrayList();
  private final ArrayList values = new ArrayList();
  /** Values exactly as passed to {@link #add(String, String)}; never modified afterwards. */
  private final ArrayList originalValues = new ArrayList();

  /**
   * Create an empty article.
   * @param t The tagger used to tag values and to supply tag/format information
   * */
  public Article(Tagger t){
    tagger = t;
  }

  /**
   * Add a field-value pair.  Duplicates allowed.
   * @param f The field
   * @param v The value
   * */
  public void add(String f, String v){
    fields.add(f);
    values.add(v);
    originalValues.add(v);
  }

  /**
   * Tag the value associated with that field.
   * Only the first entry for that field is tagged if multiple exist.  This is
   * fine, so long as articles only have one title field and one abstract
   * field.  The old string will be retained for output to MEDLINE files.
   * Does nothing if the field is absent or its value is null.
   * @param f The field whose value will be tagged
   * @throws IOException propagated from the tagger
   * */
  public void tag(String f) throws IOException{
    int index = find(f);
    if (index == -1) {
      return;
    }
    String old = (String) values.get(index);
    if (old != null) {
      values.set(index, tagger.tag(old));
    }
  }

  /**
   * Returns this article in either MEDLINE or HTML format.
   * MEDLINE will use all the original strings, with additional tags
   * (specified by the tagger) for the data.  HTML will use XML and
   * additional tags (usually font colors, as specified by the individual tagger)
   * to show the tagger's work.  Some formatting information
   * is drawn from the tagger object.
   * @param b Article.MEDLINE or Article.HTML depending which output you want
   * @return This article, as either a MEDLINE entry or HTML
   * */
  public String toString(boolean b){
    if (b == MEDLINE) {
      return toMedline();
    }
    return toHtml();
  }

  /**
   * Build the MEDLINE form: one "FIELD- value" line per entry, followed by one
   * line per distinct tagged entity found in the title and abstract.
   * */
  private String toMedline(){
    StringBuffer out = new StringBuffer();
    StringBuffer tagged = new StringBuffer();
    for (int x = 0; x < values.size(); x++) {
      String field = getField(x);
      out.append(field);
      for (int pad = field.length(); pad < FIELD_WIDTH; pad++) {
        out.append(' ');
      }
      out.append("- ");
      if (field.equals("TI") || field.equals("AB")) {
        // Collect the (possibly tagged) text for entity extraction below, but
        // print this entry's own original text.  Looking the original up by
        // index keeps duplicate TI/AB entries from all printing the first one.
        tagged.append(getValue(x));
        out.append(getOriginalValue(x).trim());
      } else {
        out.append(getValue(x).trim());
      }
      out.append('\n');
    }
    // Extract once; the result is needed both for output and for the newline decision.
    String entities = xmlToMedline(tagged.toString());
    out.append(entities);
    if (entities.length() > 0) {
      out.append('\n');
    }
    return out.toString();
  }

  /**
   * Build the HTML form: a rule followed by the known fields, in a fixed order,
   * each on its own line.  Fields that are not present are skipped.
   * */
  private String toHtml(){
    StringBuffer out = new StringBuffer("<HR>");
    for (int x = 0; x < HTML_FIELDS.length; x++) {
      String val = getValue(HTML_FIELDS[x]);
      if (val == null) {
        continue;
      }
      if (HTML_PROCESS[x]) {
        val = xmlToHTML(val);
      }
      if (HTML_BOLD[x]) {
        out.append("<b>");
      }
      out.append(HTML_LABELS[x]).append(val);
      if (HTML_BOLD[x]) {
        out.append("</b>");
      }
      out.append("<BR>");
    }
    return out.toString();
  }

  /**
   * Return the value associated with a field, or null if there isn't one
   * If it's been tagged or modified in any way, gets the modified one.
   * @param f The field whose value will be retrieved
   * @return The first value associated with that field
   * */
  public String getValue(String f){
    int index = find(f);
    if (index == -1) {
      return null;
    }
    return (String) values.get(index);
  }

  /**
   * Return the n'th value, or null if there isn't one
   * @param n The number value to return
   * @return The n'th value, or null if there isn't one
   * */
  private String getValue(int n){
    if (n < 0 || n >= values.size()) {
      return null;
    }
    return (String) values.get(n);
  }

  /**
   * Return the n'th value as it was originally added, or null if there isn't one
   * @param n The number value to return
   * @return The unmodified n'th value, or null if there isn't one
   * */
  private String getOriginalValue(int n){
    if (n < 0 || n >= originalValues.size()) {
      return null;
    }
    return (String) originalValues.get(n);
  }

  /**
   * Return the n'th field, or null if there isn't one
   * @param f The number field to return
   * @return The n'th field, or null if there isn't one
   * */
  private String getField(int f){
    if (f < 0 || f >= fields.size()) {
      return null;
    }
    return (String) fields.get(f);
  }

  /**
   * Find the (first) index of a field, or return -1 if it's not there
   * @param f The field
   * @return The first index of that field, or -1 if there isn't one
   * */
  private int find(String f){
    for (int x = 0; x < fields.size(); x++) {
      if (((String) fields.get(x)).equals(f)) {
        return x;
      }
    }
    return -1;
  }

  /**
   * Derive the closing tag for each opening tag by replacing the leading
   * character with "&lt;/".  A new array is returned; the input
   * is not modified, so arrays owned by the tagger are never written to.
   * @param open The opening tags
   * @return The matching closing tags, in the same order
   * */
  private static String[] closingTags(String[] open){
    String[] close = new String[open.length];
    for (int x = 0; x < open.length; x++) {
      close[x] = "</" + open[x].substring(1);
    }
    return close;
  }

  /**
   * Search the input string for XML tags, and return them in MEDLINE format.
   * Each distinct (trimmed, lower-cased) tagged text is reported once per tag
   * type, grouped by tag type in the order the tagger lists them.  The tagger's
   * MEDLINE tag array is assumed to be parallel to its XML tag array.
   * @param in The string to search
   * @return A String consisting of the MEDLINE tags generated by the Tagger,
   *         one per line with no trailing newline, or "" if none were found
   * */
  private String xmlToMedline(String in){
    String[] open = tagger.xmlTags();
    String[] close = closingTags(open);
    String[] medline = tagger.medlineTags();

    // Build the padded "TAG - " labels in a fresh array rather than in place.
    String[] labels = new String[open.length];
    ArrayList[] found = new ArrayList[open.length];
    for (int x = 0; x < open.length; x++) {
      StringBuffer label = new StringBuffer(medline[x]);
      while (label.length() < FIELD_WIDTH) {
        label.append(' ');
      }
      labels[x] = label.append("- ").toString();
      found[x] = new ArrayList();
    }

    //Do a search-and-destroy sequence until we run out of things to destroy
    boolean removedAny;
    do {
      removedAny = false;
      for (int x = 0; x < open.length; x++) {
        int start = in.indexOf(open[x]);
        if (start == -1) {
          continue;
        }
        // Look for the closing tag only after the opening one, so a stray
        // closing tag earlier in the text cannot produce a negative-length span.
        int contentStart = start + open[x].length();
        int end = in.indexOf(close[x], contentStart);
        if (end == -1) {
          continue;
        }
        removedAny = true;
        // Fixed locale so de-duplication does not depend on the platform default.
        String value = in.substring(contentStart, end).trim().toLowerCase(Locale.ENGLISH);
        if (!found[x].contains(value)) {
          found[x].add(value);
        }
        // Cut the whole tagged span out of the text before the next pass.
        in = in.substring(0, start) + in.substring(end + close[x].length());
      }
    } while (removedAny);

    StringBuffer out = new StringBuffer();
    for (int x = 0; x < found.length; x++) {
      for (int n = 0; n < found[x].size(); n++) {
        if (out.length() > 0) {
          out.append('\n');
        }
        out.append(labels[x]).append((String) found[x].get(n));
      }
    }
    return out.toString();
  }

  /**
   * Insert <font> tags as specified by the tagger object into the input string
   * wherever XML tags are found.  Return the new string.
   * Tags are processed left to right; an opening tag with no closing tag after
   * it is copied through unchanged.
   * @param in The input string
   * @return The input string, with HTML tags inserted as dictated by the Tagger
   * */
  private String xmlToHTML(String in){
    String[] open = tagger.xmlTags();
    String[] close = closingTags(open);
    String[] openFont = tagger.htmlOpenTags();
    String[] closeFont = tagger.htmlCloseTags();

    StringBuffer out = new StringBuffer();
    while (true) {
      // Find the earliest opening tag of any type; on a tie the first type wins.
      int o = -1;
      int t = -1;
      for (int x = 0; x < open.length; x++) {
        int pos = in.indexOf(open[x]);
        if (pos != -1 && (o == -1 || pos < o)) {
          o = pos;
          t = x;
        }
      }
      if (t == -1) {
        break;
      }
      //Now o is where the tag opens, and t is the type of tag we're looking for
      int contentStart = o + open[t].length();
      int c = in.indexOf(close[t], contentStart);
      if (c == -1) {
        // Unmatched opening tag: emit it as-is and keep scanning after it.
        out.append(in.substring(0, contentStart));
        in = in.substring(contentStart);
        continue;
      }
      out.append(in.substring(0, o));
      out.append(open[t]).append(openFont[t]);
      out.append(in.substring(contentStart, c));
      // A single space follows the closing font tag, ahead of the closing XML tag.
      out.append(closeFont[t]).append(' ').append(close[t]);
      in = in.substring(c + close[t].length());
    }
    //If there's anything leftover, transfer it.
    return out.append(in).toString();
  }
}