// Source listing: gate / plugins / Tagger_PennBio / src / edu / upenn / cis / taggers / gene / GeneTagger.java
 
/* Copyright (C) 2004 Univ. of Pennsylvania
    This software is provided under the terms of the Common Public License,
    version 1.0, as published by http://www.opensource.org.  For further
    information, see the file `LICENSE' included with this distribution. */

package edu.upenn.cis.taggers.gene;

import java.io.IOException;
import java.io.StringReader;
import java.net.URL;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import edu.umass.cs.mallet.base.fst.CRF4;
import edu.umass.cs.mallet.base.pipe.SerialPipes;
import edu.umass.cs.mallet.base.pipe.iterator.LineGroupIterator;
import edu.umass.cs.mallet.base.types.InstanceList;
import edu.upenn.cis.taggers.LoadModelException;
import edu.upenn.cis.taggers.Model;
import edu.upenn.cis.taggers.TagList;
import edu.upenn.cis.taggers.Tagger;
import edu.upenn.cis.tokenizers.BioTokenizer;

/**
 * Tags genes within a body of text.
 *
 * <p>The tagger loads a {@link CRF4} model once at construction time and
 * reuses it for every call to {@code tag}. Each call stores its result in the
 * {@link #tl} field (see {@link #getTagList()}); that write is not
 * synchronized, so concurrent calls on one instance overwrite each other's
 * result.
 *
 * @author Ryan McDonald <a href="mailto:ryantm@cis.upenn.edu">ryantm@cis.upenn.edu</a> 
 * @author Kevin Lerman <a href="mailto:klerman@seas.upenn.edu">klerman@seas.upenn.edu</a>
 */
public class GeneTagger implements Tagger
{
 int numEvaluations = 0;
 static int iterationsBetweenEvals = 16;
 /** Result of the most recent {@code tag} call (an empty list right after construction). */
 TagList tl;

 // Character-class and suffix patterns; not referenced by the code in this class.
 private static final String CAPS = "[A-Z]";
 private static final String LOW = "[a-z]";
 private static final String CAPSNUM = "[A-Z0-9]";
 private static final String ALPHA = "[A-Za-z]";
 private static final String ALPHANUM = "[A-Za-z0-9]";
 private static final String PUNT = "[,\\.;:?!]";
 private static final String QUOTE = "[\"`']";
 private static final String SEQ = "[atgcu]+";
 private static final String BADSUFFIX = ".*ole|.*ane|.*ate|.*ide|.*ine|.*ite|.*ol|.*ose|.*cooh|.*ar|.*ic|.*al|.*ive|.*ly|.*yl|.*ing|.*ry|.*ian|.*ent|.*ward|.*fold|.*ene|.*ory|.*ized|.*ible|.*ize|.*izes|.*ed|.*tion|.*ity|.*ure|.*ence";
 private static final String GOODSUFFIX = ".*gene|.*like|.*ase|homeo.*";

 /** Empty-line pattern handed to {@link LineGroupIterator}; compiled once instead of per call. */
 private static final Pattern EMPTY_LINE = Pattern.compile("^$");

 private final CRF4 crf;
 private final BioTokenizer tokenizer;

 /**
  * Construct a new gene tagger, loading the model via
  * {@code Model.loadAndRetrieveModel(String)}.
  *
  * @param model identifies the CRF4 model to load; passed unchanged to {@code Model}
  * @throws LoadModelException if the model cannot be loaded
  */
 public GeneTagger(String model) throws LoadModelException {
   tokenizer = new BioTokenizer();
   System.out.println("Initializing Gene Tagger...");
   tl = new TagList();
   crf = Model.loadAndRetrieveModel(model);
   System.out.println("Gene Tagger Initialized.");
 }

 /**
  * Construct a new gene tagger from a gzip-compressed model read from a URL.
  *
  * @param modelURL location of the gzip-compressed CRF4 model
  * @throws LoadModelException if the model cannot be loaded
  * @throws IOException if the URL cannot be opened or read
  */
 public GeneTagger(URL modelURL) throws LoadModelException, IOException {
   // The tokenizer must be created here as well; without it tag(String)
   // fails with a NullPointerException on URL-constructed taggers.
   tokenizer = new BioTokenizer();
   System.out.println("Initializing Gene Tagger...");
   tl = new TagList();
   crf = loadCompressedModel(modelURL);
   System.out.println("Gene Tagger Initialized.");
 }

 /**
  * Tags an already tokenized text.
  *
  * @param tokens the tokens to tag; a token containing line breaks is split
  *               into one model input line per piece
  * @return the tags found; also available afterwards via {@link #getTagList()}
  * @throws IOException declared for the underlying tagging pipeline
  */
 public TagList tag(String [] tokens) throws IOException {
   return runModel(toModelInput(tokens));
 }

 /**
  * Returns XML-tagged data
  * @param in The String to tag
  * @return The tagged String
  * @throws IOException declared for the underlying tagging pipeline
  * */
 //TODO: THIS RETURNS A STRING -  really we have a TagList! 
 public String tag (String in) throws IOException
 {
   String tokenized = tokenizer.tokenize(in);
   //Program will crash if we ever have two adjacent spaces, so let's prevent that
   tokenized = tokenized.replaceAll("\\s+"," ");

   //Work data into a format usable by the model
   String[] tokens = tokenized.split(" ");

   return runModel(toModelInput(tokens)).toXML(tokens);
 }

 /**
  * @return the result of the most recent {@code tag} call, or the empty list
  *         created by the constructor if nothing has been tagged yet
  */
 public TagList getTagList() {
   return tl;
 }

 /**
  * @return the opening HTML of a page, including a legend line showing how
  *         genes are highlighted
  */
 public String htmlHeader(){
   return "<HTML><BODY>Normal text<BR><font color=RED>Genes</font><BR>";
 }

 /**
  * See {@link Tagger} for details
  * */
 public String[] xmlTags(){
   String[] t = {"<gene>"};
   return t;
 }

 /**
  * See {@link Tagger} for details
  * */
 public String[] medlineTags(){
   String[] t = {"GENE"};
   return t;
 }

 /**
  * See {@link Tagger} for details
  * */
 public String[] htmlOpenTags(){
   String[] t = {"<B><font color=RED>"};
   return t;
 }

 /**
  * See {@link Tagger} for details
  * */
 public String[] htmlCloseTags(){
   // NOTE(review): closes in the same order htmlOpenTags() opens (<B> before
   // <font>), so the elements are not properly nested. Left byte-identical
   // because consumers may match on this exact string -- confirm before changing.
   String[] t = {"</b></font>"};
   return t;
 }

 /** Loads a gzip-compressed model from the URL, closing both streams when done. */
 private static CRF4 loadCompressedModel(URL modelURL) throws LoadModelException, IOException {
   java.io.InputStream raw = modelURL.openStream();
   try {
     // Declared as GZIPInputStream so the same Model overload is selected as before.
     GZIPInputStream gz = new GZIPInputStream(raw);
     try {
       return Model.loadAndRetrieveModel(gz);
     } finally {
       gz.close();
     }
   } finally {
     // Also covers the case where the GZIPInputStream constructor itself fails;
     // closing an already closed stream is harmless.
     raw.close();
   }
 }

 /**
  * Builds the line-oriented model input: one line per non-empty token piece,
  * written as the piece, a tab and the placeholder label "O"; an empty piece
  * becomes an empty line.
  */
 private static String toModelInput(String[] tokens) {
   StringBuilder input = new StringBuilder();
   for (int x = 0; x < tokens.length; x++) {
     String[] pieces = tokens[x].split("\n");
     for (int y = 0; y < pieces.length; y++) {
       if (pieces[y].length() > 0) {
         input.append(pieces[y]).append("\tO");
       }
       input.append('\n');
     }
   }
   return input.toString();
 }

 /** Runs the CRF over the prepared input, stores the result in {@link #tl} and returns it. */
 private TagList runModel(String modelInput) throws IOException {
   SerialPipes pipe = (SerialPipes) crf.getInputPipe();
   InstanceList instances = new InstanceList(pipe);
   instances.add(new LineGroupIterator(new StringReader(modelInput), EMPTY_LINE, true));

   //Process string to find genes
   tl = new GeneSegmentationOutput().tag(modelInput, crf, instances);
   return tl;
 }
}