Log in Help
Print
Homereleasesgate-5.1-beta2-build3402-ALLpluginsWeb_Search_Yahoosrcgateyahoo 〉 YahooPR.java
 
/*
 *  YahooPR.java
 *
 *  Copyright (c) 1998-2004, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *  $Id: YahooPR.java 8140 2007-02-01 17:54:25Z ian_roberts $  
 */
package gate.yahoo;

import gate.*;
import gate.ProcessingResource;
import gate.Resource;
import gate.creole.*;
import gate.corpora.*;
import gate.*;
import java.util.*;
import com.yahoo.search.*;

/**
 * Processing resource that runs a query against the Yahoo search service and
 * stores the hits in a GATE corpus. Each returned URL (up to the configured
 * limit) is turned into a GATE document and added to the corpus supplied as a
 * runtime parameter.
 * 
 * @author niraj
 */
public class YahooPR extends AbstractLanguageAnalyser implements
                                                     ProcessingResource {
  /**
   * Search Query
   */
  private String query = null;

  /**
   * No of documents to retrieve. Any value <= 0 means "not set".
   */
  private int limit = -1;

  /**
   * One has to obtain an ID to use Yahoo Search Engine. Visit www.yahoo.com for
   * more information.
   */
  private String applicationID = null;

  /**
   * If set to false, the corpus is emptied prior to adding found documents by
   * yahoo. When left unset (null) the corpus is not emptied.
   */
  private Boolean corpusAppendMode;

  /**
   * Lower-cased URLs of pages which need to be excluded from the search
   * results. May be null if the parameter was never set.
   */
  private List pagesToExclude;

  /**
   * One can also specify the types of files to search for
   */
  private String fileFormat = YahooSearch.ALL;

  /**
   * Instance of a YahooSearch - it holds the real logic to search on yahoo
   * using the yahoo library. Created in {@link #init()}.
   */
  private YahooSearch searcher;

  /** Constructor of the class */
  public YahooPR() {
  }

  /**
   * Initialise this resource, and return it.
   * 
   * @return the initialised resource, as returned by the superclass
   * @throws ResourceInstantiationException if no application ID has been set
   */
  public Resource init() throws ResourceInstantiationException {
    if(applicationID == null) { throw new ResourceInstantiationException(
            "ApplicationID not provided"); }
    searcher = new YahooSearch(applicationID);
    return super.init();
  }

  /**
   * Reinitialises the processing resource by calling {@link #init()} again,
   * which creates a fresh searcher from the current application ID.
   * 
   * @throws ResourceInstantiationException if no application ID has been set
   */
  public void reInit() throws ResourceInstantiationException {
    init();
  }

  /**
   * Runs the search and populates the corpus with one document per result.
   * <p>
   * The corpus, query and a positive limit must be set. If the append mode is
   * explicitly false the corpus is emptied first. Results whose URL is in the
   * exclusion list are skipped, and a result whose document cannot be created
   * is reported on standard output and skipped.
   * 
   * @throws ExecutionException if a required parameter is missing or the
   *           search itself fails
   */
  public void execute() throws ExecutionException {
    if(corpus == null) { throw new ExecutionException(
            "Corpus to store results in is not provided"); }
    if(query == null) { throw new ExecutionException("Query is not initialized"); }
    if(limit <= 0) { throw new ExecutionException("Limit is not initialized"); }
    // only an explicit "false" empties the corpus; an unset (null) flag
    // previously caused a NullPointerException here
    if(Boolean.FALSE.equals(corpusAppendMode)) {
      emptyCorpus();
    }
    try {
      searcher.setFormat(fileFormat);
      WebSearchResult[] results = searcher.search(query, this.limit);
      if(results == null) { return; }
      // for each result we need to create a gate document
      // and add it into the provided corpus
      for(int i = 0; i < results.length; i++) {
        String urlString = results[i].getUrl();
        if(isExcluded(urlString)) {
          continue;
        }
        try {
          addDocument(urlString);
        } catch(Exception e) {
          // best effort: one bad page must not abort the whole run, but
          // say why it was skipped
          System.out.println("Ignoring : " + urlString + " (" + e + ")");
        }
      }
    } catch(Exception e) {
      throw new ExecutionException(e);
    }
  }

  /**
   * Removes every document from the corpus and deletes it as a resource.
   */
  private void emptyCorpus() {
    while(corpus.size() > 0) {
      Resource resource = (Resource)corpus.get(0);
      corpus.remove(0);
      Factory.deleteResource(resource);
    }
  }

  /**
   * Tells whether the given result URL is in the exclusion list. The list
   * holds lower-cased entries, so the URL is lower-cased (with the same
   * locale) before the lookup.
   */
  private boolean isExcluded(String urlString) {
    if(urlString == null || pagesToExclude == null) { return false; }
    return pagesToExclude.contains(urlString.toLowerCase(Locale.ENGLISH));
  }

  /**
   * Creates a GATE document for the given URL and adds it to the corpus. For
   * a persistent corpus the document is unloaded and deleted again straight
   * after it has been added.
   */
  private void addDocument(String urlString) throws Exception {
    String docName = urlString + "_" + Gate.genSym();
    FeatureMap params = Factory.newFeatureMap();
    params.put(Document.DOCUMENT_URL_PARAMETER_NAME, urlString);
    Document doc = (Document)Factory.createResource(DocumentImpl.class
            .getName(), params, null, docName);
    corpus.add(doc);
    if(corpus.getLRPersistenceId() != null) {
      // persistent corpus -> unload the document
      corpus.unloadDocument(doc);
      Factory.deleteResource(doc);
    }
  }

  /**
   * Sets the search query.
   * 
   * @param query the query to run
   */
  public void setQuery(String query) {
    this.query = query;
  }

  /**
   * Returns the search query.
   * 
   * @return the query, or null if none has been set
   */
  public String getQuery() {
    return this.query;
  }

  /**
   * One has to obtain an ID to use Yahoo Search Engine. Visit www.yahoo.com for
   * more information.
   * 
   * @param key the application ID
   */
  public void setApplicationID(String key) {
    this.applicationID = key;
  }

  /**
   * Returns the set application ID
   * 
   * @return the application ID, or null if none has been set
   */
  public String getApplicationID() {
    return this.applicationID;
  }

  /**
   * Number of documents to search for
   * 
   * @param limit the number of documents; null resets the limit to "not set"
   */
  public void setLimit(Integer limit) {
    this.limit = (limit == null) ? -1 : limit.intValue();
  }

  /**
   * Number of documents to search for
   * 
   * @return the current limit; -1 is the initial "not set" value
   */
  public Integer getLimit() {
    return new Integer(this.limit);
  }

  /**
   * The corpus in which all the found documents are populated
   */
  public Corpus getCorpus() {
    return corpus;
  }

  /**
   * The corpus in which all the found documents are populated
   */
  public void setCorpus(Corpus corpus) {
    this.corpus = corpus;
  }

  /**
   * If set to false, the corpus is emptied prior to adding found documents by
   * yahoo.
   */
  public void setCorpusAppendMode(Boolean appendMode) {
    this.corpusAppendMode = appendMode;
  }

  /**
   * If set to false, the corpus is emptied prior to adding found documents by
   * yahoo.
   */
  public Boolean getCorpusAppendMode() {
    return this.corpusAppendMode;
  }

  /**
   * List of pages which need to be excluded from the search. The entries are
   * copied and stored in lower case; null entries are skipped.
   * 
   * @param pagesToExclude the URLs to exclude; null clears the list
   */
  public void setPagesToExclude(List pagesToExclude) {
    List lowered = new ArrayList();
    // pagesToExclude is an optional param.
    // If it is null, the list should be empty.
    if(pagesToExclude != null) {
      for(Iterator iterator = pagesToExclude.iterator(); iterator.hasNext();) {
        Object page = iterator.next();
        if(page == null) { continue; }
        lowered.add(page.toString().toLowerCase(Locale.ENGLISH));
      }
    }
    this.pagesToExclude = lowered;
  }

  /**
   * A list of pages which need to be excluded from the search.
   * 
   * @return the lower-cased exclusion list, or null if it was never set
   */
  public List getPagesToExclude() {
    return this.pagesToExclude;
  }

  /**
   * Supported File Formats: "all", "html", "msword", "pdf", "ppt", "rss",
   * "txt", "xls"
   */
  public String getFileFormat() {
    return fileFormat;
  }

  /**
   * Supported File Formats: "all", "html", "msword", "pdf", "ppt", "rss",
   * "txt", "xls"
   */
  public void setFileFormat(String fileFormat) {
    this.fileFormat = fileFormat;
  }
}