/* * YahooPR.java * * Copyright (c) 1998-2004, The University of Sheffield. * * This file is part of GATE (see http://gate.ac.uk/), and is free * software, licenced under the GNU Library General Public License, * Version 2, June 1991 (in the distribution as file licence.html, * and also available at http://gate.ac.uk/gate/licence.html). * $Id: YahooPR.java 8140 2007-02-01 17:54:25Z ian_roberts $ */ package gate.yahoo; import gate.*; import gate.ProcessingResource; import gate.Resource; import gate.creole.*; import gate.corpora.*; import gate.*; import java.util.*; import com.yahoo.search.*; /** * Given a query and other required parameters, this PR searches on YAHOO to retrieve the * top n documents that match the query. Each found document is converted in to a GATE document and * populated in the provided corpus. * * @author niraj */ public class YahooPR extends AbstractLanguageAnalyser implements ProcessingResource { /** * Search Query */ private String query = null; /** * No of documents to retrieve */ private int limit = -1; /** * One has to obtain an ID to use Yahoo Search Engine. Visit www.yahoo.com for * more information. */ private String applicationID = null; /** * If set to false, the corpus is emptied prior to adding found documents by * yahoo. */ private Boolean corpusAppendMode; /** * One can specify a list of pages which need to be excluded from the search. */ private ArrayList pagesToExclude; /** * One can also specify the types of files to search for */ private String fileFormat = YahooSearch.ALL; /** * Instance of a YahooSearch - it is a real logic to search on yahoo using the * yahoo library. */ private YahooSearch searcher; /** Constructor of the class */ public YahooPR() { } /** Initialise this resource, and return it. */ public Resource init() throws ResourceInstantiationException { if(applicationID == null) { throw new ResourceInstantiationException( "ApplicationID not provided"); } searcher = new YahooSearch(applicationID); return super.init(); } /** * Reinitialises the processing resource. After calling this method the * resource should be in the state it is after calling init. If the resource * depends on external resources (such as rules files) then the resource will * re-read those resources. If the data used to create the resource has * changed since the resource has been created then the resource will change * too after calling reInit(). */ public void reInit() throws ResourceInstantiationException { init(); } /** * This method runs the coreferencer. It assumes that all the needed * parameters are set. If they are not, an exception will be fired. */ public void execute() throws ExecutionException { if(corpus == null) { throw new ExecutionException( "Corpus to store results in is not provided"); } if(query == null) { throw new ExecutionException("Query is not initialized"); } if(limit <= 0) { throw new ExecutionException("Limit is not initialized"); } if(!corpusAppendMode.booleanValue()) { while(corpus.size() > 0) { Resource resource = (Resource)corpus.get(0); corpus.remove(0); Factory.deleteResource(resource); } } try { searcher.setFormat(fileFormat); WebSearchResult[] results = searcher.search(query, this.limit); if(results == null) { return; } // for each result we need to create a gate document // and add it into the provided corpus for(int i = 0; i < results.length; i++) { String urlString = results[i].getUrl(); if(pagesToExclude != null && pagesToExclude.contains(urlString)) { continue; } try { String docName = urlString + "_" + Gate.genSym(); FeatureMap params = Factory.newFeatureMap(); params.put(Document.DOCUMENT_URL_PARAMETER_NAME, urlString); Document doc = (Document)Factory.createResource(DocumentImpl.class .getName(), params, null, docName); corpus.add(doc); if(corpus.getLRPersistenceId() != null) { // persistent corpus -> unload the document corpus.unloadDocument(doc); Factory.deleteResource(doc); } } catch(Exception e) { System.out.println("Ignoring : " + urlString); } } } catch(Exception e) { throw new ExecutionException(e); } } public void setQuery(String query) { this.query = query; } public String getQuery() { return this.query; } /** * One has to obtain an ID to use Yahoo Search Engine. Visit www.yahoo.com for * more information. */ public void setApplicationID(String key) { this.applicationID = key; } /** * Returns the set application ID * * @return */ public String getApplicationID() { return this.applicationID; } /** * Number of documents to search for * * @param limit */ public void setLimit(Integer limit) { this.limit = limit.intValue(); } /** * Number of documents to search for * * @return */ public Integer getLimit() { return new Integer(this.limit); } /** * The corpus in which all the found documents are populated */ public Corpus getCorpus() { return corpus; } /** * The corpus in which all the found documents are populated */ public void setCorpus(Corpus corpus) { this.corpus = corpus; } /** * If set to false, the corpus is emptied prior to adding found documents by * yahoo. */ public void setCorpusAppendMode(Boolean appendMode) { this.corpusAppendMode = appendMode; } /** * If set to false, the corpus is emptied prior to adding found documents by * yahoo. */ public Boolean getCorpusAppendMode() { return this.corpusAppendMode; } /** * List of pages which need to be excluded from the search. */ public void setPagesToExclude(List pagesToExclude) { this.pagesToExclude = new ArrayList(); // pagesToExclude is an optional param. // If it is null, the list should be empty. if(pagesToExclude == null) return; Iterator iterator = pagesToExclude.iterator(); while(iterator.hasNext()) { String page = (String)iterator.next(); page = page.toLowerCase(); this.pagesToExclude.add(page); } } /** * A list of pages which need to be excluded from the search. */ public List getPagesToExclude() { return this.pagesToExclude; } /** * Supported File Formats: "all", "html", "msword", "pdf", "ppt", "rss", * "txt", "xls" */ public String getFileFormat() { return fileFormat; } /** * Supported File Formats: "all", "html", "msword", "pdf", "ppt", "rss", * "txt", "xls" */ public void setFileFormat(String fileFormat) { this.fileFormat = fileFormat; } }