/*
 *  GooglePR.java
 *
 *  Copyright (c) 1998-2004, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Google API and other sources subject to Google License. Please
 *  see http://www.google.com/apis/
 *
 *  $Id: GooglePR.java 7451 2006-06-15 14:10:50Z ian_roberts $
 */
package google;

import gate.ProcessingResource;
import gate.Resource;
import gate.creole.*;
import gate.gui.MainFrame;
import gate.corpora.*;
import gate.util.*;
import gate.*;
import java.util.*;
import com.google.soap.search.*;

/**
 * A processing resource that runs a Google search for a query string and
 * adds one GATE document per returned result URL to a target corpus.
 *
 * The parameters (query, key, limit, corpus, corpusAppendMode and the
 * optional pagesToExclude) are supplied through the bean-style setters
 * below and are validated when {@link #execute()} is called.
 */
public class GooglePR extends AbstractLanguageAnalyser
                      implements ProcessingResource {

  /**
   * Step by which the start index is advanced between two requests.
   * NOTE(review): presumably this mirrors the service's per-request result
   * cap - confirm against the Google SOAP API documentation.
   */
  private static final int PAGE_SIZE = 10;

  private final static boolean DEBUG = false;

  /** The query string sent to the search service. */
  private String query = null;

  /** Maximum number of results to fetch; -1 means "not initialised". */
  private int limit = -1;

  /** The authorisation key handed to the search service. */
  private String key = null;

  /** The corpus the result documents are added to. */
  private Corpus google = null;

  /** If false, the corpus is emptied before new results are added. */
  private Boolean corpusAppendMode;

  /** Lower-cased URLs of pages that must not be added to the corpus. */
  private ArrayList pagesToExclude;

  /** Constructor of the class. */
  public GooglePR() {
  }

  /** Initialise this resource, and return it. */
  public Resource init() throws ResourceInstantiationException {
    return super.init();
  }

  /**
   * Reinitialises the processing resource. After calling this method the
   * resource should be in the state it is after calling init.
   * This implementation simply delegates to {@link #init()}.
   */
  public void reInit() throws ResourceInstantiationException {
    init();
  }

  /**
   * Runs the search and populates the corpus with the result pages.
   *
   * All mandatory parameters are checked first. Unless append mode is on,
   * the corpus is emptied and its previous members are deleted. Results are
   * then requested page by page until <code>limit</code> results have been
   * asked for or the service returns an empty page. Result URLs listed in
   * pagesToExclude (compared case-insensitively) are skipped.
   *
   * Any exception raised while searching is reported on the error stream
   * and ends the search; it is not propagated to the caller. A document
   * that cannot be instantiated is reported and skipped.
   *
   * @throws ExecutionException if the corpus, query, limit, key or corpus
   *         append mode has not been set.
   */
  public void execute() throws ExecutionException {
    if (google == null) {
      throw new ExecutionException(
              "Corpus to store results in is not provided");
    }
    if (query == null) {
      throw new ExecutionException("Query is not initialized");
    }
    if (limit == -1) {
      throw new ExecutionException("Limit is not initialized");
    }
    if (key == null) {
      throw new ExecutionException("Key is not initialized");
    }
    // Previously an unset append mode surfaced as a NullPointerException;
    // report it the same way as every other missing parameter.
    if (corpusAppendMode == null) {
      throw new ExecutionException("Corpus append mode is not initialized");
    }

    if (!corpusAppendMode.booleanValue()) {
      emptyCorpus();
    }

    // Create a Google Search object, set our authorization key and query
    GoogleSearch search = new GoogleSearch();
    search.setKey(key);
    search.setQueryString(query);

    int index = 0;
    try {
      while (index < limit) {
        search.setStartResult(index);
        // Only the last, partial page needs an explicit result cap.
        if (limit - index < PAGE_SIZE) {
          search.setMaxResults(limit - index);
        }

        GoogleSearchResult results = search.doSearch();
        GoogleSearchResultElement[] rs = results.getResultElements();

        // An empty page means there is nothing further to fetch, so stop
        // instead of issuing more requests up to the limit.
        if (rs == null || rs.length == 0) {
          break;
        }

        for (int i = 0; i < rs.length; i++) {
          String urlString = rs[i].getURL();
          if (DEBUG) Err.println(index + i + ") " + urlString);

          if (isExcluded(urlString)) {
            continue;
          }
          addDocument(urlString);
        }
        index += PAGE_SIZE;
      }
    } catch (Exception gsf) {
      // Best effort: keep whatever has been collected so far.
      Err.println("Google Search Fault: " + gsf.getMessage());
    }
  }

  /** Removes every member from the corpus and deletes it as a resource. */
  private void emptyCorpus() {
    while (google.size() > 0) {
      Resource resource = (Resource) google.get(0);
      google.remove(0);
      Factory.deleteResource(resource);
    }
  }

  /**
   * Tells whether the given URL is on the exclusion list. The list holds
   * lower-cased entries, so the URL is lower-cased before the lookup.
   */
  private boolean isExcluded(String urlString) {
    if (pagesToExclude == null || urlString == null) {
      return false;
    }
    return pagesToExclude.contains(urlString.toLowerCase(Locale.ENGLISH));
  }

  /**
   * Creates a document from the given URL and adds it to the corpus. A
   * failure to instantiate the document is reported as a warning only.
   */
  private void addDocument(String urlString) {
    String docName = urlString + "_" + Gate.genSym();
    FeatureMap params = Factory.newFeatureMap();
    params.put(Document.DOCUMENT_URL_PARAMETER_NAME, urlString);
    try {
      Document doc = (Document) Factory.createResource(
              DocumentImpl.class.getName(), params, null, docName);
      google.add(doc);
    } catch (ResourceInstantiationException e) {
      Err.prln("WARNING: could not intantiate document :" + e.getMessage());
    }
  }

  /** Sets the query string to search for. */
  public void setQuery(String query) {
    this.query = query;
  }

  /** Returns the query string, or null if none has been set. */
  public String getQuery() {
    return this.query;
  }

  /** Sets the authorisation key passed to the search service. */
  public void setKey(String key) {
    this.key = key;
  }

  /** Returns the authorisation key, or null if none has been set. */
  public String getKey() {
    return this.key;
  }

  /**
   * Sets the maximum number of results to fetch. A null value resets the
   * limit to the "not initialised" marker (-1), which execute() rejects.
   */
  public void setLimit(Integer limit) {
    this.limit = (limit == null) ? -1 : limit.intValue();
  }

  /** Returns the result limit; -1 if it has not been set. */
  public Integer getLimit() {
    return new Integer(this.limit);
  }

  /** Returns the corpus that receives the result documents. */
  public Corpus getCorpus() {
    return google;
  }

  /** Sets the corpus that receives the result documents. */
  public void setCorpus(Corpus corpus) {
    this.google = corpus;
  }

  /**
   * Sets the append mode: if false, execute() empties the corpus before
   * adding the new results.
   */
  public void setCorpusAppendMode(Boolean appendMode) {
    this.corpusAppendMode = appendMode;
  }

  /** Returns the append mode, or null if it has not been set. */
  public Boolean getCorpusAppendMode() {
    return this.corpusAppendMode;
  }

  /**
   * Sets the URLs of the pages that must not be added to the corpus. The
   * entries are copied and stored in lower case. This is an optional
   * parameter: a null argument results in an empty list, and null entries
   * are ignored.
   *
   * @param pagesToExclude a list of URL strings, may be null.
   */
  public void setPagesToExclude(List pagesToExclude) {
    this.pagesToExclude = new ArrayList();
    if (pagesToExclude == null) {
      return;
    }
    Iterator iterator = pagesToExclude.iterator();
    while (iterator.hasNext()) {
      String page = (String) iterator.next();
      if (page == null) {
        continue;
      }
      this.pagesToExclude.add(page.toLowerCase(Locale.ENGLISH));
    }
  }

  /**
   * Returns the internal list of lower-cased URLs to exclude, or null if
   * setPagesToExclude has never been called.
   */
  public List getPagesToExclude() {
    return this.pagesToExclude;
  }
}