/*
* CrawlPR.java
*
* Copyright (c) 1995-2012, The University of Sheffield. See the file
* COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June 1991 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
*/
package crawl;
import gate.Corpus;
import gate.Document;
import gate.Factory;
import gate.ProcessingResource;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ExecutionInterruptedException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.HiddenCreoleParameter;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.Err;
import gate.util.Strings;
import java.net.URL;
import java.util.List;
import websphinx.Crawler;
import websphinx.DownloadParameters;
@CreoleResource(name = "Crawler PR",
comment = "GATE implementation of the Websphinx crawling API",
helpURL = "http://gate.ac.uk/userguide/sec:misc-creole:crawler")
public class CrawlPR
extends AbstractLanguageAnalyser
implements ProcessingResource {
private static final long serialVersionUID = 3904269406671650905L;
@SuppressWarnings("unused")
private static final String __SVNID = "$Id: CrawlPR.java 17699 2014-03-19 09:11:55Z markagreenwood $";
private String root = null;
private int depth = -1;
private Corpus outputCorpus = null;
private Boolean dfs;
private Boolean caseSensitiveKeywords;
private SphinxWrapper crawler;
private DomainMode domain = null;
private Corpus source = null;
private int maxFetch = -1;
private int maxKeep = -1;
private Boolean convertXmlTypes;
private String userAgent; // for spoofing
private int maxPageSize; // in kB
// ignore keyword requirement if null or empty
private List<String> keywords = null;
/** Constructor of the class */
public CrawlPR() {
}
/** Initialise this resource, and return it. */
public Resource init() throws ResourceInstantiationException {
return super.init();
}
/**
* Reinitialises the processing resource. After calling this method the
* resource should be in the state it is after calling init. If the resource
* depends on external resources (such as rules files) then the resource will
* re-read those resources. If the data used to create the resource has
* changed since the resource has been created then the resource will change
* too after calling reInit().
*/
public void reInit() throws ResourceInstantiationException {
init();
}
/**
* Override the default behaviour by interrupting the SphinxWrapper itself.
*/
public void interrupt() {
this.interrupted = true;
if (crawler != null) {
crawler.interrupt();
}
}
/**
* This method runs the crawler. It assumes that all the needed
* parameters are set. If they are not, an exception will be fired.
*/
public void execute() throws ExecutionException {
this.interrupted = false;
DownloadParameters downloadParameters = new DownloadParameters();
downloadParameters = downloadParameters.changeUserAgent(userAgent);
downloadParameters = downloadParameters.changeMaxPageSize(maxPageSize);
crawler = new SphinxWrapper();
crawler.clear();
crawler.setDownloadParameters(downloadParameters);
crawler.setKeywords(keywords, caseSensitiveKeywords);
crawler.setConvertXmlTypes(convertXmlTypes);
crawler.resetCounter();
if(outputCorpus == null) {
throw new ExecutionException("Output Corpus cannot be null");
}
if ( (root == null) && (source == null) ) {
throw new ExecutionException("Either root or source must be initialized");
}
if(depth < 0) {
throw new ExecutionException("Limit is not initialized");
}
if(dfs == null) {
throw new ExecutionException("dfs is not initialized");
}
if(domain == null) {
throw new ExecutionException("domain type is not initialized.. Set to either SERVER/SUBTREE/WEB");
}
try {
crawler.setCorpus(outputCorpus);
crawler.setDepth(depth);
crawler.setDepthFirst(dfs.booleanValue());
if(domain.equals(DomainMode.SUBTREE)) {
crawler.setDomain(Crawler.SUBTREE);
}
else if(domain.equals(DomainMode.SERVER)) {
crawler.setDomain(Crawler.SERVER);
}
else {
crawler.setDomain(Crawler.WEB);
}
crawler.setMaxPages(maxFetch);
crawler.setMaxKeep(maxKeep);
if (root != null && (root.length() > 0)) {
crawler.addStartLink(root);
}
if (source != null) {
for(int i = 0; i < source.size(); i++) {
boolean docWasLoaded = source.isDocumentLoaded(i);
Document doc = (Document) source.get(i);
URL url = doc.getSourceUrl();
if (url != null) {
crawler.addStartLink(url);
}
else {
System.out.println("Skipping source document:" + doc.getName());
}
if(! docWasLoaded) {
source.unloadDocument(doc);
Factory.deleteResource(doc);
}
}
}
crawler.start();
if (this.interrupted) {
throw new ExecutionInterruptedException();
}
}
catch(Exception e) {
String nl = Strings.getNl();
Err.prln(" Exception was: " + e + nl + nl);
e.printStackTrace();
}
}
/* CREOLE PARAMETERS */
@Optional
@RunTime
@CreoleParameter(comment = "The starting URL for the crawl")
public void setRoot(String root) {
this.root = root;
}
public String getRoot() {
return this.root;
}
@RunTime
@CreoleParameter(comment = "The depth to which the crawl must proceed",
defaultValue = "3")
public void setDepth(Integer limit) {
this.depth = limit.intValue();
}
public Integer getDepth() {
return new Integer(this.depth);
}
@RunTime
@CreoleParameter(comment = "true for depth-first search; false for breadth-first search",
defaultValue = "true")
public void setDfs(Boolean dfs) {
this.dfs = dfs;
}
public Boolean getDfs() {
return this.dfs;
}
@Optional
@RunTime
@CreoleParameter(comment = "HTTP User Agent to spoof (leave blank for default)",
defaultValue = "")
public void setUserAgent(String ua) {
this.userAgent = ua;
}
public String getUserAgent() {
return this.userAgent;
}
@Optional
@RunTime
@CreoleParameter(comment = "max page size in kB (0 for no limit)", defaultValue = "100")
public void setMaxPageSize(Integer mps) {
this.maxPageSize = mps.intValue();
}
public Integer getMaxPageSize() {
return Integer.valueOf(this.maxPageSize);
}
@RunTime
@CreoleParameter(comment = "The domain restriction for the crawl",
defaultValue = "SUBTREE")
public void setDomain(DomainMode domain) {
this.domain = domain;
}
public DomainMode getDomain() {
return this.domain;
}
@RunTime
@Optional
@CreoleParameter(comment = "corpus whose gate.SourceURL document features will be used to seed the crawl")
public void setSource(Corpus source) {
this.source = source;
}
public Corpus getSource() {
return this.source;
}
@RunTime
@Optional
@CreoleParameter(comment = "Stop the crawl after fetching this many pages (-1 to ignore)",
defaultValue = "-1")
public void setStopAfter(Integer max) {
this.maxFetch = max.intValue();
}
// stopAfter was maxFetch in AF's first revision
public Integer getStopAfter() {
return Integer.valueOf(this.maxFetch);
}
@RunTime
@Optional
@CreoleParameter(comment = "Stop the crawl after saving this many pages (-1 to ignore)",
defaultValue = "-1")
public void setMax(Integer max) {
this.maxKeep = max.intValue();
}
// max was maxKeep in AF's first revision;
public Integer getMax() {
return Integer.valueOf(this.maxKeep);
}
@RunTime
@CreoleParameter(comment = "Store the crawl output here")
public void setOutputCorpus(Corpus outputCorpus) {
this.outputCorpus = outputCorpus;
}
public Corpus getOutputCorpus() {
return outputCorpus;
}
@Optional
@RunTime
@CreoleParameter(comment = "Pages that don't match at least one keyword will be dropped; leave empty to keep all pages")
public void setKeywords(List<String> keywords) {
this.keywords = keywords;
}
public List<String> getKeywords() {
return this.keywords;
}
@RunTime
@CreoleParameter(comment = "Are keywords case-sensitive?",
defaultValue = "true")
public void setKeywordsCaseSensitive(Boolean kcs) {
this.caseSensitiveKeywords = kcs;
}
public Boolean getKeywordsCaseSensitive() {
return this.caseSensitiveKeywords;
}
@RunTime
@CreoleParameter(comment = "Convert other XML mime types to text/xml",
defaultValue = "true")
public void setConvertXmlTypes(Boolean convert) {
this.convertXmlTypes = convert;
}
public Boolean getConvertXmlTypes() {
return this.convertXmlTypes;
}
@HiddenCreoleParameter
public void setDocument(Document x) {
// NOTHING
}
@HiddenCreoleParameter
public void setCorpus(Corpus x) {
// NOTHING
}
}