/*
 *  CrawlPR.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 */
package crawl;

import gate.Corpus;
import gate.Document;
import gate.Factory;
import gate.ProcessingResource;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ExecutionInterruptedException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.HiddenCreoleParameter;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.Err;
import gate.util.Strings;

import java.net.URL;
import java.util.List;

import websphinx.Crawler;
import websphinx.DownloadParameters;

/**
 * GATE processing resource exposing the Websphinx crawling API. The
 * parameters set through the annotated setters below are validated in
 * {@code execute()}, which creates a fresh {@code SphinxWrapper} on each run.
 *
 * <p>NOTE(review): {@code RunTime}, {@code Optional} and
 * {@code HiddenCreoleParameter} are imported but no use of them is visible in
 * this copy of the file; presumably some setter annotations have been lost --
 * confirm against version control.
 */
@CreoleResource(name = "Crawler PR",
        comment = "GATE implementation of the Websphinx crawling API",
        helpURL = "http://gate.ac.uk/userguide/sec:misc-creole:crawler")
public class CrawlPR 
  extends AbstractLanguageAnalyser 
  implements ProcessingResource {

  private static final long serialVersionUID = 3904269406671650905L;
  // Subversion keyword string; not referenced anywhere in the visible code.
  private static final String __SVNID = "$Id: CrawlPR.java 17699 2014-03-19 09:11:55Z markagreenwood $";

  // Starting URL for the crawl; execute() requires this or 'source' to be non-null.
  private String root = null;
  // Crawl depth; stays -1 until set, and execute() rejects negative values.
  private int depth = -1;
  // Corpus that receives the crawl output; execute() rejects null.
  private Corpus outputCorpus = null;
  // true = depth-first, false = breadth-first; null means "not set" and is rejected by execute().
  private Boolean dfs;
  // Passed to the crawler together with 'keywords'.
  private Boolean caseSensitiveKeywords;
  // Created anew by every execute() call; interrupt() also consults it, hence a field.
  private SphinxWrapper crawler;
  // Domain restriction for the crawl; execute() rejects null.
  private DomainMode domain = null;
  // Optional seed corpus: execute() reads each document's source URL.
  private Corpus source = null;
  // Exposed as the "stopAfter" parameter; -1 means "ignore".
  private int maxFetch = -1;
  // Exposed as the "max" parameter; -1 means "ignore".
  private int maxKeep  = -1;
  // Exposed as the "convertXmlTypes" parameter.
  private Boolean convertXmlTypes;
  private String userAgent; // for spoofing
  private int maxPageSize;  // in kB
  // ignore keyword requirement if null or empty
  private List<String> keywords = null;

  /**
   * Constructor of the class. It does no work of its own; the fields keep
   * their declared initial values until the parameter setters are called.
   */
  public CrawlPR() {
  }


  /** Initialise this resource, and return it. */
  public Resource init() throws ResourceInstantiationException {
    return super.init();

  /**
   * Reinitialises the processing resource. After calling this method the
   * resource should be in the state it is after calling init. If the resource
   * depends on external resources (such as rules files) then the resource will
   * re-read those resources. If the data used to create the resource has
   * changed since the resource has been created then the resource will change
   * too after calling reInit().
   */
  public void reInit() throws ResourceInstantiationException {
    // NOTE(review): only the signature is visible in this copy of the file;
    // the method body and its closing brace are missing and must be restored
    // from version control before this class can compile.

  /**
   * Override the default behaviour by interrupting the SphinxWrapper itself.
   */
  public void interrupt() {
    // Raise the flag that execute() checks before it finishes.
    this.interrupted = true;
    // crawler is only assigned inside execute(), so it is still null if no
    // run has started yet.
    // NOTE(review): the body of this 'if' and the closing braces are missing
    // from this copy of the file; presumably the running crawler is told to
    // stop here -- confirm against version control.
    if (crawler != null) {
  /**
   * This method runs the crawler. It assumes that all the needed
   * parameters are set. If they are not, an ExecutionException is thrown.
   */
  public void execute() throws ExecutionException {
    // NOTE(review): this copy of the method has lost every closing brace and
    // several statements (most branch bodies below are empty). The comments
    // describe only what is still visible; restore the missing lines from
    // version control before building.

    // Clear any interrupt request left over from an earlier run.
    this.interrupted = false;
    // Build the download settings from the userAgent and maxPageSize
    // parameters; the result of each change* call is reassigned.
    DownloadParameters downloadParameters = new DownloadParameters();
    downloadParameters = downloadParameters.changeUserAgent(userAgent);
    downloadParameters = downloadParameters.changeMaxPageSize(maxPageSize);
    // NOTE(review): downloadParameters is not used again in the visible
    // lines; presumably it is handed to the crawler in a missing statement.
    // A fresh wrapper is created for every run, before the parameters are validated.
    crawler = new SphinxWrapper();
    crawler.setKeywords(keywords, caseSensitiveKeywords);
    // ---- parameter validation: each failed check aborts the run ----
    if(outputCorpus == null) { 
        throw new ExecutionException("Output Corpus cannot be null");

    // At least one seed is needed: a root URL or a source corpus.
    if ( (root == null) && (source == null) ) {
        throw new ExecutionException("Either root or source must be initialized");
    // depth keeps its initial value of -1 until setDepth is called.
    if(depth < 0) {
        throw new ExecutionException("Limit is not initialized");
    if(dfs == null) {
        throw new ExecutionException("dfs is not initialized");
    if(domain == null) {
      throw new ExecutionException("domain type is not initialized.. Set to either SERVER/SUBTREE/WEB");

    try {
      // Three-way dispatch on the domain restriction; the final 'else'
      // covers every value other than SUBTREE and SERVER.
      // NOTE(review): the bodies of all three branches are missing here.
      if(domain.equals(DomainMode.SUBTREE)) {
      else if(domain.equals(DomainMode.SERVER)) {
      else {


      // An empty root string is treated the same as no root at all.
      // NOTE(review): the body of this branch is missing here.
      if (root != null && (root.length() > 0)) {

      // Walk the source corpus and look at each document's source URL.
      if (source != null) {
        for(int i = 0; i < source.size(); i++) {
          // Record the loaded state BEFORE get(i), which is consulted again
          // below once the URL has been read.
          boolean docWasLoaded = source.isDocumentLoaded(i);
          Document doc = (Document) source.get(i);
          URL url = doc.getSourceUrl();
          // NOTE(review): the body of the non-null branch is missing here.
          if (url != null) {
          else {
            System.out.println("Skipping source document:" + doc.getName());
          // NOTE(review): body missing; presumably documents that were not
          // loaded before this lookup are released again (Factory is
          // imported but unused in the visible lines) -- confirm.
          if(! docWasLoaded) {
      // Report an interrupt() request that arrived during the run.
      if (this.interrupted) {
        throw new ExecutionInterruptedException();
    // NOTE(review): as visible here, this catch-all also receives the
    // ExecutionInterruptedException thrown just above, and no rethrow is
    // visible -- confirm whether the interruption is meant to reach the caller.
    catch(Exception e) {
      String nl = Strings.getNl();
      Err.prln("  Exception was: " + e + nl + nl);

  @CreoleParameter(comment = "The starting URL for the crawl")
  public void setRoot(String root) {
    this.root = root;

  public String getRoot() {
    return this.root;

  @CreoleParameter(comment = "The depth to which the crawl must proceed",
    defaultValue = "3")
  public void setDepth(Integer limit) {
    this.depth = limit.intValue();

  public Integer getDepth() {
    return new Integer(this.depth);

  @CreoleParameter(comment = "true for depth-first search; false for breadth-first search",
          defaultValue = "true")
  public void setDfs(Boolean dfs) {
    this.dfs = dfs;

  public Boolean getDfs() {
    return this.dfs;
  @CreoleParameter(comment = "HTTP User Agent to spoof (leave blank for default)",
          defaultValue = "")
  public void setUserAgent(String ua) {
    this.userAgent = ua;
  public String getUserAgent() {
    return this.userAgent;

  @CreoleParameter(comment = "max page size in kB (0 for no limit)", defaultValue = "100")
  public void setMaxPageSize(Integer mps) {
    this.maxPageSize = mps.intValue();
  public Integer getMaxPageSize() {
    return Integer.valueOf(this.maxPageSize);

  @CreoleParameter(comment = "The domain restriction for the crawl",
          defaultValue = "SUBTREE")
  public void setDomain(DomainMode domain) {
    this.domain = domain;

  public DomainMode getDomain() {
    return this.domain;

  @CreoleParameter(comment = "corpus whose gate.SourceURL document features will be used to seed the crawl")
  public void setSource(Corpus source) {
    this.source = source;

  public Corpus getSource() {
    return this.source;

  @CreoleParameter(comment = "Stop the crawl after fetching this many pages (-1 to ignore)",
          defaultValue = "-1")
  public void setStopAfter(Integer max) {
    this.maxFetch = max.intValue();

  // stopAfter was maxFetch in AF's first revision
  public Integer getStopAfter() {
    return Integer.valueOf(this.maxFetch);
  @CreoleParameter(comment = "Stop the crawl after saving this many pages (-1 to ignore)",
          defaultValue = "-1")
  public void setMax(Integer max) {
    this.maxKeep = max.intValue();
  // max was maxKeep in AF's first revision;
  public Integer getMax() {
    return Integer.valueOf(this.maxKeep);

  @CreoleParameter(comment = "Store the crawl output here")
  public void setOutputCorpus(Corpus outputCorpus) {
    this.outputCorpus = outputCorpus;

  public Corpus getOutputCorpus() {
    return outputCorpus;
  @CreoleParameter(comment = "Pages that don't match at least one keyword will be dropped; leave empty to keep all pages")
  public void setKeywords(List<String> keywords) {
    this.keywords = keywords;
  public List<String> getKeywords() {
    return this.keywords;

  @CreoleParameter(comment = "Are keywords case-sensitive?",
          defaultValue = "true")
  public void setKeywordsCaseSensitive(Boolean kcs) {
    this.caseSensitiveKeywords = kcs;
  public Boolean getKeywordsCaseSensitive() {
    return this.caseSensitiveKeywords;
  @CreoleParameter(comment = "Convert other XML mime types to text/xml",
          defaultValue = "true")
  public void setConvertXmlTypes(Boolean convert) {
    this.convertXmlTypes = convert;
  public Boolean getConvertXmlTypes() {
    return this.convertXmlTypes;
  public void setDocument(Document x) {
    // NOTHING

  public void setCorpus(Corpus x) {
    // NOTHING
