GATE.ac.uk - gate/plugins/Web_Crawler_Websphinx/src/crawl/CrawlPR.java

/*
 *  CrawlPR.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 */
package crawl;

import gate.Corpus;
import gate.Document;
import gate.Factory;
import gate.ProcessingResource;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ExecutionInterruptedException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.HiddenCreoleParameter;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.Err;
import gate.util.Strings;

import java.net.URL;
import java.util.List;

import websphinx.Crawler;
import websphinx.DownloadParameters;


/**
 * GATE processing resource that drives a {@link SphinxWrapper} crawl.
 *
 * <p>All crawl settings are exposed as CREOLE runtime parameters (see the
 * annotated setters below). {@link #execute()} builds a fresh wrapper from
 * those settings, seeds it with the {@code root} URL and/or the source URLs
 * of the documents in the {@code source} corpus, and starts it. The
 * {@code document} and {@code corpus} parameters inherited from
 * {@link AbstractLanguageAnalyser} are hidden and ignored.
 */
@CreoleResource(name = "Crawler PR",
        comment = "GATE implementation of the Websphinx crawling API",
        helpURL = "http://gate.ac.uk/userguide/sec:misc-creole:crawler")
public class CrawlPR 
  extends AbstractLanguageAnalyser 
  implements ProcessingResource {

  private static final long serialVersionUID = 3904269406671650905L;
  @SuppressWarnings("unused")
  private static final String __SVNID = "$Id: CrawlPR.java 17699 2014-03-19 09:11:55Z markagreenwood $";

  /** Starting URL; may be null or empty when a source corpus is given. */
  private String root = null;
  /** Crawl depth; a negative value means "not yet set". */
  private int depth = -1;
  /** Corpus handed to the crawler for its output; required. */
  private Corpus outputCorpus = null;
  /** true for depth-first, false for breadth-first; required. */
  private Boolean dfs;
  private Boolean caseSensitiveKeywords;
  /** Wrapper created by the most recent call to execute(). */
  private SphinxWrapper crawler;
  /** Domain restriction; required. */
  private DomainMode domain = null;
  /** Optional corpus whose documents' source URLs seed the crawl. */
  private Corpus source = null;
  /** Stop after fetching this many pages (-1 to ignore). */
  private int maxFetch = -1;
  /** Stop after saving this many pages (-1 to ignore). */
  private int maxKeep  = -1;
  private Boolean convertXmlTypes;
  private String userAgent; // for spoofing
  private int maxPageSize;  // in kB
  
  // ignore keyword requirement if null or empty
  private List<String> keywords = null;

  /** Constructor of the class */
  public CrawlPR() {

  }

  /**
   * Initialise this resource, and return it. No crawler-specific work is
   * done here; the call is delegated to the superclass.
   */
  public Resource init() throws ResourceInstantiationException {
    return super.init();
  }

  /**
   * Reinitialises the processing resource by calling {@link #init()} again.
   * After calling this method the resource should be in the state it is
   * after calling init.
   */
  public void reInit() throws ResourceInstantiationException {
    init();
  }

  
  /**
   * Override the default behaviour by interrupting the SphinxWrapper itself:
   * the interrupted flag is raised and, if a crawler has been created, the
   * request is forwarded to it.
   */
  public void interrupt() {
    this.interrupted = true;
    if (crawler != null) {
      crawler.interrupt();
    }
    
  }
  
  
  
  /**
   * This method runs the crawler. It assumes that all the needed
   * parameters are set. If they are not, an exception will be fired.
   *
   * @throws ExecutionException if a required parameter is missing, or (as
   *         an {@link ExecutionInterruptedException}) if the interrupted
   *         flag is set once the crawler's start() call has returned
   */
  public void execute() throws ExecutionException {
    this.interrupted = false;
    prepareCrawler();
    checkRequiredParameters();

    try {
      crawler.setCorpus(outputCorpus);
      crawler.setDepth(depth);
      crawler.setDepthFirst(dfs.booleanValue());
      
      // anything other than SUBTREE or SERVER falls back to the whole web
      if(domain.equals(DomainMode.SUBTREE)) {
        crawler.setDomain(Crawler.SUBTREE);
      }
      else if(domain.equals(DomainMode.SERVER)) {
        crawler.setDomain(Crawler.SERVER);
      }
      else {
        crawler.setDomain(Crawler.WEB);
      }

      crawler.setMaxPages(maxFetch);
      crawler.setMaxKeep(maxKeep);

      if (root != null && (root.length() > 0)) {
        crawler.addStartLink(root);
      }

      if (source != null) {
        for(int i = 0; i < source.size(); i++) {
          // remember whether we are the ones forcing the document to load,
          // so that it can be released again afterwards
          boolean docWasLoaded = source.isDocumentLoaded(i);
          Document doc = (Document) source.get(i);
          URL url = doc.getSourceUrl();
          if (url != null) {
            crawler.addStartLink(url);
          }
          else {
            System.out.println("Skipping source document:" + doc.getName());
          }
          
          if(! docWasLoaded) {
            source.unloadDocument(doc);
            Factory.deleteResource(doc);
          }
        }
      }
      
      crawler.start();
      
      if (this.interrupted) {
        throw new ExecutionInterruptedException();
      }
    
    }
    catch(ExecutionException e) {
      // Must be propagated: previously the interruption signal raised just
      // above was caught by the generic handler below and silently dropped.
      throw e;
    }
    catch(Exception e) {
      // any other failure is reported but does not abort the caller
      String nl = Strings.getNl();
      Err.prln("  Exception was: " + e + nl + nl);
      e.printStackTrace();
    }
  }

  /**
   * Creates a new wrapper, stores it in the crawler field (so that
   * interrupt() can reach it) and applies the download, keyword and
   * XML-conversion settings.
   */
  private void prepareCrawler() {
    DownloadParameters downloadParameters = new DownloadParameters();
    downloadParameters = downloadParameters.changeUserAgent(userAgent);
    downloadParameters = downloadParameters.changeMaxPageSize(maxPageSize);
    
    crawler = new SphinxWrapper();
    crawler.clear();
    crawler.setDownloadParameters(downloadParameters);
    crawler.setKeywords(keywords, caseSensitiveKeywords);
    crawler.setConvertXmlTypes(convertXmlTypes);
    crawler.resetCounter();
  }

  /**
   * Verifies that the parameters the crawl cannot run without are set.
   *
   * @throws ExecutionException naming the first missing parameter
   */
  private void checkRequiredParameters() throws ExecutionException {
    if(outputCorpus == null) { 
        throw new ExecutionException("Output Corpus cannot be null");
    }

    if ( (root == null) && (source == null) ) {
        throw new ExecutionException("Either root or source must be initialized");
    }
    if(depth < 0) {
        throw new ExecutionException("Limit is not initialized");
    }
    if(dfs == null) {
        throw new ExecutionException("dfs is not initialized");
    }
    if(domain == null) {
      throw new ExecutionException("domain type is not initialized.. Set to either SERVER/SUBTREE/WEB");
    }
  }

  
  
  /*  CREOLE PARAMETERS  */
  
  /** @param root the starting URL for the crawl; may be null */
  @Optional
  @RunTime
  @CreoleParameter(comment = "The starting URL for the crawl")
  public void setRoot(String root) {
    this.root = root;
  }

  public String getRoot() {
    return this.root;
  }

  /**
   * @param limit the crawl depth; null resets it to the "not set" value
   *        (-1), which execute() rejects
   */
  @RunTime
  @CreoleParameter(comment = "The depth to which the crawl must proceed",
    defaultValue = "3")
  public void setDepth(Integer limit) {
    this.depth = (limit == null) ? -1 : limit.intValue();
  }

  public Integer getDepth() {
    return Integer.valueOf(this.depth);
  }

  /** @param dfs true for depth-first search, false for breadth-first */
  @RunTime
  @CreoleParameter(comment = "true for depth-first search; false for breadth-first search",
          defaultValue = "true")
  public void setDfs(Boolean dfs) {
    this.dfs = dfs;
  }

  public Boolean getDfs() {
    return this.dfs;
  }
  
  
  /** @param ua the HTTP user agent string passed to the download parameters */
  @Optional
  @RunTime
  @CreoleParameter(comment = "HTTP User Agent to spoof (leave blank for default)",
          defaultValue = "")
  public void setUserAgent(String ua) {
    this.userAgent = ua;
  }
  
  public String getUserAgent() {
    return this.userAgent;
  }

  /**
   * @param mps the maximum page size in kB; null resets it to 0, which the
   *        parameter comment describes as "no limit"
   */
  @Optional
  @RunTime
  @CreoleParameter(comment = "max page size in kB (0 for no limit)", defaultValue = "100")
  public void setMaxPageSize(Integer mps) {
    this.maxPageSize = (mps == null) ? 0 : mps.intValue();
  }
  
  public Integer getMaxPageSize() {
    return Integer.valueOf(this.maxPageSize);
  }
  

  /** @param domain the domain restriction for the crawl */
  @RunTime
  @CreoleParameter(comment = "The domain restriction for the crawl",
          defaultValue = "SUBTREE")
  public void setDomain(DomainMode domain) {
    this.domain = domain;
  }

  public DomainMode getDomain() {
    return this.domain;
  }

  /** @param source corpus whose documents' source URLs seed the crawl */
  @RunTime
  @Optional
  @CreoleParameter(comment = "corpus whose gate.SourceURL document features will be used to seed the crawl")
  public void setSource(Corpus source) {
    this.source = source;
  }

  public Corpus getSource() {
    return this.source;
  }

  /**
   * @param max stop after fetching this many pages; null is treated like
   *        -1 (ignore)
   */
  @RunTime
  @Optional
  @CreoleParameter(comment = "Stop the crawl after fetching this many pages (-1 to ignore)",
          defaultValue = "-1")
  public void setStopAfter(Integer max) {
    this.maxFetch = (max == null) ? -1 : max.intValue();
  }

  // stopAfter was maxFetch in AF's first revision
  public Integer getStopAfter() {
    return Integer.valueOf(this.maxFetch);
  }
  
  /**
   * @param max stop after saving this many pages; null is treated like
   *        -1 (ignore)
   */
  @RunTime
  @Optional
  @CreoleParameter(comment = "Stop the crawl after saving this many pages (-1 to ignore)",
          defaultValue = "-1")
  public void setMax(Integer max) {
    this.maxKeep = (max == null) ? -1 : max.intValue();
  }
  
  // max was maxKeep in AF's first revision;
  public Integer getMax() {
    return Integer.valueOf(this.maxKeep);
  }

  /** @param outputCorpus the corpus handed to the crawler for its output */
  @RunTime
  @CreoleParameter(comment = "Store the crawl output here")
  public void setOutputCorpus(Corpus outputCorpus) {
    this.outputCorpus = outputCorpus;
  }

  public Corpus getOutputCorpus() {
    return outputCorpus;
  }
  
  /** @param keywords keyword filter; null or empty disables filtering */
  @Optional
  @RunTime
  @CreoleParameter(comment = "Pages that don't match at least one keyword will be dropped; leave empty to keep all pages")
  public void setKeywords(List<String> keywords) {
    this.keywords = keywords;
  }
   
  public List<String> getKeywords() {
    return this.keywords;
  }

  
  /** @param kcs whether keyword matching is case-sensitive */
  @RunTime
  @CreoleParameter(comment = "Are keywords case-sensitive?",
          defaultValue = "true")
  public void setKeywordsCaseSensitive(Boolean kcs) {
    this.caseSensitiveKeywords = kcs;
  }
  
  public Boolean getKeywordsCaseSensitive() {
    return this.caseSensitiveKeywords;
  }
  
  /** @param convert whether other XML mime types are converted to text/xml */
  @RunTime
  @CreoleParameter(comment = "Convert other XML mime types to text/xml",
          defaultValue = "true")
  public void setConvertXmlTypes(Boolean convert) {
    this.convertXmlTypes = convert;
  }
  
  public Boolean getConvertXmlTypes() {
    return this.convertXmlTypes;
  }
  
  /** Hidden parameter: this PR does not use a document, so the value is discarded. */
  @HiddenCreoleParameter
  public void setDocument(Document x) {
    // NOTHING
  }

  /** Hidden parameter: this PR does not use this corpus, so the value is discarded. */
  @HiddenCreoleParameter
  public void setCorpus(Corpus x) {
    // NOTHING
  }

}