GATE.ac.uk - releases/gate-5.1-beta2-build3402-ALL/plugins/Web_Crawler

/*
 *  CrawlPR.java
 *
 *  Copyright (c) 1998-2004, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Google API and other sources subject to Google License. Please
 *  see http://www.google.com/apis/
 */

package crawl;

import gate.ProcessingResource;
import gate.Resource;
import gate.creole.*;
import gate.gui.MainFrame;
import gate.corpora.*;
import gate.util.*;
import gate.*;

import websphinx.*;

public class CrawlPR extends AbstractLanguageAnalyser implements
                                                     ProcessingResource {

  private String root = null;
  private int depth = -1;
  private Corpus outputCorpus = null;
  private Boolean dfs = null;
  private SphinxWrapper crawler;
  private String domain = null;
  private Corpus source = null;
  private int max = -1;

  /** Constructor of the class */
  public CrawlPR() {

  }

  /** Initialise this resource, and return it. */
  public Resource init() throws ResourceInstantiationException {
    return super.init();
  }

  /**
   * Reinitialises the processing resource. After calling this method the
   * resource should be in the state it is after calling init. If the resource
   * depends on external resources (such as rules files) then the resource will
   * re-read those resources. If the data used to create the resource has
   * changed since the resource has been created then the resource will change
   * too after calling reInit().
   */
  public void reInit() throws ResourceInstantiationException {
    init();
  }

  /**
   * This method runs the coreferencer. It assumes that all the needed
   * parameters are set. If they are not, an exception will be fired.
   */
  public void execute() throws ExecutionException {
    crawler = new SphinxWrapper();
    if(outputCorpus == null) { throw new ExecutionException(
      "Output Corpus cannot be null"); }

    if(root == null && source == null) { throw new ExecutionException(
      "Either root or source must be initialized"); }
    if(depth == -1) { throw new ExecutionException("Limit is not initialized"); }
    if(dfs == null) { throw new ExecutionException("dfs is not initialized"); }
    if(domain == null) { throw new ExecutionException(
      "domain type is not initialized.. Set to either SERVER/SUBTREE/WEB"); }

    try {
      crawler.setCorpus(outputCorpus);
      crawler.setDepth(depth);
      crawler.setDepthFirst(dfs.booleanValue());
      if(domain == "SUBTREE") {
        crawler.setDomain(Crawler.SUBTREE);
      }
      else if(domain == "SERVER") {
        crawler.setDomain(Crawler.SERVER);
      }
      else {
        crawler.setDomain(Crawler.WEB);
      }
      if(max != -1) {
        crawler.setMaxPages(max);
      }
      if(root != null && root != "") {
        crawler.setStart(root);
      }
      else {
        CorpusImpl roots = (CorpusImpl)source;
        // System.out.println("using the
        // outputCorpus"+roots.getDocumentName(0));
        Object rootArray[] = roots.toArray();
        for(int i = 0; i < rootArray.length; i++) {
          DocumentImpl doc = (DocumentImpl)rootArray[i];
          System.out.println("adding ... " + doc.getSourceUrl().toString()
            + "\n");
          crawler.setStart(doc.getSourceUrl());
        }
      }

      crawler.start();

    }
    catch(Exception e) {
      String nl = Strings.getNl();
      Err.prln("  Exception was: " + e + nl + nl);
    }
  }

  public void setRoot(String root) {
    this.root = root;
  }

  public String getRoot() {
    return this.root;
  }

  public void setDepth(Integer limit) {
    this.depth = limit.intValue();
  }

  public Integer getDepth() {
    return new Integer(this.depth);
  }

  public void setDfs(Boolean dfs) {
    this.dfs = dfs;
  }

  public Boolean getDfs() {
    return this.dfs;
  }

  public void setDomain(String domain) {
    this.domain = domain;
  }

  public String getDomain() {
    return this.domain;
  }

  public void setSource(Corpus source) {
    this.source = source;
  }

  public Corpus getSource() {
    return this.source;
  }

  public void setMax(Integer max) {
    this.max = max.intValue();
  }

  public Integer getMax() {
    return new Integer(this.max);
  }

  public Corpus getOutputCorpus() {
    return outputCorpus;
  }

  public void setOutputCorpus(Corpus outputCorpus) {
    this.outputCorpus = outputCorpus;
  }

}