/*
 * NutchCrawler.java
 * Copyright (c) 1998-2008, The University of Sheffield.
 *
 * This code is from the GATE project (http://gate.ac.uk/) and is free
 * software licenced under the GNU General Public License version 3. It is
 * distributed without any warranty. For more details see COPYING.txt in the
 * top level directory (or at http://gatewiki.sf.net/COPYING.txt).
 *
 * Niraj Aswani 13 March 2009
 */

package gate.solr;

import java.util.*;
import java.text.*;

// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.nutch.fetcher.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.nutch.parse.ParseSegment;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.crawl.*;

/**
 * Literally a copy of Nutch's Crawl.java. The only difference is that it uses
 * the Fetcher2 implementation instead of the Fetcher implementation.
 */
public class NutchCrawler {

  // logging
  public static final Log LOG = LogFactory.getLog(NutchCrawler.class);

  /** Returns the current date in the yyyyMMddHHmmss format. */
  private static String getDate() {
    return new SimpleDateFormat("yyyyMMddHHmmss")
        .format(new Date(System.currentTimeMillis()));
  }

  /** Performs a complete crawl given a set of root URLs. */
  public static void main(String args[]) throws Exception {
    System.out.println("Using Fetcher2 implementation");
    if (args.length < 1) {
      System.out.println("Usage: NutchCrawler "
          + "<urlDir> [-dir d] [-threads n] [-depth i] [-topN N]");
      return;
    }

    // create a configuration object
    Configuration conf = NutchConfiguration.create();
    conf.addResource("crawl-tool.xml");
    JobConf job = new NutchJob(conf);

    Path rootUrlDir = null;
    Path dir = new Path("crawl-" + getDate());
    int threads = job.getInt("fetcher.threads.fetch", 10);
    int depth = 5;
    long topN = Long.MAX_VALUE;

    // parse the command-line arguments
    for (int i = 0; i < args.length; i++) {
      if ("-dir".equals(args[i])) {
        dir = new Path(args[i + 1]);
        i++;
      } else if ("-threads".equals(args[i])) {
        threads = Integer.parseInt(args[i + 1]);
        i++;
      } else if ("-depth".equals(args[i])) {
        depth = Integer.parseInt(args[i + 1]);
        i++;
      } else if ("-topN".equals(args[i])) {
        // topN is a long, so parse the full long range rather than an int
        topN = Long.parseLong(args[i + 1]);
        i++;
      } else if (args[i] != null) {
        rootUrlDir = new Path(args[i]);
      }
    }

    // retained from Crawl.java; not used in this trimmed-down version
    FileSystem fs = FileSystem.get(job);

    if (LOG.isInfoEnabled()) {
      LOG.info("crawl started in: " + dir);
      LOG.info("rootUrlDir = " + rootUrlDir);
      LOG.info("threads = " + threads);
      LOG.info("depth = " + depth);
      if (topN != Long.MAX_VALUE) LOG.info("topN = " + topN);
    }

    Path crawlDb = new Path(dir + "/crawldb");
    Path linkDb = new Path(dir + "/linkdb");
    Path segments = new Path(dir + "/segments");
    // retained from Crawl.java; not used in this trimmed-down version
    Path tmpDir = job.getLocalPath("crawl" + Path.SEPARATOR + getDate());

    Injector injector = new Injector(conf);
    Generator generator = new Generator(conf);

    // using the Fetcher2 implementation
    Fetcher2 fetcher = new Fetcher2(conf);

    // the original Fetcher implementation
    // Fetcher fetcher = new Fetcher(conf);

    // segment parser
    ParseSegment parseSegment = new ParseSegment(conf);
    CrawlDb crawlDbTool = new CrawlDb(conf);
    LinkDb linkDbTool = new LinkDb(conf);

    // initialize the crawldb with the root URLs
    injector.inject(crawlDb, rootUrlDir);

    int i;
    for (i = 0; i < depth; i++) {
      // generate a new segment
      Path segment = generator.generate(crawlDb, segments, -1, topN,
          System.currentTimeMillis());
      if (segment == null) {
        System.out.println("Stopping at depth=" + i
            + " - no more URLs to fetch.");
        break;
      }

      // fetch it
      fetcher.fetch(segment, threads, true);

      // parse it, if needed
      if (!Fetcher2.isParsing(job)) {
        parseSegment.parse(segment);
      }

      // update the crawldb with the results of this segment
      crawlDbTool.update(crawlDb, new Path[] { segment }, true, true);
    }

    if (i > 0) {
      // invert links
      linkDbTool.invert(linkDb, segments, true, true, false);
    } else {
      LOG.warn("No URLs to fetch - check your seed list and URL filters.");
    }

    if (LOG.isInfoEnabled()) {
      LOG.info("crawl finished: " + dir);
    }
  }
}