/*
* CrawlPR.java
*
* Copyright (c) 1998-2004, The University of Sheffield.
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June 1991 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
*
* Google API and other sources subject to Google License. Please
* see http://www.google.com/apis/
*/
package crawl;
import gate.ProcessingResource;
import gate.Resource;
import gate.creole.*;
import gate.gui.MainFrame;
import gate.corpora.*;
import gate.util.*;
import gate.*;
import websphinx.*;
/**
 * A processing resource that crawls the web (via the WebSPHINX library)
 * starting either from a single root URL or from the source URLs of the
 * documents in a seed corpus, adding the fetched pages to an output corpus.
 */
public class CrawlPR extends AbstractLanguageAnalyser implements
ProcessingResource {
  /** Seed URL to start crawling from; alternative to {@link #source}. */
  private String root = null;
  /** Maximum crawl depth; -1 means "not set". */
  private int depth = -1;
  /** Corpus that receives the crawled documents. */
  private Corpus outputCorpus = null;
  /** Depth-first (TRUE) vs. breadth-first (FALSE) traversal; null means "not set". */
  private Boolean dfs = null;
  /** Wrapper around the WebSPHINX crawler; created afresh on each execute(). */
  private SphinxWrapper crawler;
  /** Crawl scope: "SUBTREE", "SERVER", or anything else for the whole web. */
  private String domain = null;
  /** Corpus whose documents' source URLs seed the crawl; alternative to {@link #root}. */
  private Corpus source = null;
  /** Maximum number of pages to fetch; -1 means "no limit". */
  private int max = -1;

  /** Constructor of the class */
  public CrawlPR() {
  }

  /** Initialise this resource, and return it. */
  public Resource init() throws ResourceInstantiationException {
    return super.init();
  }

  /**
   * Reinitialises the processing resource. After calling this method the
   * resource should be in the state it is after calling init. If the resource
   * depends on external resources (such as rules files) then the resource will
   * re-read those resources. If the data used to create the resource has
   * changed since the resource has been created then the resource will change
   * too after calling reInit().
   */
  public void reInit() throws ResourceInstantiationException {
    init();
  }

  /**
   * Runs the crawler. It assumes that all the needed parameters are set;
   * if they are not, an {@link ExecutionException} is thrown. Any failure
   * during the crawl itself is also reported as an ExecutionException
   * (with the original exception as its cause) rather than being silently
   * swallowed, so callers can detect that the crawl did not complete.
   *
   * @throws ExecutionException if a required parameter is missing or the
   *           crawl fails.
   */
  public void execute() throws ExecutionException {
    crawler = new SphinxWrapper();
    // Validate required runtime parameters up front.
    if(outputCorpus == null) { throw new ExecutionException(
        "Output Corpus cannot be null"); }
    if(root == null && source == null) { throw new ExecutionException(
        "Either root or source must be initialized"); }
    if(depth == -1) { throw new ExecutionException("Limit is not initialized"); }
    if(dfs == null) { throw new ExecutionException("dfs is not initialized"); }
    if(domain == null) { throw new ExecutionException(
        "domain type is not initialized.. Set to either SERVER/SUBTREE/WEB"); }
    try {
      crawler.setCorpus(outputCorpus);
      crawler.setDepth(depth);
      crawler.setDepthFirst(dfs.booleanValue());
      // Compare string CONTENT with equals(), not ==; the original
      // reference comparison only matched interned literals, silently
      // falling back to a whole-web crawl for equal but distinct strings.
      if("SUBTREE".equals(domain)) {
        crawler.setDomain(Crawler.SUBTREE);
      }
      else if("SERVER".equals(domain)) {
        crawler.setDomain(Crawler.SERVER);
      }
      else {
        // Any other (non-null) value means an unrestricted web crawl.
        crawler.setDomain(Crawler.WEB);
      }
      if(max != -1) {
        crawler.setMaxPages(max);
      }
      if(root != null && root.length() > 0) {
        // A single explicit root URL takes precedence over the seed corpus.
        crawler.setStart(root);
      }
      else {
        // Seed the crawl from the source URL of every document in the
        // source corpus. Program to the Corpus/Document interfaces rather
        // than the *Impl classes so any implementation works.
        Object rootArray[] = source.toArray();
        for(int i = 0; i < rootArray.length; i++) {
          Document doc = (Document)rootArray[i];
          System.out.println("adding ... " + doc.getSourceUrl().toString()
              + "\n");
          crawler.setStart(doc.getSourceUrl());
        }
      }
      crawler.start();
    }
    catch(Exception e) {
      // Propagate the failure (preserving the cause) instead of just
      // logging it: execute() must not report success when the crawl failed.
      throw new ExecutionException(e);
    }
  }

  /** Sets the root URL the crawl starts from. */
  public void setRoot(String root) {
    this.root = root;
  }

  /** Returns the root URL the crawl starts from (may be null). */
  public String getRoot() {
    return this.root;
  }

  /** Sets the maximum crawl depth. */
  public void setDepth(Integer limit) {
    this.depth = limit.intValue();
  }

  /** Returns the maximum crawl depth. */
  public Integer getDepth() {
    return new Integer(this.depth);
  }

  /** Sets depth-first (TRUE) vs. breadth-first (FALSE) traversal. */
  public void setDfs(Boolean dfs) {
    this.dfs = dfs;
  }

  /** Returns the depth-first flag. */
  public Boolean getDfs() {
    return this.dfs;
  }

  /** Sets the crawl scope: "SUBTREE", "SERVER", or anything else for WEB. */
  public void setDomain(String domain) {
    this.domain = domain;
  }

  /** Returns the crawl scope. */
  public String getDomain() {
    return this.domain;
  }

  /** Sets the seed corpus whose document URLs start the crawl. */
  public void setSource(Corpus source) {
    this.source = source;
  }

  /** Returns the seed corpus (may be null). */
  public Corpus getSource() {
    return this.source;
  }

  /** Sets the maximum number of pages to fetch. */
  public void setMax(Integer max) {
    this.max = max.intValue();
  }

  /** Returns the maximum number of pages to fetch (-1 = unlimited). */
  public Integer getMax() {
    return new Integer(this.max);
  }

  /** Returns the corpus that receives crawled documents. */
  public Corpus getOutputCorpus() {
    return outputCorpus;
  }

  /** Sets the corpus that receives crawled documents. */
  public void setOutputCorpus(Corpus outputCorpus) {
    this.outputCorpus = outputCorpus;
  }
}