Log in Help
Print
Homereleasesgate-5.2.1-build3581-ALLpluginsWeb_Crawler_Websphinxsrccrawl 〉 SphinxWrapper.java
 
package crawl;

import websphinx.*;

import gate.creole.*;
import gate.persist.PersistenceException;
import gate.security.SecurityException;
import gate.util.*;
import gate.corpora.*;
import gate.*;

import java.net.*;
import java.util.List;


public class SphinxWrapper extends Crawler{

  private static final long serialVersionUID = -4720932687929012109L;
  
  private Corpus corpus = null;
  private static int maxPages = -1;
  private static int count = 0;

  
  protected void resetCounter() {
    count = 0;
  }
  
  
  @SuppressWarnings("unchecked")
  public void visit(Page p) {
    if ((maxPages != -1) && (count >= maxPages)) {
      count = 0;
      super.stop();
      return;
    }

    String docName = p.toURL() + "_" + Gate.genSym();
    FeatureMap params = Factory.newFeatureMap();
    params.put(Document.DOCUMENT_URL_PARAMETER_NAME, p.toURL());
    try {
      Document doc = (Document) Factory.createResource(
              DocumentImpl.class.getName(), params, null, docName
      );
      corpus.add(doc);
      if (corpus.getLRPersistenceId() != null) {
        corpus.unloadDocument(doc);
        Factory.deleteResource(doc);
      }

      System.out.println(count+" ["+p.getDepth()+"] "+p.toURL());
      count++;
    }
    catch (ResourceInstantiationException e) {
      String nl = Strings.getNl();
      Err.prln(
              "WARNING: could not intantiate document" + nl +
              "  Document name was: " + docName + nl +
              "  Exception was: " + e + nl + nl
      );
    }
  }

  public boolean shouldVisit(Link l) {
    return super.shouldVisit(l);
  }

  public void setDepth(int depth) {
    super.setMaxDepth(depth);
  }

  public void setMaxPages(int max) {
    maxPages = max;
  }

  public int getMaxPages() {
    return maxPages;
  }

  protected void interrupt()  {
    super.stop();
    if (corpus.getLRPersistenceId() != null) {
      try {
        corpus.sync();
      }
      catch(PersistenceException e) {
        e.printStackTrace();
      }
      catch(SecurityException e) {
        e.printStackTrace();
      }
    }
  }

  
  public void setStart(String root) {
    try {
      URL url = new URL(root);
      Link link = new Link(url);
      super.setRoot(link);
    }
    catch (MalformedURLException me) {
      System.err.println("Malformed url "+root);
      me.printStackTrace();
    }
  }

  public void setStart(URL root) {
    Link link = new Link(root);
    super.setRoot(link);
  }


  public void setStarts(List<URL> roots) {
    int n = roots.size();
    Link[] links = new Link[roots.size()];
    for (int i=0; i < n ; i++) {
      links[i] = new Link(roots.get(i));
    }
    super.setRoots(links);
  }

  public void setCorpus(Corpus corpus) {
    this.corpus = corpus;
  }

  public void start() {
    super.run();
  }

}