GATE.ac.uk - gate/plugins/Web_Crawler_Websphinx/src/crawl/SphinxWrapper.java

/*
 *  SphinxWrapper.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *  
 */
package crawl;

import gate.Corpus;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.corpora.DocumentImpl;
import gate.creole.ResourceInstantiationException;
import gate.persist.PersistenceException;

import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Date;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

import javax.mail.internet.ContentType;
import javax.mail.internet.ParseException;

import org.apache.commons.lang.StringUtils;

import websphinx.Crawler;
import websphinx.Link;
import websphinx.Page;


public class SphinxWrapper extends Crawler{

  private static final long serialVersionUID = -6524027714398026402L;
  @SuppressWarnings("unused")
  private static final String __SVNID = "$Id: SphinxWrapper.java 17662 2014-03-14 16:19:05Z markagreenwood $";

  private Corpus corpus = null;
  private static int maxFetch = -1;
  private static int maxKeep  = -1;
  private static AtomicInteger countFetched, countKept;
  private static boolean ignoreKeywords;
  private static boolean caseSensitiveKeywords;
  private static boolean convertXmlTypes;
  private static List<String> keywords;

  
  protected void setKeywords(List<String> newKeywords, boolean caseSensitive) {
    keywords = newKeywords;
    ignoreKeywords = (keywords == null) || keywords.isEmpty();
    caseSensitiveKeywords = caseSensitive;
  }
  
  
  protected void setConvertXmlTypes(boolean convert) {
    convertXmlTypes = convert;
  }
  
  
  @SuppressWarnings("unchecked")
  public void visit(Page p) {
    if ( ( (maxFetch != -1) && (countFetched.get() >= maxFetch) ) ||
         ( (maxKeep != -1) && (countKept.get() >= maxKeep) ) )    {
      syncIfNecessary();
      super.stop();
      return;
    }

    int currentFetched = countFetched.incrementAndGet();
    String urlString = p.toURL();
    int depth = p.getDepth();
    Document doc = makeDocument(p);
    p.discardContent();
    
    /* For the keyword-matching, we tried p.toText() but it doesn't
     * parse JavaScript as well as GATE's HTML parser.       */

    if (doc == null)  {// failed to produce a valid gate.Document
      System.out.println(countKept.toString() + " / " + currentFetched + 
              " [" + depth + "] Drop: " + urlString);
    }

    else if (ignoreKeywords || containsAnyKeyword(doc, keywords, caseSensitiveKeywords)) {    
      // produced a valid gate.Document
      // keyword match succeeded
      corpus.add(doc);
      int currentCount = countKept.incrementAndGet();

      if (corpus.getLRPersistenceId() != null) {
        corpus.unloadDocument(doc);
        Factory.deleteResource(doc);
      }
      System.out.println(currentCount + " / " + currentFetched + 
              " [" + depth + "] Keep: " + urlString);
    }
    
    else {  // keyword match failed
      System.out.println(countKept.toString() + " / " + currentFetched + 
              " [" + depth + "] Drop: " + urlString);
      Factory.deleteResource(doc);
    }
  }
  

  public boolean shouldVisit(Link l) {
    return super.shouldVisit(l);
  }

  protected void setDepth(int depth) {
    super.setMaxDepth(depth);
  }

  protected void setMaxPages(int max) {
    maxFetch = max;
  }
  
  protected void setMaxKeep(int max) {
    maxKeep = max;
  }

  protected int getMaxPages() {
    return maxFetch;
  }
  
  protected int getMaxKeep() {
    return maxKeep;
  }


  protected void addStartLink(String root) {
    try {
      URL url = new URL(root);
      Link link = new Link(url);
      System.out.println("Adding seed URL  " + url.toString());
      super.addRoot(link);
    }
    catch (MalformedURLException me) {
      System.err.println("Malformed url "+root);
      me.printStackTrace();
    }
  }

  protected void addStartLink(URL url) {
    Link link = new Link(url);
    System.out.println("Adding seed URL  " + url.toString());
    super.addRoot(link);
  }
  

  public void setCorpus(Corpus corpus) {
    this.corpus = corpus;
  }


  /* yes: application/rss+xml.xml
   * no:  image/svg+xml.xml
   */
  private static String convertMimeType(String originalType) {
    String result = originalType;
    if (originalType.endsWith("xml")
            && (originalType.startsWith("application") || originalType.startsWith("application") )
    ) {
      result = "text/xml";
    }
    return result;
  }
  
  
  public void start() {
    super.run();
  }
  
  protected void resetCounter() {
    countFetched = new AtomicInteger(0);
    countKept    = new AtomicInteger(0);
  }
  
  protected void interrupt()  {
    super.stop();
    syncIfNecessary();
  }

  private void syncIfNecessary() {
    if (corpus.getLRPersistenceId() != null) {
      try {
        corpus.sync();
      }
      catch(PersistenceException e) {
        e.printStackTrace();
      }
      catch(SecurityException e) {
        e.printStackTrace();
      }
    }
  }

  
  private static boolean containsAnyKeyword(Document document, List<String> keywords, boolean caseSensitive) {
    return containsAnyKeyword(document.getContent().toString(), keywords, caseSensitive);
  }
  

  private static boolean containsAnyKeyword(String content, List<String> keywords, boolean caseSensitive) {
    if ( (keywords == null) || keywords.isEmpty()) {
      return true;
    }
    
    // implied else: test the keywords
    if (caseSensitive) {
      for (String kw : keywords) {
        if (StringUtils.contains(content, kw)) {
          return true;  
        }
      }
    }
    
    else { // case-insensitive
      for (String kw : keywords) {
        if (StringUtils.containsIgnoreCase(content, kw)) {
          return true;
        }
      }
    }
    
    return false;
  }


 
  private static Document makeDocument(Page page) {
    String url = page.toURL();
    FeatureMap params = Factory.newFeatureMap();
    
    Document doc = null;

    String docName = shortenUrl(url).replaceAll("[^\\p{ASCII}]", "_") + "_" + Gate.genSym();

    /* Take advantage of the MIME type from the server when
     * constructing the GATE document.      */
    String contentTypeStr = page.getContentType();
    String originalMimeType = null;

    if (contentTypeStr != null) {
      try {
        ContentType contentType = new ContentType(contentTypeStr);
        String mimeType = contentType.getBaseType();
        String encoding = contentType.getParameter("charset");
        
        // get the content as bytes, and convert it to string using the correct
        // encoding (thanks to Christian Wartena for patch)
        byte[] bContent = page.getContentBytes();
        String sContent = new String(bContent,Charset.forName(encoding));
        params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, sContent);

        if (mimeType != null) {
          if (convertXmlTypes) {
            originalMimeType = mimeType;
            mimeType = convertMimeType(mimeType);
            if (! originalMimeType.equals(mimeType)) {
              System.out.println("   convert " + originalMimeType + " -> " + mimeType);
            }
          }
          params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);
        }

        if (encoding != null) {
          params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);

        }
      } catch(ParseException e) {
        e.printStackTrace();
      }
    }


    try {
      doc = (Document) Factory.createResource(
              DocumentImpl.class.getName(), params, null, docName);
      FeatureMap docFeatures = doc.getFeatures();

      Integer originalLength = page.getLength();
      docFeatures.put("originalLength", originalLength);
      
      /* Use the Last-Modified HTTP header if available.  */
      long lastModified = page.getLastModified();
      Date date;
      if (lastModified > 0L) {
        date = new Date(lastModified);
      }
      else {
        date = new Date();
      }
      docFeatures.put("Date", date);
      
      if (originalMimeType != null) {
        docFeatures.put("originalMimeType", originalMimeType);
      }
      
      doc.setSourceUrl(page.getURL());
      docFeatures.put("gate.SourceURL", url);
    }
    catch (ResourceInstantiationException e) {
      System.err.println("WARNING: could not intantiate document " + docName);
      e.printStackTrace();
    }

    return doc;
  }

  
  private static String shortenUrl(String url) {
    String result = url.replaceAll("//+", "/");
    int s0 = StringUtils.lastIndexOf(url, '/');
    int s1 = StringUtils.lastIndexOf(url, '/', s0 -1 );
    if (s1 > 0) {
      result = url.substring(s1 + 1);
    }
    return result;
  }
  
  
}