/*
* SphinxWrapper.java
*
* Copyright (c) 1995-2012, The University of Sheffield. See the file
* COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June 1991 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
*
*/
package crawl;
import gate.Corpus;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.corpora.DocumentImpl;
import gate.creole.ResourceInstantiationException;
import gate.persist.PersistenceException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Date;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import javax.mail.internet.ContentType;
import javax.mail.internet.ParseException;
import org.apache.commons.lang.StringUtils;
import websphinx.Crawler;
import websphinx.Link;
import websphinx.Page;
public class SphinxWrapper extends Crawler{
private static final long serialVersionUID = -6524027714398026402L;
@SuppressWarnings("unused")
private static final String __SVNID = "$Id: SphinxWrapper.java 17662 2014-03-14 16:19:05Z markagreenwood $";
private Corpus corpus = null;
private static int maxFetch = -1;
private static int maxKeep = -1;
private static AtomicInteger countFetched, countKept;
private static boolean ignoreKeywords;
private static boolean caseSensitiveKeywords;
private static boolean convertXmlTypes;
private static List<String> keywords;
/**
 * Installs the keyword filter used when deciding whether to keep a document.
 *
 * @param newKeywords keywords to look for; {@code null} or an empty list
 *        switches keyword filtering off
 * @param caseSensitive whether keywords are matched case-sensitively
 */
protected void setKeywords(List<String> newKeywords, boolean caseSensitive) {
  keywords = newKeywords;
  if (newKeywords != null && !newKeywords.isEmpty()) {
    ignoreKeywords = false;
  }
  else {
    // nothing to match against: every document passes
    ignoreKeywords = true;
  }
  caseSensitiveKeywords = caseSensitive;
}
/**
 * Switches MIME-type conversion on or off; the flag is consulted by
 * makeDocument before the MIME type is handed to the document factory.
 *
 * @param convert {@code true} to pass MIME types through convertMimeType
 */
protected void setConvertXmlTypes(boolean convert) {
convertXmlTypes = convert;
}
/**
 * Processes one fetched page: stops the crawl if a fetch/keep limit has been
 * reached, otherwise turns the page into a GATE document and either adds it
 * to the corpus (keyword test passed or disabled) or drops it. A progress
 * line "kept / fetched [depth] Keep|Drop: url" is printed for every page.
 *
 * @param p the page handed over by the crawler
 */
@SuppressWarnings("unchecked")
public void visit(Page p) {
  // A limit of -1 means "unlimited"; otherwise stop as soon as it is reached.
  final boolean fetchLimitReached = (maxFetch != -1) && (countFetched.get() >= maxFetch);
  if (fetchLimitReached || ((maxKeep != -1) && (countKept.get() >= maxKeep))) {
    syncIfNecessary();
    super.stop();
    return;
  }

  final int fetchedSoFar = countFetched.incrementAndGet();
  final String pageUrl = p.toURL();
  final int pageDepth = p.getDepth();
  final Document candidate = makeDocument(p);
  // The page content is no longer needed once the document has been built.
  p.discardContent();

  // Shared middle part of both progress messages.
  final String progress = " / " + fetchedSoFar + " [" + pageDepth;

  /* For the keyword-matching, we tried p.toText() but it doesn't
   * parse JavaScript as well as GATE's HTML parser. */
  final boolean keep = (candidate != null)
      && (ignoreKeywords || containsAnyKeyword(candidate, keywords, caseSensitiveKeywords));

  if (!keep) {
    // Either no valid gate.Document was produced, or the keyword match failed.
    System.out.println(countKept.toString() + progress + "] Drop: " + pageUrl);
    if (candidate != null) {
      Factory.deleteResource(candidate);
    }
    return;
  }

  corpus.add(candidate);
  final int keptSoFar = countKept.incrementAndGet();
  if (corpus.getLRPersistenceId() != null) {
    // Corpus has a persistence id: unload the document and release the
    // in-memory resource.
    corpus.unloadDocument(candidate);
    Factory.deleteResource(candidate);
  }
  System.out.println(keptSoFar + progress + "] Keep: " + pageUrl);
}
/**
 * Decides whether a link should be followed; this implementation adds no
 * filtering of its own and returns whatever the superclass decides.
 *
 * @param l the candidate link
 * @return the result of {@code super.shouldVisit(l)}
 */
public boolean shouldVisit(Link l) {
return super.shouldVisit(l);
}
/**
 * Sets the crawl depth limit by forwarding it to {@code super.setMaxDepth}.
 *
 * @param depth the maximum depth passed on to the superclass
 */
protected void setDepth(int depth) {
super.setMaxDepth(depth);
}
/**
 * Sets the maximum number of pages to fetch. The value is stored in a static
 * field, so it applies to every instance of this class.
 *
 * @param max the fetch limit; -1 disables the limit (see visit)
 */
protected void setMaxPages(int max) {
maxFetch = max;
}
/**
 * Sets the maximum number of documents to keep. The value is stored in a
 * static field, so it applies to every instance of this class.
 *
 * @param max the keep limit; -1 disables the limit (see visit)
 */
protected void setMaxKeep(int max) {
maxKeep = max;
}
/**
 * Returns the maximum number of pages to fetch.
 *
 * @return the fetch limit, or -1 when no limit is set
 */
protected int getMaxPages() {
return maxFetch;
}
/**
 * Returns the maximum number of documents to keep.
 *
 * @return the keep limit, or -1 when no limit is set
 */
protected int getMaxKeep() {
return maxKeep;
}
/**
 * Registers a seed URL, given as a string, as a crawl root. A string that
 * cannot be parsed as a URL is reported on standard error and ignored.
 *
 * @param root the seed URL in string form
 */
protected void addStartLink(String root) {
  final URL seed;
  try {
    seed = new URL(root);
  }
  catch (MalformedURLException malformed) {
    System.err.println("Malformed url "+root);
    malformed.printStackTrace();
    return;
  }
  final Link seedLink = new Link(seed);
  System.out.println("Adding seed URL " + seed.toString());
  super.addRoot(seedLink);
}
/**
 * Registers a seed URL as a crawl root and logs it on standard output.
 *
 * @param url the seed URL
 */
protected void addStartLink(URL url) {
  final Link seedLink = new Link(url);
  final String notice = "Adding seed URL " + url.toString();
  System.out.println(notice);
  super.addRoot(seedLink);
}
/**
 * Sets the corpus that kept documents are added to.
 *
 * @param corpus the target corpus
 */
public void setCorpus(Corpus corpus) {
this.corpus = corpus;
}
/**
 * Maps XML-flavoured {@code application/...} MIME types onto {@code text/xml}
 * and leaves every other type untouched.
 *
 * <pre>
 * yes: application/rss+xml.xml
 * no:  image/svg+xml.xml
 * </pre>
 *
 * Matching ignores case because MIME type and subtype names are
 * case-insensitive (RFC 2045, section 5.1).
 *
 * NOTE(review): the original condition tested startsWith("application")
 * twice; a second prefix (perhaps "text") may have been intended - confirm
 * before widening the match.
 *
 * @param originalType the MIME type reported for the page; may be {@code null}
 * @return {@code "text/xml"} if the type starts with "application" and ends
 *         with "xml", otherwise {@code originalType} unchanged
 */
private static String convertMimeType(String originalType) {
  if (originalType == null) {
    return null;
  }
  final String prefix = "application";
  final String suffix = "xml";
  final boolean isApplicationType =
      originalType.regionMatches(true, 0, prefix, 0, prefix.length());
  // A negative offset (type shorter than the suffix) simply yields false.
  final boolean endsWithXml = originalType.regionMatches(
      true, originalType.length() - suffix.length(), suffix, 0, suffix.length());
  return (isApplicationType && endsWithXml) ? "text/xml" : originalType;
}
/**
 * Starts the crawl by invoking the superclass {@code run()} method directly.
 * NOTE(review): because run() is called on the caller's thread, this
 * presumably does not return until the crawl has finished - confirm against
 * websphinx.Crawler.
 */
public void start() {
super.run();
}
/**
 * Resets the fetched and kept page counters to zero by replacing both with
 * fresh counter objects.
 */
protected void resetCounter() {
  // AtomicInteger's no-argument constructor starts at 0.
  countFetched = new AtomicInteger();
  countKept = new AtomicInteger();
}
/**
 * Stops the crawl via {@code super.stop()} and then syncs the corpus if it
 * has a persistence id (see syncIfNecessary).
 */
protected void interrupt() {
super.stop();
syncIfNecessary();
}
/**
 * Syncs the corpus when it has a persistence id. Does nothing when no corpus
 * has been set yet or when the corpus has no persistence id. Failures while
 * syncing are reported on standard error and otherwise ignored, so stopping
 * the crawl is never prevented by a failed sync.
 */
private void syncIfNecessary() {
  // interrupt() may be called before setCorpus(); there is nothing to sync then.
  if (corpus == null) {
    return;
  }
  if (corpus.getLRPersistenceId() == null) {
    return;
  }
  try {
    corpus.sync();
  }
  catch(PersistenceException e) {
    e.printStackTrace();
  }
  catch(SecurityException e) {
    e.printStackTrace();
  }
}
/**
 * Tests whether the text content of a document contains at least one keyword.
 *
 * @param document the document whose content is searched
 * @param keywords the keywords to look for
 * @param caseSensitive whether matching is case-sensitive
 * @return the result of the string-based overload applied to the document text
 */
private static boolean containsAnyKeyword(Document document, List<String> keywords, boolean caseSensitive) {
  final String text = document.getContent().toString();
  return containsAnyKeyword(text, keywords, caseSensitive);
}
/**
 * Tests whether a text contains at least one of the given keywords.
 *
 * @param content the text to search
 * @param keywords the keywords to look for; {@code null} or empty means
 *        "no filter" and always yields {@code true}
 * @param caseSensitive whether matching is case-sensitive
 * @return {@code true} if there are no keywords or any keyword occurs in
 *         {@code content}; {@code false} otherwise
 */
private static boolean containsAnyKeyword(String content, List<String> keywords, boolean caseSensitive) {
  if (keywords == null || keywords.isEmpty()) {
    return true;
  }
  // There is at least one keyword: the first hit decides.
  for (String candidate : keywords) {
    final boolean hit = caseSensitive
        ? StringUtils.contains(content, candidate)
        : StringUtils.containsIgnoreCase(content, candidate);
    if (hit) {
      return true;
    }
  }
  return false;
}
/**
 * Builds a GATE document from a fetched page.
 *
 * <p>The document name is the shortened page URL (non-ASCII characters
 * replaced by underscores) followed by a {@code Gate.genSym()} suffix. When
 * the page reports a Content-Type, the page bytes are decoded to a string and
 * passed to the factory as string content, together with the MIME type
 * (optionally rewritten by convertMimeType) and the declared encoding.
 *
 * <p>The bytes are decoded with the charset named in the Content-Type. If the
 * header has no {@code charset} parameter, or names a malformed or unsupported
 * charset, UTF-8 is used instead and no encoding parameter is set. Previously
 * such pages made {@code Charset.forName} throw an unchecked exception that
 * escaped this method.
 *
 * <p>If the page reports no Content-Type, or the header cannot be parsed,
 * no content, MIME type or encoding parameters are set before the factory is
 * called.
 *
 * @param page the fetched page
 * @return the new document, or {@code null} if the factory threw a
 *         ResourceInstantiationException
 */
private static Document makeDocument(Page page) {
  String url = page.toURL();
  FeatureMap params = Factory.newFeatureMap();
  Document doc = null;
  String docName = shortenUrl(url).replaceAll("[^\\p{ASCII}]", "_") + "_" + Gate.genSym();

  /* Take advantage of the MIME type from the server when
   * constructing the GATE document. */
  String contentTypeStr = page.getContentType();
  String originalMimeType = null;

  if (contentTypeStr != null) {
    try {
      ContentType contentType = new ContentType(contentTypeStr);
      String mimeType = contentType.getBaseType();
      String encoding = contentType.getParameter("charset");

      // Resolve the declared charset without letting a missing or bogus
      // name abort the whole page; fall back to UTF-8 in that case.
      Charset declaredCharset = lookupCharset(encoding);
      Charset decodingCharset = declaredCharset;
      if (decodingCharset == null) {
        decodingCharset = Charset.forName("UTF-8");
        if (encoding != null) {
          System.err.println("WARNING: unusable charset " + encoding
              + " for " + url + ", decoding as UTF-8");
        }
      }

      // get the content as bytes, and convert it to string using the correct
      // encoding (thanks to Christian Wartena for patch)
      byte[] bContent = page.getContentBytes();
      String sContent = new String(bContent, decodingCharset);
      params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, sContent);

      if (mimeType != null) {
        if (convertXmlTypes) {
          originalMimeType = mimeType;
          mimeType = convertMimeType(mimeType);
          if (! originalMimeType.equals(mimeType)) {
            System.out.println("   convert " + originalMimeType + " -> " + mimeType);
          }
        }
        params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);
      }

      // Only advertise the encoding when it is one the JVM actually knows.
      if (declaredCharset != null) {
        params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
      }
    } catch(ParseException e) {
      e.printStackTrace();
    }
  }

  try {
    doc = (Document) Factory.createResource(
        DocumentImpl.class.getName(), params, null, docName);
    FeatureMap docFeatures = doc.getFeatures();

    Integer originalLength = page.getLength();
    docFeatures.put("originalLength", originalLength);

    /* Use the Last-Modified HTTP header if available. */
    long lastModified = page.getLastModified();
    Date date;
    if (lastModified > 0L) {
      date = new Date(lastModified);
    }
    else {
      // no usable Last-Modified value: fall back to the current time
      date = new Date();
    }
    docFeatures.put("Date", date);

    // originalMimeType is only non-null when MIME conversion is enabled.
    if (originalMimeType != null) {
      docFeatures.put("originalMimeType", originalMimeType);
    }

    doc.setSourceUrl(page.getURL());
    docFeatures.put("gate.SourceURL", url);
  }
  catch (ResourceInstantiationException e) {
    System.err.println("WARNING: could not intantiate document " + docName);
    e.printStackTrace();
  }

  return doc;
}

/**
 * Resolves a charset name declared in a Content-Type header.
 *
 * @param charsetName the declared name; may be {@code null}
 * @return the matching charset, or {@code null} when the name is absent,
 *         syntactically illegal, or not supported by this JVM
 */
private static Charset lookupCharset(String charsetName) {
  if (charsetName == null) {
    return null;
  }
  try {
    return Charset.forName(charsetName);
  }
  catch (IllegalArgumentException e) {
    // IllegalCharsetNameException and UnsupportedCharsetException are both
    // subclasses of IllegalArgumentException.
    return null;
  }
}
/**
 * Shortens a URL to its last two slash-separated segments, for use as a
 * document name. Runs of slashes are first collapsed to a single slash, and
 * the segment boundaries are then located in that normalised string. The
 * original located them in the raw URL, so an input such as
 * {@code .../a//b.html} produced {@code /b.html}.
 *
 * <p>If the normalised URL has fewer than two slashes, or its second-to-last
 * slash is the first character, the whole normalised URL is returned.
 *
 * @param url the URL to shorten; must not be {@code null}
 * @return the text after the second-to-last slash of the normalised URL, or
 *         the normalised URL itself when no such slash exists
 */
private static String shortenUrl(String url) {
  final String normalized = url.replaceAll("//+", "/");
  final int lastSlash = normalized.lastIndexOf('/');
  // A negative start index makes lastIndexOf return -1, so a URL without
  // any slash falls through to the "return everything" case.
  final int previousSlash = normalized.lastIndexOf('/', lastSlash - 1);
  if (previousSlash > 0) {
    return normalized.substring(previousSlash + 1);
  }
  return normalized;
}
}