/*
* SolrIndexer.java
* Copyright (c) 1998-2008, The University of Sheffield.
*
* This code is from the GATE project (http://gate.ac.uk/) and is free
 * software licensed under the GNU General Public License version 3. It is
* distributed without any warranty. For more details see COPYING.txt in the
* top level directory (or at http://gatewiki.sf.net/COPYING.txt).
*
* Niraj Aswani 13 March 2009
*
*/
package gate.solr;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.ArrayList;
import java.lang.reflect.*;
import java.io.*;
import java.net.*;
import org.apache.log4j.Logger;
import org.apache.nutch.crawl.Crawl;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
/**
 * This class is a thin layer over the Nutch crawler code that allows adding,
 * deleting and updating files in a Solr index.
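 *
 * A minimal usage sketch (the server URL and core id below are illustrative
 * values, not fixed ones):
 * <pre>
 *   SolrIndexer indexer = new SolrIndexer("http://localhost:8080/solr");
 *   List&lt;String&gt; urls =
 *       java.util.Arrays.asList("file:///home/gate/doc/index.html");
 *   boolean ok = indexer.index(urls, 1);
 * </pre>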
*/
public class SolrIndexer {
/** Logger. */
private static final Logger lgr = Logger.getLogger(SolrIndexer.class);
/** URL of the server running the solr index. */
String solrUrl;
/**
 * Constructor.
 *
 * @param solrUrl URL of the solr server, e.g. http://localhost:8080/solr
 */
public SolrIndexer(String solrUrl) {
this.solrUrl = solrUrl;
// log the locations of the nutch resources the crawler depends on;
// useful for diagnosing classpath problems
lgr.debug(getClass().getClassLoader()
.getResource("nutch-site.xml").toString());
lgr.debug(getClass().getClassLoader()
.getResource("plugins").toString());
lgr.debug(getClass().getClassLoader()
.getResource("urlfilter-default.txt").toString());
} // SolrIndexer(String)
/**
 * Removes the given files from the solr index. Note: if the list of URLs
 * is empty, ALL documents in the core are deleted.
 *
 * @param fileUrls
 * a list of URLs of the files to be deleted
 * @param coreId
 * core that holds the given files
 * @return true if the files are deleted successfully, false otherwise
 */
public boolean delete(List<String> fileUrls, int coreId) {
// build the delete query that will be sent to solr
StringBuilder query = new StringBuilder();
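// an empty list translates to the match-all query "*:*"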
if(fileUrls.isEmpty()) {
query.append("*:*");
} else {
for(int i=0;i<fileUrls.size();i++) {
if(i != 0) query.append(" OR ");
query.append("url:\"" + fileUrls.get(i) + "\"");
}
}
return deleteByQuery(query.toString(), coreId);
} // delete(List<String>, int)
/**
 * Removes documents that match the given query and commits the change.
 *
 * @param query
 * search query selecting the documents to be deleted
 * @param coreId
 * core that holds the documents
 * @return true if the documents are deleted successfully, false otherwise
 */
public boolean deleteByQuery(String query, int coreId) {
try {
// Send data
URL url = new URL(solrUrl + "/core" + coreId + "/update");
URLConnection conn = url.openConnection();
conn.setDoOutput(true);
conn.setRequestProperty("Content-type", "text/xml");
conn.setRequestProperty("charset", "utf-8");
OutputStreamWriter wr = new OutputStreamWriter(conn.getOutputStream());
System.out.println("******************" + query);
wr.write("<delete><query>"+query+"</query></delete>");
wr.flush();
// Get the response
BufferedReader rd =
new BufferedReader(new InputStreamReader(conn.getInputStream()));
String line;
while((line = rd.readLine()) != null) {
System.out.println(line);
}
wr.close();
rd.close();
// a URLConnection cannot be reused, so open a new connection for the
// commit command
conn = url.openConnection();
conn.setDoOutput(true);
conn.setRequestProperty("Content-type", "text/xml");
conn.setRequestProperty("charset", "utf-8");
wr = new OutputStreamWriter(conn.getOutputStream());
wr.write("<commit/>");
wr.flush();
// receive the output
rd =
new BufferedReader(new InputStreamReader(conn.getInputStream()));
while((line = rd.readLine()) != null) {
System.out.println(line);
}
wr.close();
rd.close();
}
catch(Exception e) {
e.printStackTrace();
return false;
}
// successful
return true;
} // deleteByQuery(String, int)
/**
 * Sends an optimize command to the given core, merging its index
 * segments.
 *
 * @param coreId
 * core to be optimized
 * @return true if the core is optimized successfully, false otherwise
 */
public boolean optimize(int coreId) {
try {
// Send data
URL url = new URL(solrUrl + "/core" + coreId + "/update");
URLConnection conn = url.openConnection();
conn.setDoOutput(true);
conn.setRequestProperty("Content-type", "text/xml");
conn.setRequestProperty("charset", "utf-8");
OutputStreamWriter wr = new OutputStreamWriter(conn.getOutputStream());
System.out.println("******************optimizing");
wr.write("<optimize/>");
wr.flush();
// Get the response
BufferedReader rd =
new BufferedReader(new InputStreamReader(conn.getInputStream()));
String line;
while((line = rd.readLine()) != null) {
System.out.println(line);
}
wr.close();
rd.close();
}
catch(Exception e) {
e.printStackTrace();
return false;
}
// successful
return true;
} // optimize(int)
/**
 * Removes the given files from the solr index and then re-adds them
 * using nutch.
 *
 * @param fileUrls
 * a list of URLs of the files to be updated
 * @param coreId
 * core that holds the given files
 * @return true if the files are updated successfully, false otherwise
 */
public boolean update(List<String> fileUrls, int coreId) {
// delete the files first
if(!delete(fileUrls, coreId)) { return false; }
// then add them again, propagating the result
return index(fileUrls, coreId);
} // update(List<String>, int)
/**
 * Crawls the given files with nutch and adds them to the solr index.
 *
 * @param fileUrls
 * a list of URLs of the files to be indexed
 * @param coreId
 * core in which the files should be indexed
 * @return true if the files are added successfully, false otherwise
 */
public boolean index(final List<String> fileUrls, final int coreId) {
System.out.println("Indexing started:");
// create a temporary folder to hold the crawl data
File indexDir = null;
File urls = null;
// by default the crawl depth is 1 and a single thread is used; both are
// raised below when a whole directory tree has to be indexed
String depth = "1";
String threads = "1";
try {
indexDir = new File(System.getProperty("java.io.tmpdir"),
"crawl-"+ getDate());
// create directory
indexDir.mkdirs();
// ask the JVM to delete this folder on exit (note: deleteOnExit only
// removes the directory if it is empty by then)
indexDir.deleteOnExit();
// create the seed file listing the URLs to crawl
urls = new File(indexDir, "urls");
BufferedWriter writer = new BufferedWriter(new FileWriter(urls));
// filter string to be added to crawl-urlfilter.txt
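// nutch's crawl-urlfilter.txt format: each line starts with '+'
// (include) or '-' (exclude) followed by a regular expression; the first
// matching rule decides whether a URL is crawled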
StringBuilder filteringRegexs = new StringBuilder();
for(String aFileUrl : fileUrls) {
writer.write(aFileUrl);
if(depth.equals("1") &&
new File(new URL(aFileUrl).toURI()).isDirectory()) {
if(!aFileUrl.endsWith("/")) {
aFileUrl += "/";
}
// strip the leading "file:///" (8 characters)
aFileUrl = aFileUrl.substring(8);
// index only html, text, doc, pdf and tex files under the directory
filteringRegexs.append("\n+").append(aFileUrl)
.append(".*\\.(html|htm|HTML|HTM|text|txt|doc|pdf|tex)$");
// also follow subdirectories (URLs ending in '/')
filteringRegexs.append("\n+").append(aFileUrl).append(".*/$");
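// crawling a whole directory tree, so raise the crawl depth and use
// more threads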
depth = "70";
threads = "30";
}
// and allow the requested URL itself
filteringRegexs.append("\n+").append(aFileUrl).append("$");
writer.newLine();
}
writer.close();
// copy the default URL filtering rules from urlfilter-default.txt
// into crawl-urlfilter.txt
File crawlFilter = new File(indexDir, "crawl-urlfilter.txt");
writer = new BufferedWriter(new FileWriter(crawlFilter));
BufferedReader br = new BufferedReader(
new InputStreamReader(getClass().getClassLoader()
.getResourceAsStream("urlfilter-default.txt")));
String line = br.readLine();
while(line != null) {
writer.write(line);
writer.newLine();
line = br.readLine();
}
br.close();
// append the filtering rules built above for the requested URLs
writer.write(filteringRegexs.toString());
writer.newLine();
// exclude everything else
writer.write("-.");
writer.close();
// add the indexDir on classpath
addToClassPath(indexDir);
}
catch(IOException e1) {
e1.printStackTrace();
return false;
}
catch(URISyntaxException e1) {
e1.printStackTrace();
return false;
}
// nutch parameters
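// these mirror the arguments of the nutch crawl command line:
// <urlDir> -dir <crawlDir> -depth <depth> -threads <threads>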
String[] nutchParams =
new String[]{urls.getAbsolutePath(), "-dir", indexDir.getAbsolutePath(),
"-depth", depth, "-threads", threads};
try {
// crawl and index; Crawl is nutch's command-line crawler entry point
Crawl.main(nutchParams);
// find out number of segments
File[] segmentDirectories = new File(indexDir, "segments").listFiles();
String[] solrParams = new String[segmentDirectories.length + 3];
// solr parameters
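// the nutch SolrIndexer tool expects:
// <solr url> <crawldb> <linkdb> <segment> ...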
solrParams[0] = solrUrl + "/core" + coreId;
solrParams[1] = new File(indexDir, "crawldb").getAbsolutePath();
solrParams[2] = new File(indexDir, "linkdb").getAbsolutePath();
for(int i = 3; i < solrParams.length; i++) {
solrParams[i] = segmentDirectories[i - 3].getAbsolutePath();
}
// add it to solr
ToolRunner.run(NutchConfiguration.create(),
new org.apache.nutch.indexer.solr.SolrIndexer(), solrParams);
// remove the crawled directory-listing pages from the index, then
// compact it
if(fileUrls != null && fileUrls.size() > 0) {
String query =
new File(new URL(fileUrls.get(0)).toURI()).getAbsolutePath();
deleteByQuery(query, coreId);
optimize(coreId);
}
}
catch(Exception e) {
e.printStackTrace();
return false;
}
// return normally
return true;
} // index(List<String>, int)
/**
 * Modifies the classpath at runtime by invoking the protected
 * URLClassLoader.addURL method via reflection; this assumes the system
 * class loader is a URLClassLoader.
 */
private void addToClassPath(File file) throws IOException {
URLClassLoader sysloader =
(URLClassLoader)ClassLoader.getSystemClassLoader();
Class<URLClassLoader> sysclass = URLClassLoader.class;
try {
Method method = sysclass.getDeclaredMethod("addURL", URL.class);
method.setAccessible(true);
// File.toURL() is deprecated; convert via the URI instead
method.invoke(sysloader, file.toURI().toURL());
} catch (Throwable t) {
t.printStackTrace();
throw new IOException("Error, could not add URL to system classloader");
}//end try catch
}
/**
 * Returns the current date and time as a compact timestamp, used to name
 * the temporary crawl directory.
 *
 * @return the current time formatted as yyyyMMddHHmmss
 */
private static String getDate() {
return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
}
// command-line entry point for testing
public static void main(String[] args) {
if(args.length < 3) {
System.out.println("usage: SolrIndexer <index|delete|update|delByQuery>" +
" <solrUrl> <coreId> <fileUrl/query> [<fileUrl2> .... <fileUrlN>]");
System.exit(1);
}
// solr url
// e.g. http://localhost:8080/solr
SolrIndexer solr = new SolrIndexer(args[1]);
// whether the command succeeded
boolean success = false;
// params
int coreId = Integer.parseInt(args[2]);
List<String> urls = new ArrayList<String>();
for(int i=3;i<args.length;i++) {
urls.add(args[i]);
}
if(args[0].equals("index")) {
// toIndexUrl = e.g. "file:///home/gate/work/gate-top/topdoc/index.html"
// coreId = e.g. 1, 2 etc.
if(urls.isEmpty()) {
success = false;
System.out.println("************Atleast one file url must be provided");
} else {
success = solr.index(urls, coreId);
}
} else if(args[0].equals("delete")) {
// toIndexUrl = e.g. "file:///home/gate/work/gate-top/topdoc/index.html"
// coreId = e.g. 1, 2 etc.
success = solr.delete(urls, coreId);
} else if(args[0].equals("update")) {
// toIndexUrl = e.g. "file:///home/gate/work/gate-top/topdoc/index.html"
// coreId = e.g. 1, 2 etc.
if(urls.isEmpty()) {
success = false;
System.out.println("Atleast one file url must be provided");
} else {
success = solr.update(urls, coreId);
}
} else if(args[0].equals("delByQuery")) {
success = solr.deleteByQuery(urls.get(0), coreId);
}
System.out.println("success : "+success);
System.exit(success ? 0 : 1);
}
} // SolrIndexer