// GATE.ac.uk - gatewiki/nutch-solr/src/java/gate/solr/SolrIndexer.java

/*
 *  SolrIndexer.java
 *  Copyright (c) 1998-2008, The University of Sheffield.
 *
 *  This code is from the GATE project (http://gate.ac.uk/) and is free
 *  software licenced under the GNU General Public License version 3. It is
 *  distributed without any warranty. For more details see COPYING.txt in the
 *  top level directory (or at http://gatewiki.sf.net/COPYING.txt).
 *
 *  Niraj Aswani 13 March 2009
 *
 */
package gate.solr;

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.ArrayList;
import java.lang.reflect.*;
import java.io.*;
import java.net.*;

import org.apache.log4j.Logger;
import org.apache.nutch.crawl.Crawl;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
/**
 * This class is a thin layer over the nutch code that allows adding, deleting
 * and updating files in solr.
 */
public class SolrIndexer {

  /** Log4j logger for this class. */
  static Logger lgr = Logger.getLogger(SolrIndexer.class);

  /**
   * Base URL of the solr server, e.g. http://localhost:8080/solr. The
   * per-core endpoints are built from it by appending "/core" + coreId.
   */
  String solrUrl;

  /** Construction. */
  public SolrIndexer(String solrUrl) {
    this.solrUrl = solrUrl;
    System.out.println(getClass().getClassLoader()
      .getResource("nutch-site.xml").toString());
    System.out.println(getClass().getClassLoader()
      .getResource("plugins").toString());
    System.out.println(getClass().getClassLoader()
      .getResource("urlfilter-default.txt").toString());
  } // SolrUpdater()

  /**
   * Removes the given files from the solr index.
   * <p>
   * The files are matched on the "url" field, one quoted phrase per URL,
   * combined with OR. An empty list produces the query "*:*", i.e. every
   * document in the core is deleted.
   * 
   * @param fileUrls
   *          A list of URLs of the files to be deleted
   * @param coreId
   *          core that holds the given files
   * @return true if the delete request succeeded, false otherwise
   */
  public boolean delete(List<String> fileUrls, int coreId) {

    // build the query that selects the documents to delete
    StringBuilder query = new StringBuilder();
    if(fileUrls.isEmpty()) {
      query.append("*:*");
    } else {
      for(int i = 0; i < fileUrls.size(); i++) {
        if(i != 0) query.append(" OR ");
        query.append("url:\"")
             .append(escapeQueryPhrase(String.valueOf(fileUrls.get(i))))
             .append('"');
      }
    }

    return deleteByQuery(query.toString(), coreId);
  } // delete(List<String>, int)

  /**
   * Escapes backslashes and double quotes so that the text can be placed
   * inside a quoted phrase of a query without terminating the phrase early.
   */
  private static String escapeQueryPhrase(String text) {
    // backslashes first, otherwise the escapes added for quotes get doubled
    return text.replace("\\", "\\\\").replace("\"", "\\\"");
  }


  /**
   * Removes the documents that match the given query and commits the change.
   * <p>
   * Two requests are posted to the core's "/update" handler: a
   * &lt;delete&gt;&lt;query&gt; message followed by &lt;commit/&gt;. The
   * query text is XML-escaped before it is embedded in the message, and the
   * request body is sent as UTF-8.
   * 
   * @param query
   *          search query selecting the documents to delete; a null query
   *          is rejected
   * @param coreId
   *          core that holds the documents
   * @return true if both requests completed without an exception, false
   *         otherwise
   */
  public boolean deleteByQuery(String query, int coreId) {

    if(query == null) {
      System.out.println("******************no query given");
      return false;
    }

    try {
      URL url = new URL(solrUrl + "/core" + coreId + "/update");
      System.out.println("******************" + query);

      // delete first, then make the deletion visible with a commit
      postUpdateMessage(url,
        "<delete><query>" + escapeXmlText(query) + "</query></delete>");
      postUpdateMessage(url, "<commit/>");
    }
    catch(Exception e) {
      e.printStackTrace();
      return false;
    }

    // successful
    return true;
  } // deleteByQuery(String, int)

  /**
   * Posts one XML update message to the given URL and echoes the response
   * to standard output. Both streams are closed even if the request fails.
   */
  private void postUpdateMessage(URL url, String xml) throws IOException {
    URLConnection conn = url.openConnection();
    conn.setDoOutput(true);
    // the charset belongs in the Content-type header, not a header of its own
    conn.setRequestProperty("Content-type", "text/xml; charset=utf-8");

    Writer wr = new OutputStreamWriter(conn.getOutputStream(), "UTF-8");
    try {
      wr.write(xml);
      wr.flush();

      BufferedReader rd = new BufferedReader(
        new InputStreamReader(conn.getInputStream(), "UTF-8"));
      try {
        String line;
        while((line = rd.readLine()) != null) {
          System.out.println(line);
        }
      }
      finally {
        rd.close();
      }
    }
    finally {
      wr.close();
    }
  }

  /** Replaces the characters that are special in XML text with entities. */
  private static String escapeXmlText(String text) {
    StringBuilder escaped = new StringBuilder(text.length() + 16);
    for(int i = 0; i < text.length(); i++) {
      char c = text.charAt(i);
      switch(c) {
        case '&':
          escaped.append("&amp;");
          break;
        case '<':
          escaped.append("&lt;");
          break;
        case '>':
          escaped.append("&gt;");
          break;
        default:
          escaped.append(c);
      }
    }
    return escaped.toString();
  }


  /**
   * Asks solr to optimize the index of the given core by posting an
   * &lt;optimize/&gt; message to the core's "/update" handler. The response
   * is echoed to standard output.
   * 
   * @param coreId
   *          core whose index should be optimized
   * @return true if the request completed without an exception, false
   *         otherwise
   */
  public boolean optimize(int coreId) {

    try {

      // Send data
      URL url = new URL(solrUrl + "/core" + coreId + "/update");
      URLConnection conn = url.openConnection();
      conn.setDoOutput(true);
      // the charset belongs in the Content-type header, not a header of its own
      conn.setRequestProperty("Content-type", "text/xml; charset=utf-8");

      OutputStreamWriter wr =
        new OutputStreamWriter(conn.getOutputStream(), "UTF-8");
      try {
        System.out.println("******************optimizing");
        wr.write("<optimize/>");
        wr.flush();

        // Get the response
        BufferedReader rd = new BufferedReader(
          new InputStreamReader(conn.getInputStream(), "UTF-8"));
        try {
          String line;
          while((line = rd.readLine()) != null) {
            System.out.println(line);
          }
        }
        finally {
          rd.close();
        }
      }
      finally {
        // closed on every path so a failed request does not leak the stream
        wr.close();
      }
    }
    catch(Exception e) {
      e.printStackTrace();
      return false;
    }

    // successful
    return true;
  } // optimize(int)

  /**
   * Removes the files from the solr index and then adds them again using
   * nutch.
   * 
   * @param fileUrls
   *          A list of URLs of the files to be updated; must contain at
   *          least one URL
   * @param coreId
   *          core that holds the given files
   * @return true only if both the deletion and the re-indexing succeeded,
   *         false otherwise (including when no URL is given)
   */
  public boolean update(List<String> fileUrls, int coreId) {
    // delete() turns an empty list into the query "*:*", which would wipe
    // the whole core, so refuse to update when there is nothing to update
    if(fileUrls == null || fileUrls.isEmpty()) { return false; }

    // delete it first
    if(!delete(fileUrls, coreId)) { return false; }

    // add it then; a failed indexing run must be reported to the caller
    return index(fileUrls, coreId);

  } // update(List<String>, int)

  /**
   * Crawls the given files with nutch and adds the result to a solr core.
   * <p>
   * Steps:
   * <ol>
   * <li>a directory "crawl-&lt;timestamp&gt;" is created under
   * java.io.tmpdir, holding the seed list ("urls") and a generated
   * "crawl-urlfilter.txt" (the contents of "urlfilter-default.txt" from the
   * classpath, followed by include rules for the given URLs and a final
   * exclude-everything rule);</li>
   * <li>that directory is added to the system classpath (presumably so that
   * nutch picks up the generated filter file - confirm against the nutch
   * configuration);</li>
   * <li>NutchCrawler is run, and the resulting crawldb, linkdb and segments
   * are handed to nutch's SolrIndexer tool;</li>
   * <li>finally the absolute path of the first URL is sent as a delete query
   * (to remove directory listings) and the core is optimized.</li>
   * </ol>
   * 
   * @param fileUrls
   *          a list of URLs of the files to be indexed
   * @param coreId
   *          core in which the files should be indexed
   * @return true if crawling and indexing completed, false if fileUrls is
   *         null, a URL is unusable, the crawl directory could not be
   *         prepared, or nutch reported a failure
   */
  public boolean index(final List<String> fileUrls, final int coreId) {
    System.out.println("Indexing started:");

    if(fileUrls == null) {
      return false;
    }

    // lets create a temporary folder
    File indexDir = null;
    File urls = null;

    // by default depth is set to 1
    // use one thread only - multiple threads useful while indexing a wiki area
    String depth = "1";
    String threads = "1";

    try {
      indexDir = new File(System.getProperty("java.io.tmpdir"),
                           "crawl-"+ getDate());

      // create directory
      indexDir.mkdirs();

      // delete this folder on exit
      // NOTE(review): deleteOnExit cannot remove a directory that still has
      // content, so the crawl data most likely stays behind - confirm
      indexDir.deleteOnExit();

      // create a file with root url that lists fileUrl in it
      urls = new File(indexDir, "urls");

      // filter string to be added to crawl-urlfilter.txt
      StringBuilder filteringRegexs = new StringBuilder();

      BufferedWriter writer = new BufferedWriter(new FileWriter(urls));
      try {
        for(String aFileUrl : fileUrls) {
          writer.write(aFileUrl);

          // NOTE(review): because of the depth check only the first
          // directory in the list receives the recursive rules below -
          // confirm whether that is intended
          if(depth.equals("1") && 
                 new File(new URL(aFileUrl).toURI()).isDirectory()) {
            if(!aFileUrl.endsWith("/")) {
              aFileUrl += "/";
            }

            // drop the first 8 characters, i.e. a leading "file:///"
            aFileUrl = aFileUrl.substring(8);

            // asking it to index only html,text,doc and pdf files
            filteringRegexs.append("\n+").append(aFileUrl)
                    .append(".*\\.(html|htm|HTML|HTM|text|txt|doc|pdf|tex)$");
            // asking it to consider directories
            filteringRegexs.append("\n+").append(aFileUrl).append(".*/$");

            depth = "70";
            threads = "30";
          }

          // and the file itself that user has asked for indexing
          filteringRegexs.append("\n+").append(aFileUrl).append("$");
          writer.newLine();
        }
      }
      finally {
        writer.close();
      }

      // need to copy default urlfilterings from urlfilter-default.txt
      // to crawl-urlfilter.txt
      InputStream defaults = getClass().getClassLoader()
            .getResourceAsStream("urlfilter-default.txt");
      if(defaults == null) {
        throw new FileNotFoundException(
          "urlfilter-default.txt not found on the classpath");
      }

      File crawlFilter = new File(indexDir, "crawl-urlfilter.txt");
      BufferedReader br = new BufferedReader(new InputStreamReader(defaults));
      try {
        writer = new BufferedWriter(new FileWriter(crawlFilter));
        try {
          String line = br.readLine();
          while(line != null) {
            writer.write(line);
            writer.newLine();
            line = br.readLine();
          }

          // add rules for top-level directory
          writer.write(filteringRegexs.toString());
          writer.newLine();

          // exclude everything else
          writer.write("-.");
        }
        finally {
          writer.close();
        }
      }
      finally {
        br.close();
      }

      // add the indexDir on classpath
      addToClassPath(indexDir);
    }
    catch(IOException e1) {
      e1.printStackTrace();
      return false;
    }
    catch(URISyntaxException e1) {
      e1.printStackTrace();
      return false;
    }
    catch(IllegalArgumentException e1) {
      // new File(URI) rejects URIs that do not denote a local file
      e1.printStackTrace();
      return false;
    }


    // nutch parameters
    String[] nutchParams =
      new String[]{urls.getAbsolutePath(), "-dir", indexDir.getAbsolutePath(),
        "-depth", depth, "-threads", threads};

    try {
      // crawl and index
      NutchCrawler.main(nutchParams);

      // find out number of segments
      File[] segmentDirectories = new File(indexDir, "segments").listFiles();
      if(segmentDirectories == null) {
        System.out.println("no segments directory was produced in "
          + indexDir.getAbsolutePath());
        return false;
      }
      String[] solrParams = new String[segmentDirectories.length + 3];

      // solr parameters
      solrParams[0] = solrUrl + "/core" + coreId;
      solrParams[1] = new File(indexDir, "crawldb").getAbsolutePath();
      solrParams[2] = new File(indexDir, "linkdb").getAbsolutePath();

      for(int i = 3; i < solrParams.length; i++) {
        solrParams[i] = segmentDirectories[i - 3].getAbsolutePath();
      }

      // add it to solr; a non-zero exit code means the tool failed
      int exitCode = ToolRunner.run(NutchConfiguration.create(),
        new org.apache.nutch.indexer.solr.SolrIndexer(), solrParams);
      if(exitCode != 0) {
        System.out.println("solr indexing failed with exit code " + exitCode);
        return false;
      }

      // delete directory listings
      if(!fileUrls.isEmpty()) {
        String query = 
              new File(new URL(fileUrls.get(0)).toURI()).getAbsolutePath();
        deleteByQuery(query, coreId);
        optimize(coreId);
      }
    }
    catch(Exception e) {
      e.printStackTrace();
      return false;
    }

    // return normally
    return true;
  } // index(List<String>, int)

  /**
   * Adds the given file or directory to the classpath at runtime by
   * reflectively invoking the protected addURL method of the system
   * classloader.
   *
   * @param file
   *          file or directory to add
   * @throws IOException
   *           if the system classloader is not a URLClassLoader or the
   *           reflective call fails; the underlying problem is attached as
   *           the cause
   */
  private void addToClassPath(File file) throws IOException {
    try {
      ClassLoader sysloader = ClassLoader.getSystemClassLoader();
      if(!(sysloader instanceof URLClassLoader)) {
        throw new ClassCastException("system classloader is a "
          + sysloader.getClass().getName() + ", not a URLClassLoader");
      }

      Method method =
             URLClassLoader.class.getDeclaredMethod("addURL", URL.class);
      method.setAccessible(true);
      // toURI().toURL() escapes characters that are illegal in URLs,
      // which the deprecated File.toURL() does not
      method.invoke(sysloader, file.toURI().toURL());
    } catch (Exception e) {
      e.printStackTrace();
      IOException failure =
        new IOException("Error, could not add URL to system classloader");
      failure.initCause(e);
      throw failure;
    }//end try catch
  }
  
  /**
   * Formats the current time as a compact timestamp.
   * 
   * @return the current date and time rendered with the pattern
   *         yyyyMMddHHmmss
   */
  private static String getDate() {
    SimpleDateFormat stampFormat = new SimpleDateFormat("yyyyMMddHHmmss");
    Date now = new Date();
    return stampFormat.format(now);
  }

  /**
   * Command line entry point, mainly for testing.
   * <p>
   * Arguments: &lt;index|delete|update|delByQuery&gt; &lt;solrUrl&gt;
   * &lt;coreId&gt; followed by one or more file URLs (or, for delByQuery,
   * the query). The JVM exits with status 0 if the command succeeded and 1
   * otherwise, including on malformed arguments.
   */
  public static void main(String[] args) {

    final String usage =
      "usage: SolrIndexer <index|delete|update|delByQuery>" +
            " <solrUrl> <coreId> <fileUrl/query> [<fileUrl2> .... <fileUrlN>]";

    if(args.length < 3) {
      System.out.println(usage);
      System.exit(1);
    }

    // coreId = e.g. 1, 2 etc.
    int coreId;
    try {
      coreId = Integer.parseInt(args[2]);
    }
    catch(NumberFormatException e) {
      System.out.println("coreId must be an integer but was: " + args[2]);
      System.out.println(usage);
      System.exit(1);
      return; // unreachable, keeps coreId definitely assigned
    }

    // solr url
    // e.g. http://localhost:8080/solr
    SolrIndexer solr = new SolrIndexer(args[1]);

    // if command succeded 
    boolean success = false;

    // remaining arguments: file URLs,
    // e.g. "file:///home/gate/work/gate-top/topdoc/index.html", or the query
    List<String> urls = new ArrayList<String>();
    for(int i = 3; i < args.length; i++) {
      urls.add(args[i]);
    }

    String command = args[0];
    if(command.equals("index")) {
      if(urls.isEmpty()) {
        System.out.println("************Atleast one file url must be provided");
      } else {
        success = solr.index(urls, coreId);
      }
    } else if(command.equals("delete")) {
      // no URLs means: delete everything in the core
      success = solr.delete(urls, coreId);
    } else if(command.equals("update")) {
      if(urls.isEmpty()) {
        System.out.println("Atleast one file url must be provided");
      } else {
        success = solr.update(urls, coreId);
      }
    } else if(command.equals("delByQuery")) {
      if(urls.isEmpty()) {
        System.out.println("A query must be provided");
      } else {
        success = solr.deleteByQuery(urls.get(0), coreId);
      }
    } else {
      System.out.println("unknown command: " + command);
      System.out.println(usage);
    }

    System.out.println("success : "+success);

    System.exit(success ? 0 : 1);
  }
} // SolrIndexer