// GATE.ac.uk - wiki/code-repository/src/sheffield/examples/InformationRetrievalApp.java

package sheffield.examples;

/*
 *  InformationRetrievalApp.java
 *
 *  Copyright (c) 1998-2003, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Marin Dimitrov, 27/Jan/2003
 *
 *  $Id: InformationRetrievalApp.java,v 1.4 2004/12/14 14:36:24 niraj Exp $
 */
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;

import gate.Corpus;
import gate.DataStore;
import gate.Document;
import gate.Factory;
import gate.Gate;
import gate.creole.Plugin;
import gate.creole.ResourceInstantiationException;
import gate.creole.ir.DefaultIndexDefinition;
import gate.creole.ir.DocumentContentReader;
import gate.creole.ir.IndexField;
import gate.creole.ir.IndexedCorpus;
import gate.creole.ir.QueryResult;
import gate.creole.ir.QueryResultList;
import gate.creole.ir.Search;
import gate.util.Err;
import gate.util.GateException;
import gate.util.Out;


/**
 * Stand-alone example that shows how to index a GATE corpus with the
 * information-retrieval plugin (Lucene engine) and run full-text queries
 * against the resulting index.
 *
 * <p>The walk-through in {@link #main(String[])} is:
 * initialise GATE, load the IR plugin, create a serial datastore, build a
 * two-document corpus, persist it, index it, and finally execute two queries
 * and print the matching documents.</p>
 *
 * <p>Usage: {@code InformationRetrievalApp [indexLocation [dataStoreUrl]]}.
 * Both arguments are optional; when omitted the built-in defaults below are
 * used.</p>
 */
public class InformationRetrievalApp {

  /**
   * Default folder of the Lucene index.
   * NOTE: this must be a plain file system path, NOT a URL.
   */
  private static final String INDEX_LOCATION = "c:/temp/lucene";

  /**
   * Default location of the serial datastore, given as a URL.
   * NOTE(review): the original comment beside this constant read
   * "the index folder must be EMPTY". The index itself is deleted before it
   * is (re)created in main, so the remark presumably refers to the datastore
   * folder, which is created from scratch on every run - confirm.
   */
  private static final String SERIAL_DATASTORE_PATH = "file:///c:/temp/gate_corpus";

  /** Class name used to load persisted documents back from the datastore. */
  private static final String DOCUMENT_CLASS_NAME = "gate.corpora.DocumentImpl";

  public InformationRetrievalApp() {
  }

  /**
   * Runs the complete indexing and search example.
   *
   * @param args optional overrides: {@code args[0]} is the index location
   *             (plain path), {@code args[1]} is the datastore URL. Missing
   *             (or {@code null}) arguments fall back to the defaults.
   */
  public static void main(String[] args) {

    String indexLocation = argOrDefault(args, 0, INDEX_LOCATION);
    String dataStorePath = argOrDefault(args, 1, SERIAL_DATASTORE_PATH);

    //init GATE
    //    this is the first thing to be done
    try {
      Gate.init();
      Out.prln("GATE initialised...");
    }
    catch (GateException gex) {
      Err.prln("cannot initialise GATE...");
      gex.printStackTrace(Err.getPrintWriter());
      return;
    }

    //declared outside the try so that it can be closed in the finally block
    DataStore sds = null;
    try {
      //load the IR plugin
      Gate.getCreoleRegister().registerPlugin(new Plugin.Maven("uk.ac.gate.plugins","information-retrieval","8.5"));

      //1. create and open a serial data store
      sds = Factory.createDataStore("gate.persist.SerialDataStore", dataStorePath);
      sds.open();

      //2. create transient corpus with 2 docs
      InformationRetrievalApp irApp = new InformationRetrievalApp();
      Corpus corpus = irApp.createTestCorpus();

      //3. serialize corpus in Serial datastore
      Corpus serialCorpus = (Corpus)sds.adopt(corpus);

      //4. sync datastore
      sds.sync(serialCorpus);

      //5. index the serialized corpus
      IndexedCorpus indexedCorpus = (IndexedCorpus)serialCorpus;

      //5.1. create Index definition that tells the IR engine how to index the corpus
      DefaultIndexDefinition did = new DefaultIndexDefinition();
      //5.2. use the Lucene IR engine (the only option at present)
      //    may be changed in the future
      did.setIrEngineClassName("gate.creole.ir.lucene.LuceneIREngine");
      //5.3. specify index location - this is different from the location of the serialized corpus
      did.setIndexLocation(indexLocation);
      //5.4. specify fields to be indexed and their respective Readers
      //    if the field being indexed is a document feature then pass NULL as the reader: the default
      //    FieldReader would search the document features for a feature with the same name and will
      //    index its value (expected String)
      did.addIndexField(new IndexField("author", null, false));
      //    for the document content specify a predefined FieldReader called DocumentContentReader
      //    that will index the content of the document
      did.addIndexField(new IndexField("content", new DocumentContentReader(), false));
      //    any other things to be indexed such as custom annotations and their features
      //    ...require a custom FieldReader to be created and specified in the same manner

      //5.5 finally tell the indexed corpus to use the above index definition
      indexedCorpus.setIndexDefinition(did);

      //5.6 ask the IndexManager to create the index (delete it beforehand if already existing)
      indexedCorpus.getIndexManager().deleteIndex();
      indexedCorpus.getIndexManager().createIndex();

      //now we have the two documents indexed and FTS queries may be specified for them
      //    using the respective IR manager (Lucene) search syntax

      //6. (optionally) optimize index
      //    with time indexes become suboptimal with additions/removal of new documents
      //    optimize the index from time to time for better performance
      //    on large indexes this will take some time since it involves index recreation
      indexedCorpus.getIndexManager().optimizeIndex();

      //7. search in index

      //7.1.  create the proper Search subclass
      //    since we're using the Lucene IR engine, use LuceneSearch
      //    (Class.newInstance() is deprecated, so go through the no-arg constructor)
      Search search = (Search)Gate.getClassLoader()
          .loadClass("gate.creole.ir.lucene.LuceneSearch")
          .getDeclaredConstructor()
          .newInstance();
      //    the Search instance needs to know which corpus to search
      search.setCorpus(indexedCorpus);
      //    ...and the query to be performed
      String query = "+content:\"until there's a cure\" +author:foundation";
      //    ...this query looks for documents whose "author" field matches "foundation"
      //    and that contain the phrase "until there's a cure" in the content (the until.org page used
      //    for document2 is such)

      //7.2 execute query
      QueryResultList res = search.search(query);
      //7.3 show results - we expect just one document printed
      printResults("Query1", res, sds);

      //8. execute a second query
      String query2 = "+author:foundation";
      //    ...this query looks for documents whose "author" field matches "foundation"

      //8.1 execute query
      QueryResultList res2 = search.search(query2);
      //8.2 show results - we expect two documents printed
      printResults("Query2", res2, sds);

      Out.prln("done...");

    }
    catch(Exception ex) {
      ex.printStackTrace(Err.getPrintWriter());
    }
    finally {
      //9. always release the datastore, even when one of the steps above failed
      closeDataStore(sds);
    }

  }

  /**
   * Creates a transient corpus holding two documents loaded from the web.
   * Each document gets a name and an "author" feature; main indexes that
   * feature and queries on it.
   *
   * @return a new corpus named "test corpus" containing the two documents
   * @throws MalformedURLException if one of the document URLs is malformed
   * @throws ResourceInstantiationException if GATE cannot create a document
   *         or the corpus
   */
  public Corpus createTestCorpus()
    throws MalformedURLException, ResourceInstantiationException {

    Document doc1 = Factory.newDocument(new URL("http://www.wish.org/"));
    //add a dummy feature that will be indexed
    doc1.getFeatures().put("author","Make-A-Wish Foundation");
    doc1.setName("Make-A-Wish document");
    //for a local test file use e.g. new URL("file:///c:/temp/test.txt") instead

    Document doc2 = Factory.newDocument(new URL("http://www.until.org"));
    //add a dummy feature that will be indexed
    doc2.getFeatures().put("author","Until There's A Cure Foundation");
    doc2.setName("until.org document");

    //sanity checks - only evaluated when the JVM runs with assertions enabled (-ea)
    assert doc1!=null && doc2!=null;

    // create a corpus with the above documents
    Corpus result = Factory.newCorpus("test corpus");
    assert result != null;
    result.add(doc1);
    result.add(doc2);

    return result;
  }

  /**
   * Returns the command line argument at the given position, or the fallback
   * when the argument array is null or too short.
   */
  private static String argOrDefault(String[] args, int index, String fallback) {
    return (args != null && args.length > index) ? args[index] : fallback;
  }

  /**
   * Prints name, score and "author" feature of every hit in a result list.
   * The result list only holds (document id, score) pairs, so each document
   * is loaded from the datastore by its id.
   *
   * @param label   prefix written in front of every output line
   * @param results hits returned by the search
   * @param sds     datastore the indexed documents are stored in
   * @throws GateException if a document cannot be read from the datastore
   */
  private static void printResults(String label, QueryResultList results, DataStore sds)
    throws GateException {

    //Iterator<?> plus a cast avoids a raw type without depending on how the
    //    result list declares its iterator
    Iterator<?> it = results.getQueryResults();
    while (it.hasNext()) {
      QueryResult qr = (QueryResult) it.next();
      float score = qr.getScore();
      Document resultDoc = (Document) sds.getLr(DOCUMENT_CLASS_NAME, qr.getDocumentID());
      Out.prln(label + ": DOC_NAME=" + resultDoc.getName());
      Out.prln(label + ": score = " + score);
      Out.prln(label + ": author = " + resultDoc.getFeatures().get("author"));
      Out.prln("------------");
    }
  }

  /**
   * Closes the datastore if one was created, reporting (rather than
   * propagating) a failure so that it cannot mask an earlier error.
   */
  private static void closeDataStore(DataStore sds) {
    if (sds == null) {
      return;
    }
    try {
      sds.close();
    }
    catch (GateException closeEx) {
      Err.prln("cannot close the datastore...");
      closeEx.printStackTrace(Err.getPrintWriter());
    }
  }

}