package sheffield.examples; /* * InformationRetrievalApp.java * * Copyright (c) 1998-2003, The University of Sheffield. * * This file is part of GATE (see http://gate.ac.uk/), and is free * software, licenced under the GNU Library General Public License, * Version 2, June 1991 (in the distribution as file licence.html, * and also available at http://gate.ac.uk/gate/licence.html). * * Marin Dimitrov, 27/Jan/2003 * * $Id: InformationRetrievalApp.java,v 1.4 2004/12/14 14:36:24 niraj Exp $ */ import java.net.MalformedURLException; import java.net.URL; import java.util.Iterator; import gate.Corpus; import gate.DataStore; import gate.Document; import gate.Factory; import gate.Gate; import gate.creole.Plugin; import gate.creole.ResourceInstantiationException; import gate.creole.ir.DefaultIndexDefinition; import gate.creole.ir.DocumentContentReader; import gate.creole.ir.IndexField; import gate.creole.ir.IndexedCorpus; import gate.creole.ir.QueryResult; import gate.creole.ir.QueryResultList; import gate.creole.ir.Search; import gate.util.Err; import gate.util.GateException; import gate.util.Out; public class InformationRetrievalApp { private static String INDEX_LOCATION = "c:/temp/lucene"; //NOTE: the above path should NOT be URL style! private static String SERIAL_DATASTORE_PATH = "file:///c:/temp/gate_corpus"; //the index folder must be EMPTY public InformationRetrievalApp() { } public static void main(String[] args) { InformationRetrievalApp irApp = new InformationRetrievalApp(); //init GATE // this is the first thing to be done try { Gate.init(); Out.prln("GATE initialised..."); } catch (GateException gex) { Err.prln("cannot initialise GATE..."); gex.printStackTrace(); return; } try{ //load the IR plugin Gate.getCreoleRegister().registerPlugin(new Plugin.Maven("uk.ac.gate.plugins","information-retrieval","8.5")); //1. create and open a serial data store DataStore sds = Factory.createDataStore("gate.persist.SerialDataStore", SERIAL_DATASTORE_PATH); sds.open(); //2. create transient corpus with 2 docs Corpus corpus = irApp.createTestCorpus(); //3. serialize corpus in Serial datastore Corpus serialCorpus = (Corpus)sds.adopt(corpus); //4. sync datastore sds.sync(serialCorpus); //5. index the serialized corpus IndexedCorpus indexedCorpus = (IndexedCorpus)serialCorpus; //5.1. create Index definition that tells the IR engine how to index the corpus DefaultIndexDefinition did = new DefaultIndexDefinition(); //5.1. use the Lucene IR engine (the only option at present) // may be changed in the future did.setIrEngineClassName("gate.creole.ir.lucene.LuceneIREngine"); //5.2. specify index location - this is different from the location of the serialized corpus did.setIndexLocation(INDEX_LOCATION); //5.3. specify fields to be indexed and their respective Readers // if the field being indexed is document feature then set NULL as the default // FieldReader would search the document features for a feature with the same name and will // index its value (expected String) did.addIndexField(new IndexField("author", null, false)); // for the document content specify a predefined FieldReader called DocumentCOntentReader // that will index the content of the document did.addIndexField(new IndexField("content", new DocumentContentReader(), false)); // any other things to be indexed such as custom annotations and their features // ...require a custom FieldReader to be created and spceified in the same manner //5.4 finally tell the indexed corpus (that will be created) to use the above // index definition indexedCorpus.setIndexDefinition(did); //5.5 ask the IndexManager to create the index (delete it beforehand if already existing) indexedCorpus.getIndexManager().deleteIndex(); indexedCorpus.getIndexManager().createIndex(); //now we have the two documents indexed and FTS queries may be specified for them // using the respective IR manager (Lucene) search syntax //6. (optionally) optimize index // with time indexes become suboptimal with additions/removal of new documents // optimize the index from time to time for better performance // on large indexes this will take some time since it involves index recreation indexedCorpus.getIndexManager().optimizeIndex(); //7. search in index //7.2. create the proper Search subclass // since we're using the Lucene IR engine, use LuceneSearch Search search = (Search)Gate.getClassLoader().loadClass("gate.creole.ir.lucene.LuceneSearch").newInstance(); // the Search instance needs to know which corpus to search search.setCorpus(indexedCorpus); // ...and the query to be performed String query = "+content:\"until there's a cure\" +author:foundation"; // ...this query looks for documents that has "author" field equal to "foundation" // and contain the phrase "until there's a cure" in the content (the until.org page used // for document2 is such) //7.3 execute query QueryResultList res = search.search(query); //7.4 ...and get results Iterator it = res.getQueryResults(); //7.5 show results while (it.hasNext()) { QueryResult qr = (QueryResult) it.next(); float score = qr.getScore(); //the resultset contains (doc_id, relevance) pairs // in order to get the real document, the corpus shoudl be used Document resultDoc = (Document) sds.getLr("gate.corpora.DocumentImpl", qr.getDocumentID()); Out.prln("Query1: DOC_NAME=" + resultDoc.getName()); Out.prln("Query1: score = " + score); Out.prln("Query1: author = " + resultDoc.getFeatures().get("author")); Out.prln("------------"); //we expect just one document printed } //8. execute a second query String query2 = "+author:foundation"; // ...this query looks for documents that has "author" field equal to "foundation" //8.1 execute query QueryResultList res2 = search.search(query2); //8.2 ...and get results Iterator it2 = res2.getQueryResults(); //8.3 show results while (it2.hasNext()) { QueryResult qr = (QueryResult) it2.next(); float score = qr.getScore(); //the resultset contains (doc_id, relevance) pairs // in order to get the real document, the corpus shoudl be used Document resultDoc = (Document) sds.getLr("gate.corpora.DocumentImpl", qr.getDocumentID()); Out.prln("Query2: DOC_NAME=" + resultDoc.getName()); Out.prln("Query2: score = " + score); Out.prln("Query2: author = " + resultDoc.getFeatures().get("author")); Out.prln("------------"); //we expect two documents printed } Out.prln("done..."); } catch(Exception ex) { ex.printStackTrace(Err.getPrintWriter()); } } public Corpus createTestCorpus() throws MalformedURLException, ResourceInstantiationException { Document doc1 = Factory.newDocument(new URL("http://www.wish.org/")); //add a dummy feature that will be indexed doc1.getFeatures().put("author","Make-A-Wish Foundation"); doc1.setName("Make-A-Wish document"); //Document doc1 = Factory.newDocument(new URL("file:///c:/temp/test.txt")); Document doc2 = Factory.newDocument(new URL("http://www.until.org")); //add a dummy feature that will be indexed doc2.getFeatures().put("author","Until There's A Cure Foundation"); doc2.setName("until.org document"); assert doc1!=null && doc2!=null; // create a corpus with the above documents Corpus result = Factory.newCorpus("test corpus"); assert result != null; result.add(doc1); result.add(doc2); return result; } }