1   /*
2    *  LuceneIndexManager.java
3    *
4    *  Copyright (c) 1998-2004, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Rosen Marinov, 19/Apr/2002
12   *
13   */
14  
15  package gate.creole.ir.lucene;
16  
17  import java.io.File;
18  import java.util.Iterator;
19  import java.util.List;
20  
21  import org.apache.lucene.analysis.SimpleAnalyzer;
22  import org.apache.lucene.document.Field;
23  import org.apache.lucene.index.IndexReader;
24  import org.apache.lucene.index.IndexWriter;
25  
26  import gate.Corpus;
27  import gate.creole.ir.*;
28  import gate.util.GateRuntimeException;
29  
30  /** This class represents Lucene implementation of IndexManeager interface.*/
31  public class LuceneIndexManager implements IndexManager{
32  
33    /** used in Lucene Documents as a key for gate document ID value. */
34    public final static String DOCUMENT_ID = "DOCUMENT_ID";
35  
36    /** IndexDefinition - location, type, fields, etc.*/
37    private IndexDefinition indexDefinition;
38  
39    /** An corpus for indexing*/
40    private Corpus corpus;
41  
42    /* Niraj */
43    /** constant that ensures that corpus is indexed with IR plugin */
44    public final static String CORPUS_INDEX_FEATURE = "CorpusIndexFeature";
45    public final static String CORPUS_INDEX_FEATURE_VALUE = "IR";
46    /* End */
47  
48    /** Constructor of the class. */
49    public LuceneIndexManager(){
50    }
51  
52    /** Creates index directory and indexing all
53     *  documents in the corpus. */
54    public void createIndex() throws IndexException{
55      if(indexDefinition == null)
56        throw new GateRuntimeException("Index definition is null!");
57      if(corpus == null)
58        throw new GateRuntimeException("Corpus is null!");
59  
60      String location = indexDefinition.getIndexLocation();
61      try {
62        File file = new File(location);
63        if (file.exists()){
64          if (file.isDirectory() && file.listFiles().length>0) {
65            throw new IndexException(location+ " is not empty directory");
66          }
67          if (!file.isDirectory()){
68            throw new IndexException("Only empty directory can be index path");
69          }
70        }
71  
72        /* Niraj */
73        // ok so lets put the corpus index feature
74        corpus.getFeatures().put(CORPUS_INDEX_FEATURE, CORPUS_INDEX_FEATURE_VALUE);
75        /* End */
76  
77        IndexWriter writer = new IndexWriter(location,
78                                             new SimpleAnalyzer(), true);
79  
80        for(int i = 0; i<corpus.size(); i++) {
81          boolean isLoaded = corpus.isDocumentLoaded(i);
82          gate.Document gateDoc = (gate.Document) corpus.get(i);
83          writer.addDocument(getLuceneDoc(gateDoc));
84          if (!isLoaded) {
85            corpus.unloadDocument(gateDoc);
86          }
87        }//for (all documents)
88  
89        writer.close();
90        corpus.sync();
91      } catch (java.io.IOException ioe){
92        throw new IndexException(ioe.getMessage());
93      } catch (gate.persist.PersistenceException pe){
94        pe.printStackTrace();
95      } catch (gate.security.SecurityException se){
96        se.printStackTrace();
97      }
98    }
99  
100   /** Optimize existing index. */
101   public void optimizeIndex() throws IndexException{
102     if(indexDefinition == null)
103       throw new GateRuntimeException("Index definition is null!");
104     try {
105       IndexWriter writer = new IndexWriter(indexDefinition.getIndexLocation(),
106                                          new SimpleAnalyzer(), false);
107       writer.optimize();
108       writer.close();
109     } catch (java.io.IOException ioe){
110       throw new IndexException(ioe.getMessage());
111     }
112   }
113 
114   /** Delete index. */
115   public void deleteIndex() throws IndexException{
116     if(indexDefinition == null)
117       throw new GateRuntimeException("Index definition is null!");
118     boolean isDeleted = true;
119     File dir = new File(indexDefinition.getIndexLocation());
120     if (dir.exists() && dir.isDirectory()) {
121       File[] files = dir.listFiles();
122       for (int i =0; i<files.length; i++){
123         File f = files[i];
124         isDeleted = f.delete();
125       }
126     }
127     dir.delete();
128     if (!isDeleted) {
129       throw new IndexException("Can't delete directory"
130                                + indexDefinition.getIndexLocation());
131     }
132   }
133 
134   /** Reindexing changed documents, removing removed documents and
135    *  add to the index new corpus documents. */
136   public void sync(List added, List removedIDs, List changed) throws IndexException{
137     String location = indexDefinition.getIndexLocation();
138     try {
139 
140       IndexReader reader = IndexReader.open(location);
141 
142       for (int i = 0; i<removedIDs.size(); i++) {
143         String id = removedIDs.get(i).toString();
144         org.apache.lucene.index.Term term =
145                                new org.apache.lucene.index.Term(DOCUMENT_ID,id);
146         reader.delete(term);
147       }//for (remove all removed documents)
148 
149       for (int i = 0; i<changed.size(); i++) {
150         gate.Document gateDoc = (gate.Document) changed.get(i);
151         String id = gateDoc.getLRPersistenceId().toString();
152         org.apache.lucene.index.Term term =
153                                new org.apache.lucene.index.Term(DOCUMENT_ID,id);
154         reader.delete(term);
155       }//for (remove all changed documents)
156 
157       reader.close();
158 
159       IndexWriter writer = new IndexWriter(location,
160                                           new SimpleAnalyzer(), false);
161 
162       for(int i = 0; i<added.size(); i++) {
163         gate.Document gateDoc = (gate.Document) added.get(i);
164         writer.addDocument(getLuceneDoc(gateDoc));
165       }//for (add all added documents)
166 
167       for(int i = 0; i<changed.size(); i++) {
168         gate.Document gateDoc = (gate.Document) changed.get(i);
169         writer.addDocument(getLuceneDoc(gateDoc));
170       }//for (add all changed documents)
171 
172       writer.close();
173     } catch (java.io.IOException ioe) {
174       throw new IndexException(ioe.getMessage());
175     }
176   }
177 
178   private org.apache.lucene.document.Document getLuceneDoc(gate.Document gateDoc){
179     org.apache.lucene.document.Document luceneDoc =
180                                      new org.apache.lucene.document.Document();
181     Iterator fields = indexDefinition.getIndexFields();
182 
183     luceneDoc.add(Field.Keyword(DOCUMENT_ID,
184                                 gateDoc.getLRPersistenceId().toString()));
185 
186     while (fields.hasNext()) {
187       IndexField field = (IndexField) fields.next();
188       String valueForIndexing;
189 
190       if (field.getReader() == null){
191         valueForIndexing = gateDoc.getFeatures().get(field.getName()).toString();
192       } else {
193         valueForIndexing = field.getReader().getPropertyValue(gateDoc);
194       } //if-else reader or feature
195 
196       if (field.isPreseved()) {
197         luceneDoc.add(Field.Keyword(field.getName(),valueForIndexing));
198       } else {
199         luceneDoc.add(Field.UnStored(field.getName(),valueForIndexing));
200       } // if-else keyword or text
201 
202     }// while (add all fields)
203 
204     return luceneDoc;
205   }
206 
207   public Corpus getCorpus() {
208     return corpus;
209   }
210   public void setCorpus(Corpus corpus) {
211     this.corpus = corpus;
212   }
213   public IndexDefinition getIndexDefinition() {
214     return indexDefinition;
215   }
216   public void setIndexDefinition(IndexDefinition indexDefinition) {
217     this.indexDefinition = indexDefinition;
218   }
219 
220 }