1   /*
2    *  LuceneIndexManager.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Rosen Marinov, 19/Apr/2002
12   *
13   */
14  
15  package gate.creole.ir.lucene;
16  
17  import gate.*;
18  import gate.util.*;
19  import gate.creole.ir.*;
20  
21  import org.apache.lucene.index.*;
22  import org.apache.lucene.analysis.*;
23  import org.apache.lucene.document.*;
24  import org.apache.lucene.search.*;
25  import org.apache.lucene.queryParser.*;
26  import org.apache.lucene.store.*;
27  
28  import java.io.*;
29  import java.util.*;
30  
31  /** This class represents Lucene implementation of IndexManeager interface.*/
32  public class LuceneIndexManager implements IndexManager{
33  
34    /** used in Lucene Documents as a key for gate document ID value. */
35    public final static String DOCUMENT_ID = "DOCUMENT_ID";
36  
37    /** IndexDefinition - location, type, fields, etc.*/
38    private IndexDefinition indexDefinition;
39  
40    /** An corpus for indexing*/
41    private Corpus corpus;
42  
43    /** Constructor of the class. */
44    public LuceneIndexManager(){
45    }
46  
47    /** Creates index directory and indexing all
48     *  documents in the corpus. */
49    public void createIndex() throws IndexException{
50      if(indexDefinition == null)
51        throw new GateRuntimeException("Index definition is null!");
52      if(corpus == null)
53        throw new GateRuntimeException("Corpus is null!");
54  
55      String location = indexDefinition.getIndexLocation();
56      try {
57        File file = new File(location);
58        if (file.exists()){
59          if (file.isDirectory() && file.listFiles().length>0) {
60            throw new IndexException(location+ " is not empty directory");
61          }
62          if (!file.isDirectory()){
63            throw new IndexException("Only empty directory can be index path");
64          }
65        }
66  
67        IndexWriter writer = new IndexWriter(location,
68                                             new SimpleAnalyzer(), true);
69  
70        for(int i = 0; i<corpus.size(); i++) {
71          boolean isLoaded = corpus.isDocumentLoaded(i);
72          gate.Document gateDoc = (gate.Document) corpus.get(i);
73          writer.addDocument(getLuceneDoc(gateDoc));
74          if (!isLoaded) {
75            corpus.unloadDocument(gateDoc);
76          }
77        }//for (all documents)
78  
79        writer.close();
80        corpus.sync();
81      } catch (java.io.IOException ioe){
82        throw new IndexException(ioe.getMessage());
83      } catch (gate.persist.PersistenceException pe){
84        pe.printStackTrace();
85      } catch (gate.security.SecurityException se){
86        se.printStackTrace();
87      }
88    }
89  
90    /** Optimize existing index. */
91    public void optimizeIndex() throws IndexException{
92      if(indexDefinition == null)
93        throw new GateRuntimeException("Index definition is null!");
94      try {
95        IndexWriter writer = new IndexWriter(indexDefinition.getIndexLocation(),
96                                           new SimpleAnalyzer(), false);
97        writer.optimize();
98        writer.close();
99      } catch (java.io.IOException ioe){
100       throw new IndexException(ioe.getMessage());
101     }
102   }
103 
104   /** Delete index. */
105   public void deleteIndex() throws IndexException{
106     if(indexDefinition == null)
107       throw new GateRuntimeException("Index definition is null!");
108     boolean isDeleted = true;
109     File dir = new File(indexDefinition.getIndexLocation());
110     if (dir.exists() && dir.isDirectory()) {
111       File[] files = dir.listFiles();
112       for (int i =0; i<files.length; i++){
113         File f = files[i];
114         isDeleted = f.delete();
115       }
116     }
117     dir.delete();
118     if (!isDeleted) {
119       throw new IndexException("Can't delete directory"
120                                + indexDefinition.getIndexLocation());
121     }
122   }
123 
124   /** Reindexing changed documents, removing removed documents and
125    *  add to the index new corpus documents. */
126   public void sync(List added, List removedIDs, List changed) throws IndexException{
127     String location = indexDefinition.getIndexLocation();
128     try {
129 
130       IndexReader reader = IndexReader.open(location);
131 
132       for (int i = 0; i<removedIDs.size(); i++) {
133         String id = removedIDs.get(i).toString();
134         org.apache.lucene.index.Term term =
135                                new org.apache.lucene.index.Term(DOCUMENT_ID,id);
136         reader.delete(term);
137       }//for (remove all removed documents)
138 
139       for (int i = 0; i<changed.size(); i++) {
140         gate.Document gateDoc = (gate.Document) changed.get(i);
141         String id = gateDoc.getLRPersistenceId().toString();
142         org.apache.lucene.index.Term term =
143                                new org.apache.lucene.index.Term(DOCUMENT_ID,id);
144         reader.delete(term);
145       }//for (remove all changed documents)
146 
147       reader.close();
148 
149       IndexWriter writer = new IndexWriter(location,
150                                           new SimpleAnalyzer(), false);
151 
152       for(int i = 0; i<added.size(); i++) {
153         gate.Document gateDoc = (gate.Document) added.get(i);
154         writer.addDocument(getLuceneDoc(gateDoc));
155       }//for (add all added documents)
156 
157       for(int i = 0; i<changed.size(); i++) {
158         gate.Document gateDoc = (gate.Document) changed.get(i);
159         writer.addDocument(getLuceneDoc(gateDoc));
160       }//for (add all changed documents)
161 
162       writer.close();
163     } catch (java.io.IOException ioe) {
164       throw new IndexException(ioe.getMessage());
165     }
166   }
167 
168   private org.apache.lucene.document.Document getLuceneDoc(gate.Document gateDoc){
169     org.apache.lucene.document.Document luceneDoc =
170                                      new org.apache.lucene.document.Document();
171     Iterator fields = indexDefinition.getIndexFields();
172 
173     luceneDoc.add(Field.Keyword(DOCUMENT_ID,
174                                 gateDoc.getLRPersistenceId().toString()));
175 
176     while (fields.hasNext()) {
177       IndexField field = (IndexField) fields.next();
178       String valueForIndexing;
179 
180       if (field.getReader() == null){
181         valueForIndexing = gateDoc.getFeatures().get(field.getName()).toString();
182       } else {
183         valueForIndexing = field.getReader().getPropertyValue(gateDoc);
184       } //if-else reader or feature
185 
186       if (field.isPreseved()) {
187         luceneDoc.add(Field.Keyword(field.getName(),valueForIndexing));
188       } else {
189         luceneDoc.add(Field.UnStored(field.getName(),valueForIndexing));
190       } // if-else keyword or text
191 
192     }// while (add all fields)
193 
194     return luceneDoc;
195   }
196 
197   public Corpus getCorpus() {
198     return corpus;
199   }
200   public void setCorpus(Corpus corpus) {
201     this.corpus = corpus;
202   }
203   public IndexDefinition getIndexDefinition() {
204     return indexDefinition;
205   }
206   public void setIndexDefinition(IndexDefinition indexDefinition) {
207     this.indexDefinition = indexDefinition;
208   }
209 
210 }