1   /*
2    *  CorpusSaver.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 22/Nov/2001
12   *
13   *  $Id: CorpusSaver.java,v 1.8 2003/07/10 13:53:43 kalina Exp $
14   */
15  
16  package gate.util;
17  
18  import java.util.*;
19  import java.io.*;
20  
21  import gate.*;
22  import gate.creole.*;
23  import gate.util.*;
24  import gate.persist.*;
25  import java.net.*;
26  import java.text.NumberFormat;
27  
28  import gate.creole.tokeniser.*;
29  import gate.creole.gazetteer.*;
30  import gate.creole.splitter.*;
31  import gate.creole.orthomatcher.*;
32  import gate.creole.annotransfer.*;
33  import gate.creole.annotdelete.*;
34  
35  public class CorpusSaver {
36  
37    private static final boolean DEBUG = true;
38  
39    public CorpusSaver() {
40    }
41  
42    public void init() {
43      if (saveMode) {
44        File path = new File(dsPath);
45        try {
46         ds = Factory.openDataStore("gate.persist.SerialDataStore",
47                                    path.toURL().toString());
48        } catch (Exception ex) {
49          throw new gate.util.GateRuntimeException(ex.getMessage());
50        }
51  
52        try {
53          Corpus corpus = Factory.newCorpus("bnc");
54          LanguageResource lr = ds.adopt(corpus, null);
55          ds.sync(lr);
56          theCorpus = (Corpus) lr;
57        } catch (Exception ex) {
58          throw new GateRuntimeException(ex.getMessage());
59        }
60      }
61  
62      if (processMode)
63        initPRs();
64  
65    }
66  
67    public void initPRs() {
68      try {
69        if (applicationFile == null)
70          Out.prln("Application not set!");
71        Out.prln("App file is: " + applicationFile.getAbsolutePath());
72        application = (Controller) gate.util.persistence.PersistenceManager
73                                     .loadObjectFromFile(applicationFile);
74      } catch (Exception ex) {
75        throw new GateRuntimeException("Corpus Saver: "+ex.getMessage());
76      }
77    }//initPRs
78  
79    public void execute() {
80      execute(startDir);
81      try {
82        if (saveMode) {
83          ds.sync(theCorpus);
84          Factory.deleteResource(theCorpus);
85          if (ds != null)
86            ds.close();
87        }
88        if (application != null) {
89          Iterator iter = new ArrayList(application.getPRs()).iterator();
90          while (iter.hasNext())
91            Factory.deleteResource((Resource) iter.next());
92        }
93      } catch (Exception ex) {
94        throw new GateRuntimeException(ex.getMessage());
95      }
96    }
97  
98    public void execute(File dir) {
99      if (dir == null || (saveMode && ds == null))
100       return;
101     //first set the current directory to be the given one
102     currDir = dir;
103     Out.prln("Processing directory: " + currDir);
104 
105     ArrayList files = new ArrayList();
106     ArrayList dirs = new ArrayList();
107     File[] dirArray = currDir.listFiles();
108     for (int i = 0; i < dirArray.length; i++) {
109       if (dirArray[i].isDirectory())
110         dirs.add(dirArray[i]);
111       else if (dirArray[i].isFile())
112         files.add(dirArray[i]);
113     }
114 
115     saveFiles(files);
116 
117     //if no more subdirs left, return
118     if (dirs.isEmpty())
119       return;
120 
121     //there are more subdirectories to traverse, so iterate through
122     for (int j = 0; j < dirs.size(); j++)
123       execute((File) dirs.get(j));
124 
125   }//execute(dir)
126 
127 
128   public static void main(String[] args) throws GateException {
129     Gate.init();
130 
131     CorpusSaver corpusSaver1 = new CorpusSaver();
132 
133     if(args.length < 2)
134       throw new GateException("usage: [-process|-process-only] source_directory datastore_path application");
135     int i = 0;
136     while (i < args.length && args[i].startsWith("-")) {
137       if(args[i].equals("-process")) {
138         Out.prln("Processing and saving the corpus enabled. <P>");
139         corpusSaver1.setProcessMode(true);
140       } else if (args[i].equals("-process_only")) {
141         Out.prln("Processing only enabled. <P>");
142         corpusSaver1.setSaveMode(false);
143         corpusSaver1.setProcessMode(true);
144       }
145       i++; //just ignore the option, which we do not recognise
146     }//while
147 
148     String dirName = args[i];
149     File dir = new File(dirName);
150     if (!dir.isDirectory())
151       throw new GateRuntimeException("Corpus directory should be "
152                                      + "provided as a parameter");
153 
154     if(i+1 >= args.length)
155       throw new GateRuntimeException("Datastore path not provided");
156 
157     if (corpusSaver1.getSaveMode()) {
158       String storagePath = args[i + 1];
159       File storage = new File(storagePath);
160       if (!storage.isDirectory())
161         throw new GateRuntimeException("Please provide path to an existing "
162                                        + "GATE serial datastore");
163       corpusSaver1.setDSPath(storagePath);
164     }
165 
166     //get the last argument which is the application
167     if (corpusSaver1.getProcessMode()) {
168       i++;
169       String appName = args[i];
170       File appFile = new File(appName);
171       if (!appFile.isFile())
172         throw new GateException("Please provide an existing GATE application");
173       else
174         corpusSaver1.setApplicationFile(appFile);
175     }
176 
177     Out.prln("Initialising GATE please wait...");
178     corpusSaver1.init();
179     corpusSaver1.setStartDir(dir);
180     Out.prln("Processing...");
181     double timeBefore = System.currentTimeMillis();
182     corpusSaver1.execute();
183     double timeAfter = System.currentTimeMillis();
184     Out.prln("Done in " +
185       NumberFormat.getInstance().format((timeAfter-timeBefore)/1000)
186       + " seconds");
187 
188   }
189 
190   public void setStartDir(File newDir) {
191     startDir = newDir;
192   }
193 
194   public void setProcessMode(boolean mode) {
195     processMode = mode;
196   }
197 
198   public boolean getProcessMode() {
199     return processMode;
200   }
201 
202   public void setSaveMode(boolean mode) {
203     saveMode = mode;
204   }
205 
206   public boolean getSaveMode() {
207     return saveMode;
208   }
209 
210   public void setDSPath(String path){
211     dsPath = path;
212   }
213 
214   public void setApplicationFile(File newAppFile) {
215     applicationFile = newAppFile;
216   }
217 
218 
219   protected void saveFiles(List files) {
220     if (files==null || files.isEmpty() ||
221         (saveMode && (theCorpus == null || ds == null)))
222       return;
223 
224     for(int i=0; i<files.size(); i++) {
225       try {
226         Document doc = Factory.newDocument(((File)files.get(i)).toURL());
227         doc.setName(Files.getLastPathComponent(((File)files.get(i)).toURL().toString()));
228         Out.prln("Storing document: " + doc.getName());
229         //first process it with ANNIE if in process mode
230         if (processMode)
231           processDocument(doc);
232 
233         //then store it in the DS and add to corpus
234         if (saveMode) {
235           LanguageResource lr = ds.adopt(doc, null);
236           theCorpus.add(lr);
237           theCorpus.unloadDocument( (Document) lr);
238 
239           if (lr != doc)
240             Factory.deleteResource(lr);
241         }
242         Factory.deleteResource(doc);
243       } catch (Exception ex) {
244         throw new GateRuntimeException(ex.getClass() + " " + ex.getMessage());
245       }
246     }//for
247   }//saveFiles
248 
249   protected void processDocument(Document doc) {
250     try {
251       if (application instanceof CorpusController) {
252         Corpus tempCorpus = Factory.newCorpus("temp");
253         tempCorpus.add(doc);
254         ((CorpusController)application).setCorpus(tempCorpus);
255         application.execute();
256         Factory.deleteResource(tempCorpus);
257         tempCorpus = null;
258       } else {
259         Iterator iter = application.getPRs().iterator();
260         while (iter.hasNext())
261           ((ProcessingResource) iter.next()).setParameterValue("document", doc);
262         application.execute();
263       }
264     } catch (ResourceInstantiationException ex) {
265       throw new RuntimeException("Error executing application: "
266                                     + ex.getMessage());
267     } catch (ExecutionException ex) {
268       throw new RuntimeException("Error executing application: "
269                                     + ex.getMessage());
270     }
271   }
272 
273 
274   /**
275    * The directory from which we should generate/evaluate the corpus
276    */
277   private File startDir;
278   private File currDir;
279 
280   private DataStore ds;
281   private Corpus theCorpus;
282   private String annotSetName = "NE";
283   private String dsPath = "d:\\bnc";
284   private Controller application = null;
285   private File applicationFile = null;
286 
287   private boolean processMode = false;
288   private boolean saveMode = true;
289 }
290