/*
 * MediaWikiPopulater.java
 *
 * Copyright (c) 2012-2013, The University of Sheffield. See the file COPYRIGHT.txt
 * in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 * This file is part of GATE (see http://gate.ac.uk/), and is free software,
 * licenced under the GNU Library General Public License, Version 2, June 1991
 * (in the distribution as file licence.html, and also available at
 * http://gate.ac.uk/gate/licence.html).
 *
 * Mark A. Greenwood, 01/08/2013
 */

package gate.corpora;

import gate.Corpus;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.creole.metadata.AutoInstance;
import gate.creole.metadata.CreoleResource;
import gate.gui.MainFrame;
import gate.gui.NameBearerHandle;
import gate.gui.ResourceHelper;
import gate.util.ExtensionFileFilter;
import info.bliki.wiki.dump.IArticleFilter;
import info.bliki.wiki.dump.Siteinfo;
import info.bliki.wiki.dump.WikiArticle;
import info.bliki.wiki.dump.WikiXMLParser;
import info.bliki.wiki.model.WikiModel;

import java.awt.event.ActionEvent;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import javax.swing.AbstractAction;
import javax.swing.Action;
import javax.swing.JFileChooser;

import org.xml.sax.SAXException;

@CreoleResource(name = "MediaWiki Corpus Populater", tool = true,
    autoinstances = @AutoInstance,
    comment = "Populate a corpus from a MediaWiki XML dump",
    helpURL = "http://gate.ac.uk/userguide/sec:creole:mediawiki")
public class MediaWikiPopulater extends ResourceHelper {

  private static final long serialVersionUID = -2505026690286047751L;

  /**
   * So that we don't end up with documents littered with unparsed "magic
   * words", we need a custom model that we can use to filter them out.
   */
  private final static WikiModel model = new WikiModel("${image}", "${title}") {
    @Override
    public String getRawWikiContent(String namespace, String articleName,
        Map<String, String> templateParameters) {
      String rawContent =
          super.getRawWikiContent(namespace, articleName, templateParameters);

      if(rawContent == null) {
        // if we return 'null' then the magic variables end up in the document
        // with full markup, which isn't what we want, so we return the empty
        // string instead to remove them entirely from the document
        return "";
      } else {
        return rawContent;
      }
    }
  };

  @SuppressWarnings("serial")
  @Override
  protected List<Action> buildActions(final NameBearerHandle handle) {
    List<Action> actions = new ArrayList<Action>();

    // this helper only makes sense for corpora
    if(!(handle.getTarget() instanceof Corpus)) return actions;

    actions.add(new AbstractAction("Populate from MediaWiki XML Dump") {
      @Override
      public void actionPerformed(ActionEvent e) {
        // configure the file chooser ready for use
        final JFileChooser filer = MainFrame.getFileChooser();
        filer.setFileSelectionMode(JFileChooser.FILES_ONLY);
        filer.setDialogTitle("Select a MediaWiki XML Dump File");
        filer.resetChoosableFileFilters();
        filer.setAcceptAllFileFilterUsed(false);
        ExtensionFileFilter filter =
            new ExtensionFileFilter("MediaWiki XML Dump Files (*.xml)", "xml");
        filer.addChoosableFileFilter(filter);
        filer.setFileFilter(filter);

        // if no file was selected then just stop
        if(filer.showOpenDialog(MainFrame.getInstance()) != JFileChooser.APPROVE_OPTION)
          return;

        // we want to run the population in a separate thread so we don't lock
        // up the GUI
        Thread thread =
            new Thread(Thread.currentThread().getThreadGroup(),
                "MediaWiki XML Dump Corpus Populater") {
              @Override
              public void run() {
                try {
                  populateCorpus((Corpus)handle.getTarget(), filer
                      .getSelectedFile().toURI().toURL());
                } catch(MalformedURLException e) {
                  // this really should not be possible so just dump the
                  // exception and quit
                  e.printStackTrace();
                }
              }
            };

        thread.setPriority(Thread.MIN_PRIORITY);
        thread.start();
      }
    });

    return actions;
  }

  /**
   * Populates the given corpus with one document per article in the specified
   * MediaWiki XML dump.
   *
   * @param corpus the corpus to populate
   * @param xml the URL of a MediaWiki XML dump file
   */
  public static void populateCorpus(final Corpus corpus, URL xml) {
    try {
      // get the model ready for parsing
      model.setUp();

      // the parser needs an InputStream so let's open one from the dump URL
      InputStream in = xml.openStream();

      // create a parser that turns each article in the dump into a document
      WikiXMLParser parser = new WikiXMLParser(in, new IArticleFilter() {

        @Override
        public void process(WikiArticle article, Siteinfo site)
            throws SAXException {
          try {
            // copy relevant metadata onto the document
            FeatureMap features = Factory.newFeatureMap();
            features.put("mediawiki.title", article.getTitle());
            features.put("mediawiki.timestamp", article.getTimeStamp());
            features.put("mediawiki.id", article.getId());
            features.put("mediawiki.revision", article.getRevisionId());
            features.put("mediawiki.sitename", site.getSitename());
            features.put("mediawiki.base", site.getBase());

            // the raw wikitext becomes the document content; the
            // text/x-mediawiki mime type ensures the markup is handled by the
            // matching document format when the document is created
            FeatureMap params = Factory.newFeatureMap();
            params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME,
                article.getText());
            params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME,
                "text/x-mediawiki");

            Document doc =
                (Document)Factory.createResource("gate.corpora.DocumentImpl",
                    params, features, article.getTitle());

            corpus.add(doc);

            if(corpus.getLRPersistenceId() != null) {
              // persistent corpus -> unload the document
              corpus.unloadDocument(doc);
              Factory.deleteResource(doc);
            }
          } catch(Exception e) {
            e.printStackTrace();
          }
        }
      });

      // now we are all set let's parse the MediaWiki XML file
      parser.parse();

      if(corpus.getDataStore() != null) {
        // if this corpus is in a datastore make sure we sync it back
        corpus.getDataStore().sync(corpus);
      }
    } catch(Exception e) {
      // oh dear, something went wrong and it's unlikely there is anything we
      // can do about it so let's just throw our hands in the air and pass the
      // responsibility up the stack and hope someone else will deal with it!
      throw new RuntimeException(e);
    } finally {
      // signal that, at least for now, we have finished with the model
      model.tearDown();
    }
  }
}
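
/*
 * Usage sketch: assuming GATE has been initialised (gate.Gate.init()) and
 * this plugin has been loaded, so that the text/x-mediawiki document format
 * is registered, populateCorpus can be driven programmatically as well as
 * from the GUI action above. The corpus name and dump file name here are
 * hypothetical:
 *
 *   Corpus corpus = Factory.newCorpus("wikipedia");
 *   MediaWikiPopulater.populateCorpus(corpus,
 *       new File("enwiki-dump.xml").toURI().toURL());
 */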