Χρήστης:Vanished user Xorisdtbdfgonugyfs/scripts/translations/simplexmlreader

simplexmlreader.py (basic xmlreader for wikimedia.org)

# -*- coding: utf-8  -*-
# 
"""
Each XmlEntry object represents a page, as read from an XML source

The XmlDump class reads a pages_current XML dump (like the ones offered on
http://dumps.wikimedia.org/wikipedia/de/) and offers a generator over
XmlEntry objects which can be used by other bots.

For fastest processing, XmlDump uses the cElementTree library 

Almost all code is copied from:
(C) Pywikipedia bot team, 2005-2010
__version__='$Id: xmlreader.py 9042 2011-03-13 10:14:47Z xqt $'
"""

#needed python module
try:
    from xml.etree.cElementTree import iterparse
except ImportError:
    # xml.etree.cElementTree was removed in Python 3.9; ElementTree offers
    # the same iterparse function.
    from xml.etree.ElementTree import iterparse

class XmlEntry:
    """
    Value holder for a single page.

    Attributes:
        title: the page title given to the constructor.
        ns: the namespace value given to the constructor.
        text: the page text given to the constructor.
        isredirect: the value passed as ``redirect``.
    """
    def __init__(self, title, ns, text, redirect):
        # The arguments are stored unchanged; only ``redirect`` is exposed
        # under a different attribute name (``isredirect``).
        self.title, self.ns = title, ns
        self.text, self.isredirect = text, redirect

class XmlHeaderEntry:
    """
    Value holder for a header entry.

    Every instance starts with empty unicode strings in ``sitename``,
    ``base``, ``generator`` and ``case`` and with its own empty
    ``namespaces`` dictionary.
    """
    def __init__(self):
        # All string fields share the same empty default.
        for field in ('sitename', 'base', 'generator', 'case'):
            setattr(self, field, u'')
        # A fresh dict per instance, never shared between entries.
        self.namespaces = dict()

class XmlDump(object):
    """
    Represents an XML dump file and offers access to its pages as
    XmlEntry objects via the parse() generator.

    The file is not touched at construction time; it is opened and parsed
    incrementally every time parse() is iterated.

    Can read bz2, gz, 7z or uncompressed xml dump files from wikimedia.org.
    The format is chosen from the file name extension; .7z files are
    decompressed by an external ``7za`` process.

    Yields one entry per <page>, built from the first <revision> child of
    that page (so dumps holding a single revision per page yield the latest
    one), and only:
    title, ns, text, redirect

    """
    def __init__(self, filename, allrevisions=False):
        """
        filename     -- path of the dump file to read
        allrevisions -- currently unused; one revision per page is yielded
        """
        self.filename = filename
        # Set up front so the helper methods never hit a missing attribute.
        self.uri = None
        self.root = None

    def _open_source(self):
        """
        Open the dump as a binary stream, decompressing when needed.

        Returns a (stream, process) pair. process is the 7za subprocess for
        .7z files and None otherwise. Raises OSError when the file cannot be
        opened or the 7za executable cannot be started.
        """
        if self.filename.endswith('.bz2'):
            import bz2
            return bz2.BZ2File(self.filename), None
        if self.filename.endswith('.gz'):
            import gzip
            return gzip.open(self.filename), None
        if self.filename.endswith('.7z'):
            import os
            import subprocess
            # Pass an argument list and no shell: file names with spaces or
            # shell metacharacters are handed to 7za verbatim instead of
            # being interpreted as shell syntax.
            devnull = open(os.devnull, 'wb')
            try:
                process = subprocess.Popen(
                    ['7za', 'e', '-bd', '-so', self.filename],
                    stdout=subprocess.PIPE,
                    stderr=devnull,
                    bufsize=65535)
            finally:
                # The child keeps its own copy of the handle.
                devnull.close()
            return process.stdout, process
        # assume it's an uncompressed XML file; binary mode lets the XML
        # parser deal with the encoding itself
        return open(self.filename, 'rb'), None

    def parse(self):
        """
        Generator using cElementTree iterparse function.

        Yields an XmlEntry for every <page> element of the dump. The
        underlying stream is closed (and the 7za process reaped) when the
        generator is exhausted, closed early, or fails.
        """
        source, process = self._open_source()
        # Reset the per-run state so a second parse() starts clean.
        self.uri = None
        self.root = None
        try:
            context = iterparse(source, events=("start", "end", "start-ns"))
            for event, elem in context:
                if event == "start-ns":
                    # elem is a (prefix, uri) pair here; only the default
                    # namespace (empty prefix) qualifies the dump's tags.
                    if elem[0] == "":
                        self.uri = elem[1]
                    continue
                if event == "start" and self.root is None:
                    # Remember the root so finished pages can be dropped
                    # from it, keeping memory usage flat.
                    self.root = elem
                    continue
                for rev in self.parse_only_latest(event, elem):
                    yield rev
        finally:
            source.close()
            if process is not None:
                process.wait()

    def parse_only_latest(self, event, elem):
        """Parser that yields only the latest revision"""
        if event == "end" and elem.tag == self._tag("page"):
            self._headers(elem)
            revision = elem.find(self._tag("revision"))
            yield self._create_revision(revision)
            # Free the page just handled and detach it from the root.
            elem.clear()
            if self.root is not None:
                self.root.clear()

    def _tag(self, name):
        """
        Return name qualified with the default namespace of the dump, or
        name unchanged when the dump declares no default namespace.
        """
        if self.uri:
            return "{%s}%s" % (self.uri, name)
        return name

    def _headers(self, elem):
        """Store title, ns and redirect flag of the page element elem."""
        self.title = elem.findtext(self._tag("title"))
        self.ns = elem.findtext(self._tag("ns"))
        # findtext returns None only when there is no <redirect> child.
        self.isredirect = elem.findtext(self._tag("redirect")) is not None

    def _create_revision(self, revision):
        """
        Creates a Single revision

        revision is a <revision> element, or None when the page has none;
        a missing revision or missing text gives an entry with empty text.
        """
        # could get comment, minor as well
        if revision is None:
            text = None
        else:
            text = revision.findtext(self._tag("text"))
        return XmlEntry(title=self.title,
                        ns=self.ns,
                        text=text or u'',
                        redirect=self.isredirect
                       )