Χρήστης:Vanished user Xorisdtbdfgonugyfs/scripts/translations/simplexmlreader
simplexmlreader.py (basic xmlreader for wikimedia.org)
# -*- coding: utf-8 -*-
#
"""
Each XmlEntry object represents a page, as read from an XML source

The XmlDump class reads a pages_current XML dump (like the ones offered on
http://dumps.wikimedia.org/wikipedia/de/) and offers a generator over
XmlEntry objects which can be used by other bots.

For fastest processing, XmlDump uses the cElementTree library when it can be
imported and falls back to xml.etree.ElementTree otherwise.

Almost all code is copied from:
(C) Pywikipedia bot team, 2005-2010
__version__='$Id: xmlreader.py 9042 2011-03-13 10:14:47Z xqt $'
"""

# needed python module
try:
    from xml.etree.cElementTree import iterparse
except ImportError:
    # xml.etree.cElementTree no longer exists on Python 3.9+.
    from xml.etree.ElementTree import iterparse


class XmlEntry:
    """
    Represents a page.

    Attributes:
        title: page title as read from the <title> element.
        ns: namespace number as read from the <ns> element (a string, or
            None when the element is missing).
        text: wikitext of the revision (empty string when there is none).
        isredirect: True when the page carries a <redirect> element.
    """

    def __init__(self, title, ns, text, redirect):
        self.title = title
        self.ns = ns
        self.text = text
        self.isredirect = redirect


class XmlHeaderEntry:
    """
    Represents a header entry
    """

    def __init__(self):
        self.sitename = u''
        self.base = u''
        self.generator = u''
        self.case = u''
        self.namespaces = {}


class XmlDump(object):
    """
    Represents an XML dump file.

    Reads the local file lazily, parses it, and offers access to the
    resulting XmlEntries via the generator returned by parse().

    Can read bz2, gz, 7z or unzipped xml dump file from wikimedia.org
    Yields only latest revision and only: title, ns, text, redirect
    """

    def __init__(self, filename, allrevisions=False):
        """Remember which dump file to read.

        Args:
            filename: path of the dump; the suffix (.bz2, .gz, .7z) selects
                the decompressor, anything else is read as plain XML.
            allrevisions: accepted so existing callers keep working, but
                ignored; only one revision per page is ever yielded.
        """
        self.filename = filename
        # Both are (re)set by parse(); defined here so that helper methods
        # never hit a missing attribute.
        self.root = None
        self.uri = None

    def _open_source(self):
        """Open the dump for binary reading.

        Returns:
            A (stream, process) pair. ``process`` is the 7za child process
            for ``.7z`` dumps and None for every other kind of file.

        Raises:
            IOError/OSError: when the file cannot be opened or 7za cannot
                be started.
        """
        if self.filename.endswith('.bz2'):
            import bz2
            return bz2.BZ2File(self.filename), None
        if self.filename.endswith('.gz'):
            import gzip
            return gzip.open(self.filename), None
        if self.filename.endswith('.7z'):
            import os
            import subprocess
            # An argument list (no shell) keeps filenames containing spaces
            # or shell metacharacters from being interpreted by a shell.
            devnull = open(os.devnull, 'wb')
            try:
                process = subprocess.Popen(
                    ['7za', 'e', '-bd', '-so', self.filename],
                    stdout=subprocess.PIPE,
                    stderr=devnull,
                    bufsize=65535)
            finally:
                # The child holds its own copy of the descriptor.
                devnull.close()
            return process.stdout, process
        # assume it's an uncompressed XML file; binary mode lets the XML
        # parser honour the encoding declared in the document itself
        return open(self.filename, 'rb'), None

    def _tag(self, name):
        """Return ``name`` qualified with the dump's default namespace.

        When the document declared no default namespace the bare name is
        returned, so such documents can be read as well.
        """
        if self.uri:
            return "{%s}%s" % (self.uri, name)
        return name

    def parse(self):
        """Generator using cElementTree iterparse function

        Yields:
            One XmlEntry per <page> element of the dump.

        The underlying file (and the 7za process, if any) is released when
        the generator is exhausted, closed or garbage collected.
        """
        source, process = self._open_source()
        try:
            context = iterparse(source, events=("start", "end", "start-ns"))
            self.root = None
            self.uri = None
            for event, elem in context:
                if event == "start-ns":
                    # elem is a (prefix, uri) pair here; an empty prefix
                    # marks the default namespace used by all dump tags.
                    prefix, uri = elem
                    if not prefix:
                        self.uri = uri
                    continue
                if event == "start" and self.root is None:
                    # First element opened is the document root.
                    self.root = elem
                    continue
                for rev in self.parse_only_latest(event, elem):
                    yield rev
        finally:
            source.close()
            if process is not None:
                process.wait()

    def parse_only_latest(self, event, elem):
        """Parser that yields only the latest revision

        Yields an XmlEntry when ``event``/``elem`` mark the end of a <page>
        element and nothing for any other event.
        """
        if event == "end" and elem.tag == self._tag("page"):
            self._headers(elem)
            revision = elem.find(self._tag("revision"))
            yield self._create_revision(revision)
            # Drop the processed subtree so that already-read pages are not
            # kept referenced by the tree.
            elem.clear()
            self.root.clear()

    def _headers(self, elem):
        """Store title, namespace and redirect flag of the page ``elem``."""
        self.title = elem.findtext(self._tag("title"))
        self.ns = elem.findtext(self._tag("ns"))
        # findtext() gives None only when there is no <redirect> child.
        self.isredirect = elem.findtext(self._tag("redirect")) is not None

    def _create_revision(self, revision):
        """Creates a Single revision

        Combines the page data stored by _headers() with the text of the
        given <revision> element into an XmlEntry.
        """
        # could get comment, minor as well
        text = revision.findtext(self._tag("text"))
        return XmlEntry(title=self.title,
                        ns=self.ns,
                        text=text or u'',
                        redirect=self.isredirect)