Χρήστης:Vanished user Xorisdtbdfgonugyfs/scripts/simplexmlreader.py
(Ανακατεύθυνση από Χρήστης:Xoristzatziki/scripts/simplexmlreader.py)
A simpler and faster reader.
- Can be used instead of xmlreader if only ns, title and text are required from an XML dump and repeated queries are made against the same XML dump.
- Creates an index file with the byte positions of each <title> tag. Needs *nix since it uses grep.
- Downloads a fresh "pages-meta-current" dump if needed.
#!/usr/bin/python
# -*- coding: utf-8 -*-
# name: simplexmlreader.py
# v.0104
'''Opens a wiki XML dump for reading.  Downloads it if necessary.

Works only on: pages-meta-current dumps.

Requires:
    1. Language of wiki project.
    2. Which sister project to work on.
    3. Path where dumps may exist.
Optional:
    1. Check for newer dump.

Works for: wikipedia, wikiversity, wikinews, wikisource, wikiquote,
wikibooks, wiktionary.

Needs *nix: it shells out to ``grep`` (title indexing) and ``bunzip2``
(dump extraction).

Fixes relative to the original script:
    * ported Python 2-only syntax (print statements, ``urllib2``,
      ``long``) to Python 3 so the script runs on a modern interpreter;
    * added the missing ``import sys`` (was a NameError in the error
      path of ``_getandextractadump``);
    * file handles are closed via context managers (they leaked before);
    * the bare ``except:`` around ``os.remove`` now catches OSError only;
    * ``self._ISOK`` is no longer unconditionally reset to True, so a
      failed download is reported;
    * the online date check no longer crashes when the backup index is
      unreachable (``int('')`` ValueError);
    * ``parse()`` now yields the LAST article with its own ns/title/text
      (it used to re-emit stale values from the previous page).
'''
import glob
import os
import subprocess
import sys
import tarfile  # kept from the original file; not used directly here
import urllib.request

basebackupurl = 'http://dumps.wikimedia.org/'
kindofdump = u'-pages-meta-current.xml'


# defs for files to be used====================================================
def titlesfilename(xmlfilename):
    '''Return the name of the grep-built "titles" index for *xmlfilename*.'''
    return xmlfilename + '.titles'


def listofdumpsfiles(dumppath, wikilang, wikikind):
    '''Return all dumps in *dumppath* named <lang><kind>-YYYYMMDD<kind-of-dump>.'''
    return glob.glob(dumppath + wikilang + wikikind + '-'
                     + '[0-9]' * 8 + kindofdump)


def extractrawdate(xmlfilename):
    '''Extract the raw date string (YYYYMMDD) from a dump's filename.

    Returns '0' for an empty filename so callers can compare with int().
    '''
    if not xmlfilename:
        return '0'
    tmpfordate = xmlfilename[:-len(kindofdump)]
    return tmpfordate[tmpfordate.rfind('-') + 1:]


def _writetitlesindex(xmlfilename, ftitles):
    '''Write "byteoffset:<title>...</title>" lines into *ftitles* via grep -b.'''
    with open(ftitles, 'w') as f:
        subprocess.call(['grep', '-b', '-E', '<title>.+</title>',
                         xmlfilename], stdout=f)


def createnewertitlesfile(xmlfilename):
    '''Ensure an up-to-date "titles" index exists for *xmlfilename*.

    The index is (re)built when it is missing or older than the dump.
    Returns False when the dump itself does not exist, True otherwise.
    '''
    ftitles = titlesfilename(xmlfilename)
    if not os.path.exists(xmlfilename):
        print('no source file...')
        return False
    if os.path.exists(ftitles):
        print('"titles" file exist. Checking dates...')
        if os.path.getmtime(xmlfilename) > os.path.getmtime(ftitles):
            print('"titles" file is old. Creating new...')
            _writetitlesindex(xmlfilename, ftitles)
        else:
            print('"titles" file is ok.')
        return True
    _writetitlesindex(xmlfilename, ftitles)
    print('"titles" file created...')
    return True


class LatestDumpProps(object):
    '''Properties of the latest complete dump listed on the backup index.'''

    def __init__(self):
        self.url = u''             # full download URL of the .bz2 dump
        self.zippedfilename = u''  # bare .bz2 filename
        self.rawdate = u''         # YYYYMMDD string of the dump
        self.nonefound = True      # stays True when no complete dump found


# generator====================================================================
class XmlEntry:
    """Represents a reduced page.

    We do not check for redirects, but the attribute exists for
    compatibility with pywikibot's xmlreader entries.
    """

    def __init__(self, title, ns, text, redirect):
        self.title = title
        self.ns = ns
        self.text = text
        self.isredirect = redirect


class WikiDump(object):
    '''A local pages-meta-current dump plus its grep-built titles index.'''

    def __init__(self, wikilang, wikikind, pathtoxmldumps,
                 checknewer=False, getlatestanyway=False):
        '''Constructor.

        First checks for existence of a dump file; downloads the latest
        online one when asked to (or when none exists locally).
        ``self._ISOK`` holds "all OK" (dump existed or was downloaded,
        and the titles index could be built).

        wikilang        -- ISO code of the wiki language, e.g. u'el'
        wikikind        -- sister project; u'wikipedia' maps to 'wiki'
        pathtoxmldumps  -- directory where dumps live (trailing slash)
        checknewer      -- when online, check for (and fetch) a newer dump
        getlatestanyway -- force downloading the latest dump
        '''
        self.wikilang = wikilang
        self.dumppath = pathtoxmldumps
        # dumps of the flagship project are named "<lang>wiki-..."
        if wikikind == u'wikipedia':
            self.wikikind = u'wiki'
        else:
            self.wikikind = wikikind
        self._ISOK = True
        # find if we already have an xml dump on disk
        _mydumpfiles = listofdumpsfiles(self.dumppath, self.wikilang,
                                        self.wikikind)
        if _mydumpfiles:
            # if many found, work with the newest (names sort by date)
            self.dumpfilenamepath = sorted(_mydumpfiles)[-1]
        else:
            self.dumpfilenamepath = u''
        if getlatestanyway:
            # user is online and wants the latest for some reason
            # (e.g. the local copy is damaged)
            self._ISOK = self._getandextractadump()
        elif _mydumpfiles:
            if checknewer:
                # user online, wants to check for a newer dump
                print('checking for newer....')
                _existingrawdate = extractrawdate(self.dumpfilenamepath)
                # rawdate is u'' when the index is unreachable; treat as 0
                _latestrawdate = self.urloflatestdump().rawdate or '0'
                if int(_latestrawdate) > int(_existingrawdate):
                    print('downloading newer...')
                    self._ISOK = self._getandextractadump()
                else:
                    print('we have the latest...')
            else:
                print('working with latest existing...')
        else:
            # none exists here: get the latest dump; this also sets
            # self.dumpfilenamepath
            self._ISOK = self._getandextractadump()
        self.titlesfilename = titlesfilename(self.dumpfilenamepath)
        # always (re)build the index, but do not let a successful index
        # build mask an earlier download failure
        _titlesok = createnewertitlesfile(self.dumpfilenamepath)
        self._ISOK = self._ISOK and _titlesok
        print('using... ', self.dumpfilenamepath)

    def workingdump_filenames(self):
        '''Return (dump file path, titles index path).'''
        return self.dumpfilenamepath, self.titlesfilename

    def urloflatestdump(self):
        '''Return a LatestDumpProps for the newest complete dump online.

        ``nonefound`` stays True when the backup index is unreachable
        or the dump for this wiki is still in progress.
        '''
        _wikitofind = self.wikilang + self.wikikind
        _ldp = LatestDumpProps()
        try:
            _response = urllib.request.urlopen(basebackupurl
                                               + 'backup-index.html')
            lines = _response.read().decode('utf-8', errors='replace')
            _response.close()
            for line in lines.split('\n'):
                if _wikitofind in line:
                    # only a finished dump is usable
                    if '>Dump complete<' in line:
                        _backupsubdir = line.split('a href="')[1].split('">')[0]
                        # subdir looks like "<wiki>/<YYYYMMDD>"
                        _ldp.rawdate = _backupsubdir.split('/')[1]
                        _ldp.zippedfilename = (_wikitofind + '-' + _ldp.rawdate
                                               + kindofdump + '.bz2')
                        _ldp.url = (basebackupurl + _backupsubdir + '/'
                                    + _ldp.zippedfilename)
                        _ldp.nonefound = False
                    # first matching line decides, complete or not
                    return _ldp
        except IOError:
            pass
        return _ldp

    def _getandextractadump(self):
        '''Get and extract the latest dump.

        Warning: overwrites any existing files — back them up if you
        want them.  Can be used to force getting the latest dump.
        Returns True on success, False otherwise.
        '''
        _ldp = self.urloflatestdump()
        if _ldp.nonefound:
            return False
        # file names and paths of the .bz2 and the extracted .xml
        self.zFilenamePath = self.dumppath + _ldp.zippedfilename
        self.dumpfilenamepath = self.zFilenamePath[:-4]  # strip '.bz2'
        # remove any stale .bz2 first
        if os.path.exists(self.zFilenamePath):
            try:
                os.remove(self.zFilenamePath)
            except OSError:
                print("Exception: ", str(sys.exc_info()))
                return False
        try:
            # fetch the .bz2 dump from the internet
            with open(self.zFilenamePath, 'wb') as zFile:
                zFile.write(urllib.request.urlopen(_ldp.url).read())
            print('Got bz2 file:', self.zFilenamePath)
            # bunzip2 -f extracts in place and deletes the source .bz2
            with open(self.dumpfilenamepath, 'w') as funziped:
                subprocess.call(['bunzip2', '-f', self.zFilenamePath],
                                stdout=funziped)
            print('Extracted:', self.zFilenamePath)
            return True
        except IOError:
            return False

    def parse(self):
        '''Yield XmlEntry objects from self.dumpfilenamepath.

        Walks the grep-built titles index ("offset:<title>...</title>"
        lines) and slices the dump between consecutive offsets.  Does
        not unescape '&gt;' etc. nor convert further, since the article
        text may not be used at all (e.g. when only ns '10' matters).
        Pages with an empty ``<text ... />`` element are skipped.
        '''
        start = 0
        title = u''
        with open(self.dumpfilenamepath, 'r') as fxml, \
                open(self.titlesfilename, 'r') as ftitles:
            for titlesline in ftitles:
                if start > 0:
                    # read everything between the previous title and this one
                    fxml.seek(start)
                    newstart = int(titlesline[:titlesline.find(':')])
                    newtitle = titlesline.split('</title>')[0].split('<title>')[1]
                    g = fxml.read(newstart - start)
                    if '<text xml:space="preserve" />' in g:
                        # empty page: advance, do not yield
                        title = newtitle
                        start = newstart
                    else:
                        textsplited = g.split('<text xml:space="preserve">')
                        entryns = textsplited[0].split('<ns>')[1].split('</ns>')[0]
                        entrytitle = title
                        entrytext = textsplited[1].split('</text>')[0]
                        title = newtitle
                        start = newstart
                        yield XmlEntry(ns=entryns, title=entrytitle,
                                       text=entrytext, redirect='')
                else:
                    # first line of the titles index
                    start = int(titlesline[:titlesline.find(':')])
                    title = titlesline.split('</title>')[0].split('<title>')[1]
            # last page: read from the final offset to end of file
            # (the original re-yielded stale data here — fixed)
            fxml.seek(start)
            g = fxml.read()
        if ('<text xml:space="preserve">' in g
                and '<text xml:space="preserve" />' not in g):
            textsplited = g.split('<text xml:space="preserve">')
            entryns = textsplited[0].split('<ns>')[1].split('</ns>')[0]
            entrytext = textsplited[1].split('</text>')[0]
            yield XmlEntry(ns=entryns, title=title,
                           text=entrytext, redirect='')


'''test'''


def main():
    '''Test for a dump and create a "titles" file.

    Downloads a newer dump when the commented-out variant is used.
    '''
    # Download a newer dump if any exists:
    # b = WikiDump(u'el', u'wiktionary', u'/home/wiki/dumps/', True)
    # print(b._ISOK)
    # just check if one exists and create a "titles" file:
    b = WikiDump(u'el', u'wikipedia', u'/media/FORMANY/wiki/dumps/')


if __name__ == "__main__":
    main()