Χρήστης:Vanished user Xorisdtbdfgonugyfs/scripts/simplexmlreader.py

A simpler and faster reader.

  • Can be used instead of xmlreader when only ns, title and text are required from an xml dump and repeated queries are run against the same xml dump.
  • Creates a file with positions of <title>. Needs *nix since it uses grep.
  • Downloads a fresh "pages-meta-current" dump if needed.
#!/usr/bin/python
# -*- coding: utf-8  -*-
#name: simplexmlreader.py
#v.0104


'''
Opens a dump for reading.
Downloads it if necessary.
Works only on: pages-meta-current
Requires:
1. Language of wiki project.
2. Which sister project to work on.
3. Path where dumps may exist.

Optional:
1. Check for newer dump.

Works for:
wikipedia
wikiversity
wikinews
wikisource
wikiquote
wikibooks
wiktionary

'''
import glob
import urllib2
import tarfile
import os
import subprocess

# Root URL of the dump server; the index page ('backup-index.html') and the
# per-wiki dump directories are addressed relative to it.
basebackupurl = 'http://dumps.wikimedia.org/'
# Filename suffix of an unzipped dump; used to build and to strip dump names.
kindofdump = u'-pages-meta-current.xml'

#defs for files to be used=====================================================
def titlesfilename(xmlfilename):
    '''Returns the name of the "titles" file that belongs to a dump.

        It is the dump's own filename with '.titles' appended.
    '''
    suffix = '.titles'
    return xmlfilename + suffix

def listofdumpsfiles(dumppath, wikilang, wikikind):
    '''Returns the dump files found in "dumppath" for one wiki.

        A file matches when it is named
        <wikilang><wikikind>-<8 digits><kindofdump>.
        "dumppath" is used as a plain prefix (no separator is added).
    '''
    eightdigits = '[0-9]' * 8
    prefix = dumppath + wikilang + wikikind
    pattern = prefix + '-' + eightdigits + kindofdump
    return glob.glob(pattern)

def extractrawdate(xmlfilename):
    '''Returns the "rawdate" part of a dump's filename.

        The "kindofdump" suffix is cut off and whatever follows the last '-'
        of the remainder is returned.
        An empty filename gives '0'.
    '''
    if not len(xmlfilename):
        return '0'
    stem = xmlfilename[:-len(kindofdump)]
    lastdash = stem.rfind('-')
    return stem[lastdash + 1:]

def _writetitlesfile(xmlfilename, ftitles):
    '''Writes every "<title>" line of the dump, with its byte offset, to ftitles.'''
    # "grep -b" prefixes each matching line with "<byte offset>:".
    # The "with" block makes sure the output file is flushed and closed
    # as soon as grep has finished.
    with open(ftitles, 'w') as f:
        subprocess.call(['grep', '-b', '-E', '<title>.+</title>',  xmlfilename], stdout=f)


def createnewertitlesfile(xmlfilename):
    '''Makes sure an up to date "titles" file exists for a dump.

        The "titles" file is (re)created with grep when it is missing or
        when it is older than the dump (compared by modification time).

        Returns False if the dump itself does not exist, True otherwise.
    '''
    if not os.path.exists(xmlfilename):
        print('no source file...')
        return False
    ftitles = titlesfilename(xmlfilename)
    if not os.path.exists(ftitles):
        _writetitlesfile(xmlfilename, ftitles)
        print('"titles" file created...')
        return True
    print('"titles" file exist. Checking dates...')
    if os.path.getmtime(xmlfilename) > os.path.getmtime(ftitles):
        print('"titles" file is old. Creating new...')
        _writetitlesfile(xmlfilename, ftitles)
    else:
        print('"titles" file is ok.')
    return True

class LatestDumpProps():
    '''Plain record describing the latest dump found on the server.

        url            -- full url of the zipped dump
        zippedfilename -- filename of the zipped dump
        rawdate        -- date of the dump as a string
        nonefound      -- True until a dump has been found
    '''
    def __init__(self):
        #nothing found yet: flag is set and all strings are empty
        self.nonefound = True
        self.url = self.zippedfilename = self.rawdate = u''

#generator=====================================================================
class XmlEntry:
    """
    A page reduced to its title, namespace and text.

        "isredirect" just stores the "redirect" argument; nothing here
        inspects it (it exists for compatibility).
    """
    def __init__(self, title, ns, text, redirect):
        self.title, self.ns, self.text = title, ns, text
        self.isredirect = redirect
    
    
class WikiDump():
    '''A "pages-meta-current" dump of one wiki, ready for reading.

        Finds the dump in a local directory (downloading it when needed),
        keeps its "titles" file up to date and yields its pages
        through parse().
    '''
    def __init__(self, wikilang, wikikind, pathtoxmldumps, checknewer = False, getlatestanyway = False):
        '''Constructor.

            wikilang        -- language code of the wiki (ex. u'el')
            wikikind        -- sister project (u'wikipedia' is mapped to u'wiki')
            pathtoxmldumps  -- path where dumps may exist; used as a plain
                               prefix, so it must end with a path separator
            checknewer      -- look online for a newer dump and download it
            getlatestanyway -- download the latest dump even if one exists

            If no local dump exists the latest one is downloaded.

            self._ISOK ends up as the result of createnewertitlesfile(),
            so it is True only when the working dump exists on disk.
        '''
        self.wikilang = wikilang
        self.dumppath = pathtoxmldumps
        if wikikind == u'wikipedia':
            self.wikikind = u'wiki'
        else:
            self.wikikind = wikikind
        _mydumpfiles = listofdumpsfiles(self.dumppath, self.wikilang, self.wikikind)
        if _mydumpfiles:
            #if many found, the name that sorts last has the latest date
            self.dumpfilenamepath = max(_mydumpfiles)
        else:
            self.dumpfilenamepath = u''
        if getlatestanyway or not _mydumpfiles:
            #forced by the user, or nothing to work with.
            #On success the new name is saved in "self.dumpfilenamepath".
            self._getandextractadump()
        elif checknewer:
            print('checking for newer....')
            _latest = self.urloflatestdump()
            if _latest.nonefound:
                #index unreachable or dump still in progress: there is no
                #date to compare with, so keep the dump we already have
                print('working with latest existing...')
            elif int(_latest.rawdate) > int(extractrawdate(self.dumpfilenamepath)):
                print('downloading newer...')
                self._getandextractadump()
            else:
                print('we have the latest...')
        else:
            print('working with latest existing...')
        self.titlesfilename = titlesfilename(self.dumpfilenamepath)
        #False when the working dump does not exist (ex. download failed)
        self._ISOK = createnewertitlesfile(self.dumpfilenamepath)
        print('using...  ' + self.dumpfilenamepath)

    def workingdump_filenames(self):
        '''Returns (dump filename, "titles" filename) currently in use.'''
        return self.dumpfilenamepath, self.titlesfilename

    def urloflatestdump(self):
        '''Returns url, filename and string-of-date (if not in progress)

            Always returns a LatestDumpProps. Its "nonefound" stays True
            when the index can not be read, when the wiki is not listed
            or when its dump is not marked as complete.
        '''
        _wikitofind = self.wikilang + self.wikikind
        _ldp = LatestDumpProps()
        #Match the link target, not just the name: a bare substring test
        #would let ex. 'elwiki' match the line of 'elwikibooks'.
        _marker = 'a href="' + _wikitofind + '/'
        try:
            _response = urllib2.urlopen(basebackupurl + 'backup-index.html')
            try:
                _page = _response.read()
            finally:
                _response.close()
        except IOError:
            return _ldp
        for line in _page.split('\n'):
            if _marker not in line:
                continue
            if '>Dump complete<' in line:
                #the link target is "<wiki>/<rawdate>"
                _target = line.split(_marker, 1)[1].split('"', 1)[0]
                _ldp.rawdate = _target.split('/')[0]
                _backupsubdir = _wikitofind + '/' + _ldp.rawdate
                _ldp.zippedfilename = _wikitofind + '-' + _ldp.rawdate + kindofdump + '.bz2'
                _ldp.url = basebackupurl + _backupsubdir + '/' + _ldp.zippedfilename
                _ldp.nonefound = False
            #first line of our wiki decides, complete or not
            return _ldp
        return _ldp

    def _getandextractadump(self):
        '''Gets and extracts latest dump.

            Warning: Overwrites any existing files.
            Backup if you want them.
            Can be used to force getting latest dump.

            Sets self.zFilenamePath and self.dumpfilenamepath as soon as a
            complete dump is found online.
            Returns True when the dump was downloaded and extracted,
            False otherwise.
        '''
        _ldp = self.urloflatestdump()
        if _ldp.nonefound:
            return False
        #create file names and path of bz2 and xml files
        self.zFilenamePath = self.dumppath + _ldp.zippedfilename
        self.dumpfilenamepath = self.zFilenamePath[:-4] #without '.bz2'
        #remove old bz2
        if os.path.exists(self.zFilenamePath):
            try:
                os.remove(self.zFilenamePath)
            except OSError as err:
                print('Exception:  ' + str(err))
                return False
        try:
            _remote = urllib2.urlopen(_ldp.url)
            try:
                #binary mode, and in pieces: a dump does not have to fit
                #in memory
                with open(self.zFilenamePath, 'wb') as zFile:
                    while True:
                        _piece = _remote.read(1024 * 1024)
                        if not _piece:
                            break
                        zFile.write(_piece)
            finally:
                _remote.close()
        except IOError:
            return False
        print('Got bz2 file: ' + self.zFilenamePath)
        #bunzip2 writes the xml next to the bz2 ('-f' overwrites an old one)
        #and deletes the source (.bz2 file)
        if subprocess.call(['bunzip2', '-f', self.zFilenamePath]) != 0:
            return False
        print('Extracted: ' + self.zFilenamePath)
        return True

    def parse(self):
        '''Yields articles from self.dumpfilenamepath

            Each line of the "titles" file gives the offset of a <title>
            line in the dump; the xml between two consecutive offsets is
            one page. The last page runs to the end of the dump.

            Pages with an empty text element are not yielded.

            Does not replaces '>' etc. nor converts to utf-8
            since article may not be used
            (ex. if only ns '10' is needed).
        '''
        _emptytext = '<text xml:space="preserve" />'
        _opentext = '<text xml:space="preserve">'

        def _entry(title, chunk):
            #builds the entry of one page; None if the page has no text
            if _emptytext in chunk:
                return None
            textsplited = chunk.split(_opentext)
            return XmlEntry( ns = textsplited[0].split('<ns>')[1].split('</ns>')[0],
                    title = title,
                    text = textsplited[1].split('</text>')[0],
                    redirect = ''
                    )

        #"with" closes both files even if the caller stops iterating early
        with open(self.dumpfilenamepath , 'r') as fxml:
            with open(self.titlesfilename, 'r') as ftitles:
                start = None #offset of the page waiting to be read
                title = u''
                for titlesline in ftitles:
                    #a line looks like "<offset>:    <title>...</title>"
                    newstart = int(titlesline[:titlesline.find(':')])
                    newtitle = titlesline.split('</title>')[0].split('<title>')[1]
                    if start is not None:
                        fxml.seek(start)
                        entry = _entry(title, fxml.read(newstart - start))
                        if entry is not None:
                            yield entry
                    start = newstart
                    title = newtitle
                if start is not None:
                    #last page: no next title, so read up to the end
                    fxml.seek(start)
                    entry = _entry(title, fxml.read())
                    if entry is not None:
                        yield entry


'''test'''
def main():
    '''Manual test: opens the Greek wikipedia dump from a fixed local path.

        Building the WikiDump does all the work (finding the dump and
        creating its "titles" file).
    '''
    #To also look online for a newer dump, pass True as a fourth
    #argument (checknewer); the outcome is then found in "_ISOK".
    dump = WikiDump(u'el', u'wikipedia', u'/media/FORMANY/wiki/dumps/')
    
    
#run the manual test only when executed as a script, not when imported
if __name__=="__main__":
    main()