Χρήστης:Vanished user Xorisdtbdfgonugyfs/scripts/simplexmlreader.py

A simpler and faster reader.
Can be used instead of xmlreader when only ns, title and text are required from an xml dump and repeated queries are made against the same xml dump.
Creates a file with positions of <title>. Needs *nix since it uses grep.
Downloads a fresh "pages-meta-current" dump if needed.
#!/usr/bin/python
# -*- coding: utf-8  -*-
#name: simplexmlreader.py
#v.0104


'''
Opens a dump for reading.
Downloads it if necessary.
Works only on: pages-meta-current
Requires:
1. Language of wiki project.
2. Which sister project to work on.
3. Path where dumps may exist.

Optional:
1. Check for newer dump.

Works for:
wikipedia
wikiversity
wikinews
wikisource
wikiquote
wikibooks
wiktionary

'''
import glob
import urllib2
import tarfile
import os
import subprocess

# Root url of the dumps site; the address of 'backup-index.html' and of the
# dump files themselves are built by appending to it.
basebackupurl = 'http://dumps.wikimedia.org/'
# Tail of the filename of an unpacked dump (what follows the 8 digit date).
# The packed file offered for download has an extra '.bz2' after it.
kindofdump = u'-pages-meta-current.xml'

#defs for files to be used=====================================================
def titlesfilename(xmlfilename):
    '''Returns the name of the "titles" file that belongs to a dump.

        It is the dump's filename with '.titles' appended.
    '''
    suffix = '.titles'
    return xmlfilename + suffix

def listofdumpsfiles(dumppath, wikilang, wikikind):
    '''Returns the list of local dump files of one wiki.

        Looks for names of the form
        <dumppath><wikilang><wikikind>-<8 digits><kindofdump>.
        dumppath is used as a plain prefix (no separator is added).
    '''
    eightdigits = '[0-9]' * 8
    pattern = dumppath + wikilang + wikikind + '-' + eightdigits + kindofdump
    return glob.glob(pattern)

def extractrawdate(xmlfilename):
    '''Extracts "rawdate" from a dump's filename.

        The "rawdate" is whatever lies between the last '-' and the
        trailing kindofdump part of the name.
        An empty filename gives '0'.
    '''
    if len(xmlfilename) == 0:
        return '0'
    # drop the trailing "-pages-meta-current.xml" part first
    stem = xmlfilename[:-len(kindofdump)]
    # what follows the last '-' (the whole stem if there is no '-')
    return stem.rsplit('-', 1)[-1]

def createnewertitlesfile(xmlfilename):
    '''Makes sure an up to date "titles" file exists for a dump.

        The "titles" file is the output of "grep -b" for the <title> lines
        of the dump, so every record starts with "byteoffset:".
        It is (re)created when it is missing or older than the dump.

        Returns True when a "titles" file is in place (already ok or just
        created).
        Returns False if no source exists (aka the xml dump) or if grep
        failed; in the latter case the partial "titles" file is removed so
        that it is not mistaken for a good one on the next run.
        Raises OSError if grep can not be started at all.
    '''
    # print(...) with one string argument behaves the same as the print
    # statement used in the rest of this (python 2) file.
    if not os.path.exists(xmlfilename):
        print('no source file...')
        return False
    ftitles = titlesfilename(xmlfilename)
    existed = os.path.exists(ftitles)
    if existed:
        print('"titles" file exist. Checking dates...')
        if os.path.getmtime(xmlfilename) <= os.path.getmtime(ftitles):
            print('"titles" file is ok.')
            return True
        print('"titles" file is old. Creating new...')
    # using grep; "with" closes the output file even if the call raises
    with open(ftitles, 'w') as f:
        status = subprocess.call(['grep', '-b', '-E', '<title>.+</title>', xmlfilename], stdout=f)
    # grep: 0 = lines found, 1 = nothing found, anything else = error
    if status not in (0, 1):
        print('grep failed. "titles" file not created...')
        try:
            os.remove(ftitles)
        except OSError:
            pass #best effort cleanup, we report the failure anyway
        return False
    if not existed:
        print('"titles" file created...')
    return True

class LatestDumpProps():
    '''Plain record with the properties of the latest dump found online.

        url            -- address of the packed (bz2) dump
        zippedfilename -- filename of the packed (bz2) dump
        rawdate        -- date part of the dump's subdirectory
        nonefound      -- stays True until a dump has been found
    '''
    def __init__(self):
        self.nonefound = True
        self.url = self.zippedfilename = self.rawdate = u''

#generator=====================================================================
class XmlEntry:
    """
    A reduced page: only title, namespace and text are kept.
    
        The redirect value is stored as given (it is never checked here);
        the attribute exists for compatibility.
    """
    def __init__(self, title, ns, text, redirect):
        self.title, self.ns, self.text = title, ns, text
        self.isredirect = redirect
    
class WikiDump():
    '''A local "pages-meta-current" xml dump of one wiki.

        Locates the dump (downloading it if asked or if none is here),
        keeps its "titles" file up to date and yields its pages
        through parse().

        Note: print(...) is always called with one string argument, which
        behaves the same as the print statement under python 2.
    '''
    def __init__(self, wikilang, wikikind, pathtoxmldumps, checknewer = False, getlatestanyway = False):
        '''Constructor.

            wikilang        -- iso code of wiki language (ex. u'el')
            wikikind        -- sister project (u'wikipedia', u'wiktionary'...)
            pathtoxmldumps  -- path where dumps may exist; used as a plain
                               prefix, so it must end with a path separator
            checknewer      -- if a dump exists here, check online for a
                               newer one and download it
            getlatestanyway -- download the latest dump even if one exists
                               here (ex. if the one here is damaged)

            First checks for existence of dump file.
            Downloads the latest if exists (and not in progress).
            If a download fails, the latest dump found here (if any)
            is used.

            self._ISOK holds "if all OK": a dump file and its "titles" file
            are in place.
        '''
        #iso code of wiki language
        self.wikilang = wikilang
        #path to wiki xml dump
        self.dumppath = pathtoxmldumps
        #sister project to work on
        if wikikind == u'wikipedia':
            self.wikikind = u'wiki'
        else:
            self.wikikind = wikikind
        #find if we have an xml dump
        _mydumpfiles = listofdumpsfiles(self.dumppath, self.wikilang, self.wikikind)
        if _mydumpfiles:
            #if many found use the latest (names differ only in the date)
            self.dumpfilenamepath = max(_mydumpfiles)
        else:
            self.dumpfilenamepath = u''
        if getlatestanyway or not _mydumpfiles:
            #user wants the latest for some reason, or none exists here.
            #On success the new name is saved in "self.dumpfilenamepath"
            if not self._getandextractadump():
                print('could not get the latest dump...')
        elif checknewer:#user online, wants to check for newer
            print('checking for newer....')
            _latest = self.urloflatestdump()
            if _latest.nonefound:
                #index unreachable, wiki not listed or dump in progress
                print('no finished dump found online...')
            elif int(_latest.rawdate) > int(extractrawdate(self.dumpfilenamepath)):
                print('downloading newer...')
                if not self._getandextractadump(_latest):
                    print('could not get the latest dump...')
            else:
                print('we have the latest...')
        else: #work on latest existing
            print('working with latest existing...')
        self.titlesfilename = titlesfilename(self.dumpfilenamepath)
        #returns False if there is no dump to work on
        self._ISOK = createnewertitlesfile(self.dumpfilenamepath)
        print('using...  %s' % self.dumpfilenamepath)

    def workingdump_filenames(self):
        '''Returns the dump's filename and its "titles" filename.'''
        return self.dumpfilenamepath, self.titlesfilename

    def urloflatestdump(self):
        '''Returns url, filename and string-of-date (if not in progress)

            Reads 'backup-index.html' and looks for the line that links to
            this wiki's dump directory ("<lang><kind>/<date>").
            Always returns a LatestDumpProps; its "nonefound" stays True
            if the index can not be read, the wiki is not listed or its
            dump is not marked as complete.
        '''
        _wikitofind = self.wikilang + self.wikikind
        _ldp = LatestDumpProps()
        try:
            _response = urllib2.urlopen(basebackupurl + 'backup-index.html')
            try:
                _page = _response.read()
            finally:
                _response.close()
        except IOError:
            return _ldp
        #match the whole directory name: a plain "elwiki in line" test
        #would also accept the lines of elwikibooks, elwikinews etc.
        _anchor = 'a href="' + _wikitofind + '/'
        for line in _page.split('\n'):
            if _anchor not in line:
                continue
            if '>Dump complete<' not in line:
                #listed but not finished
                return _ldp
            _rawdate = line.split(_anchor)[1].split('"')[0].strip('/')
            if not _rawdate.isdigit():
                #not a date, callers could not compare it
                continue
            _ldp.rawdate = _rawdate
            _ldp.zippedfilename = _wikitofind + '-' + _rawdate + kindofdump + '.bz2'
            _ldp.url = basebackupurl + _wikitofind + '/' + _rawdate + '/' + _ldp.zippedfilename
            _ldp.nonefound = False
            return _ldp
        #wiki not listed at all
        return _ldp

    def _getandextractadump(self, _ldp = None):
        '''Gets and extracts latest dump.

            _ldp -- optional LatestDumpProps already fetched by the caller
                    (saves reading the index a second time).

            Warning: Overwrites any existing files.
            Backup if you want them.
            Can be used to force getting latest dump.

            Returns True on success; only then "self.dumpfilenamepath" is
            changed to the new dump. Returns False otherwise.
            Raises OSError if bunzip2 can not be started.
        '''
        if _ldp is None:
            _ldp = self.urloflatestdump()
        if _ldp.nonefound:
            return False
        #create file names and path of bz2 and xml files
        self.zFilenamePath = self.dumppath + _ldp.zippedfilename
        _newdumppath = self.zFilenamePath[:-4] #without '.bz2'
        #remove old bz2
        if os.path.exists(self.zFilenamePath):
            try:
                os.remove(self.zFilenamePath)
            except OSError as _err:#oops...
                print('Exception:  %s' % str(_err))
                return False
        try:
            _response = urllib2.urlopen(_ldp.url)
            try:
                #copy in pieces: a dump may not fit in memory
                with open(self.zFilenamePath, 'wb') as zFile:
                    while True:
                        _chunk = _response.read(1024 * 1024)
                        if not _chunk:
                            break
                        zFile.write(_chunk)
            finally:
                _response.close()
        except IOError:
            return False
        print('Got bz2 file: %s' % self.zFilenamePath)
        #unzip bz2 dump using bunzip2: it writes the xml next to the bz2
        #(same name without '.bz2'), "-f" overwrites an existing xml
        #and the source (.bz2 file) is deleted
        if subprocess.call(['bunzip2', '-f', self.zFilenamePath]) != 0:
            return False
        print('Extracted: %s' % self.zFilenamePath)
        self.dumpfilenamepath = _newdumppath
        return True

    def _nsandtext(self, chunk):
        '''Returns (ns, text) of one page's raw xml, None if text is empty.

            Raises IndexError if the <ns> or <text xml:space="preserve">
            markers are missing from the chunk.
        '''
        if '<text xml:space="preserve" />' in chunk:
            return None
        textsplited = chunk.split('<text xml:space="preserve">')
        ns = textsplited[0].split('<ns>')[1].split('</ns>')[0]
        text = textsplited[1].split('</text>')[0]
        return ns, text

    def parse(self):
        '''Yields articles from self.dumpfilenamepath

            Every line of the "titles" file gives the byte offset where a
            page's <title> line starts; a page is what lies between two
            consecutive offsets (up to the end of file for the last one).
            Pages with empty text are not yielded.

            Does not replaces '>' etc. nor converts to utf-8
            since article may not be used
            (ex. if only ns '10' is needed).
        '''
        #"with" also closes the files if the generator is abandoned
        with open(self.dumpfilenamepath, 'r') as fxml:
            with open(self.titlesfilename, 'r') as ftitles:
                start = None #no title seen yet
                title = u''
                for titlesline in ftitles:
                    newstart = int(titlesline.split(':', 1)[0])
                    newtitle = titlesline.split('</title>')[0].split('<title>')[1]
                    if start is not None:
                        #previous page ends where this one starts
                        fxml.seek(start)
                        nsandtext = self._nsandtext(fxml.read(newstart - start))
                        if nsandtext is not None:
                            yield XmlEntry( ns = nsandtext[0],
                                    title = title,
                                    text = nsandtext[1],
                                    redirect = ''
                                    )
                    start = newstart
                    title = newtitle
                if start is not None:
                    #last page: runs up to the end of the dump
                    fxml.seek(start)
                    nsandtext = self._nsandtext(fxml.read())
                    if nsandtext is not None:
                        yield XmlEntry( ns = nsandtext[0],
                                title = title,
                                text = nsandtext[1],
                                redirect = ''
                                )

'''test'''
def main():
    '''Small test: looks for a dump and creates its "titles" file.

        As written it does not ask for a newer dump. To download a newer
        one (if any exists) pass True as fourth argument (checknewer)
        to WikiDump.
    '''
    #just check if a dump exists and create a "titles" file.
    dumpsdir = u'/media/FORMANY/wiki/dumps/'
    b = WikiDump(u'el', u'wikipedia', dumpsdir)
    
    
#run the test only when executed as a script (not when imported)
if __name__=="__main__":
    main()