#!/usr/bin/python
# -*- coding: utf-8 -*-
#Copyright Xoristzatziki of el.wiktionary.org
'''
Works for:
wikipedia
wikiversity
wikinews
wikisource
wikiquote
wikibooks
wiktionary
'''
#import time
import os, sys
import urllib2
#import pygtk
#import gtk
#import datetime
import subprocess
knownprojectdumpnames = ['wiki', 'wikiversity', 'wikinews', 'wikisource', 'wikiquote', 'wikibooks', 'wiktionary',]
basebackupurl = 'http://dumps.wikimedia.org/'
kindofdump = u'-pages-meta-current.xml'
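# Illustrative note (not part of the original logic): urloflatestdump() below combines
# these constants into a download URL of roughly this shape, where the language code,
# project and date are placeholders used only as an example:
#   http://dumps.wikimedia.org/<project-subdir>/<YYYYMMDD>/elwiktionary-<YYYYMMDD>-pages-meta-current.xml.bz2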
class LatestDumpProps():
    def __init__(self):
        self.url = u''
        self.zippedfilename = u''
        self.rawdate = u''
        self.nonefound = True
def urloflatestdump(wikilang, wikikind):
    '''Returns a LatestDumpProps with url, filename and string-of-date
    (left empty if the dump is still in progress or was not found).
    '''
    _wikitofind = wikilang + wikikind
    _ldp = LatestDumpProps()
    try:
        _response = urllib2.urlopen(basebackupurl + 'backup-index.html', None, 3)
        lines = _response.read()
        _response.close()
        for line in lines.splitlines():
            #print line
            if _wikitofind in line:
                #print line
                if '>Dump complete<' in line:
                    _backupsubdir = line.split('a href="')[1].split('">')[0]
                    _ldp.rawdate = _backupsubdir.split('/')[1]
                    #realdate = datetime.date(int(rawdate[:4]), int(rawdate[4:6]), int(rawdate[6:8]))
                    #print realdate
                    _ldp.zippedfilename = _wikitofind + '-' + _ldp.rawdate + kindofdump + '.bz2'
                    _ldp.url = basebackupurl + _backupsubdir + '/' + _ldp.zippedfilename
                    _ldp.nonefound = False
                    return _ldp
                else:
                    return _ldp
    #except IOError:
    except:
        return _ldp
    #wiki not found in the backup index at all
    return _ldp
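# A minimal usage sketch (assumes network access to dumps.wikimedia.org; 'el' and
# 'wiktionary' are example arguments, not defaults of the module):
#   props = urloflatestdump('el', 'wiktionary')
#   if not props.nonefound:
#       print props.url, props.zippedfilename, props.rawdate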
def getandextractadump(propsofdump, wheretosaveit):
    '''Gets and extracts the latest dump.
    Warning: overwrites any existing files.
    Back them up if you want to keep them.
    Can be used to force getting the latest dump.
    '''
    _ldp = propsofdump
    _saveitin = wheretosaveit
    #create file name and path of the bz2 file
    bzFilenamePath = os.path.join(_saveitin, _ldp.zippedfilename)
    #remove old bz2
    if os.path.exists(bzFilenamePath):
        try:
            os.remove(bzFilenamePath)
        except:#oops...
            print "Exception: ", str(sys.exc_info())
            return False, ''
    #else:
        #print 'File not found.'
    try:
        #prepare a new bz2 file
        with open(bzFilenamePath, 'w') as bzFile:
            #get the bz2 file from the internet
            bzFile.write(urllib2.urlopen(_ldp.url).read())
        print 'Got bz2 file:', bzFilenamePath
        #unzip the bz2 dump using bunzip2,
        #in the same place as the downloaded file,
        #with the same name (minus the .bz2 extension)
        xmlfilename = os.path.splitext(bzFilenamePath)[0]
        subprocess.call(['bunzip2', '-f', bzFilenamePath])
        #the source (.bz2 file) is deleted by bunzip2
        print 'Extracted:', xmlfilename
        return True, xmlfilename
    except IOError:
        return False, ''
    except:#oops...
        print "Exception: ", str(sys.exc_info())
        return False, ''
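# A minimal usage sketch (assumes 'props' came from urloflatestdump() above, that the
# 'bunzip2' binary is installed, and that '/tmp/dumps' is only an example directory):
#   ok, xmlpath = getandextractadump(props, '/tmp/dumps')
#   if ok:
#       print 'dump extracted to', xmlpath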
def titlesfilename(xmlfilename):
    return xmlfilename + '.titles'
def unwiki(whichtext):
    _text = whichtext.decode('utf-8')
    # unescape XML character entities
    _text = _text.replace('&gt;', '>')
    _text = _text.replace('&lt;', '<')
    _text = _text.replace('&quot;', '"')
    _text = _text.replace('&amp;', '&') #must be last
    return _text
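# Example of what unwiki() does (illustrative value only): the dump stores wikitext
# with XML entities escaped, so
#   unwiki('&lt;ref&gt;&quot;a &amp; b&quot;&lt;/ref&gt;')
# returns the unicode string u'<ref>"a & b"</ref>'.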
def get_site_from_dumpname(whichdumpname):
    #print 'inside'
    #print whichdumpname
    simpledumpname = os.path.basename(whichdumpname)#just in case
    #print simpledumpname
    simpledumpname = simpledumpname.split('-', 1)[0]
    #print simpledumpname
    for x in knownprojectdumpnames:
        #print x
        if simpledumpname.endswith(x):
            project = x
            if project == 'wiki':
                project = 'wikipedia'
            lang = simpledumpname[:-len(x)]
            #print project, lang
            return project, lang
    return '', ''
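# Example mapping (the file names are illustrative):
#   get_site_from_dumpname('elwiki-20230901-pages-meta-current.xml') -> ('wikipedia', 'el')
#   get_site_from_dumpname('elwiktionary-20230901-pages-meta-current.xml') -> ('wiktionary', 'el')
# An unrecognised name returns ('', '').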
def create_a_titles_file(xmlfilename, ftitles):
    try:
        with open(ftitles, 'w') as f:#using grep
            subprocess.call(['grep', '-b', '-E', '<title>.+</title>', xmlfilename], stdout=f)
        print '"titles" file created...', ftitles
        return True
    except:#oops...
        print "Exception: ", str(sys.exc_info())
        return False
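# The "titles" file produced above is the raw output of 'grep -b': one line per page,
# of the form '<byte-offset>:    <title>Page name</title>' (offset and title here are
# only illustrative). Those byte offsets into the xml dump are what get_using_titles()
# and WikiDump.parse() seek to.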
def create_newer_titles_file(xmlfilename, forcecreation = False):#TODO unused parameter
    '''Checks whether an up-to-date "titles" file exists.
    If it does not, a new one is created.
    If no source exists (i.e. no xml dump), returns False.
    '''
    if not os.path.exists(xmlfilename):
        #print 'no source file...'
        return False
    if xmlfilename.endswith('.titles'):
        #print 'source file is a titles file...'
        return False
    ftitles = titlesfilename(xmlfilename)
    #print ftitles
    if os.path.exists(ftitles):
        #print '"titles" file exists. Checking dates...'
        if os.path.getmtime(xmlfilename) > os.path.getmtime(ftitles):
            #print '"titles" file is old. Creating new...'
            return create_a_titles_file(xmlfilename, ftitles)
        else:
            #print '"titles" file is ok.'
            return True
    else:
        return create_a_titles_file(xmlfilename, ftitles)
    #print 'came here...'
    return False
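# Usage sketch ('dump.xml' is an example path to an extracted dump, not a real default):
#   if create_newer_titles_file('dump.xml'):
#       #the dump can now be searched through its 'dump.xml.titles' index
#       pass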
class GetWikiText:
    def __init__(self):
        #self.texttofind = whichtext
        #self.infile = infile
        #self.data = data
        #print self.infile
        pass
    def get_from_online(self):
        commandvars = ['curl', '--retry', '10', '-s', '-f']
        commandurl = self.site + 'w/api.php?format=xml&action=query&prop=revisions&titles='
        commandurl += self.texttofind.decode('utf-8') + '&rvprop=user|content'
        commandvars.append(commandurl)
        try:
            content = subprocess.check_output(commandvars)
            return True, unwiki(content)
        except subprocess.CalledProcessError as e:
            return False, 'Unknown error: ' + str(e.returncode) + ' in get_from_online()'
    def get_using_titles(self, titlesfile):
        lasttitle = False
        try:
            lines = subprocess.check_output(['grep', '-m', '1', '-A', '1', '>' + self.texttofind + '<', titlesfile])
            startlines = lines.splitlines()
            #print startlines
            start1 = long(startlines[0].split(':', 1)[0])
            if len(startlines) > 1:
                start2 = long(startlines[1].split(':', 1)[0])
            #print 'start'
            with open(self.infile, 'r') as f:
                f.seek(start1)
                #print f.tell()
                if len(startlines) > 1:
                    content = f.read(start2 - start1)
                else:
                    #print 'else'
                    content = f.read()
                return True, unwiki(content)
            #startline1 = startline.split('\n',1)
        except subprocess.CalledProcessError as e:
            errorcode = e.returncode
            if errorcode == 1:
                return False, 'The entry was not found.'
            else:
                return False, 'Unknown error: ' + str(e.returncode) + ' in get_using_titles()'
        except:
            return False, 'Unknown error in get_using_titles()'
        #return False,''# startline + '\n' + str(len(startline1))
    def get_text(self, data):
        #self.data = data
        self.texttofind = data.text
        self.infile = data.file
        self.fromonline = data.fromonline
        self.site = data.site
        if self.fromonline:
            return self.get_from_online()
        #print 'not online'
        #f = open(ftitles, 'w') #using grep
        #with open(ftitles, 'w')
        if not os.path.exists(self.infile):
            return False, 'file not found'
        if len(self.texttofind) < 1:
            return False, 'no text specified'
        b = data
        if b.wants_as_title:
            #print 'as title'
            titlesfile = titlesfilename(self.infile)
            if os.path.exists(titlesfile):
                return self.get_using_titles(titlesfile)
        #else do a normal search:
        #either infile is itself a titles file
        #or no titles file exists
        c = ['grep']
        if b.howmany > 0:
            c.append('-m')
            c.append(str(b.howmany))
        if b.after > 0:
            c.append('-A')
            c.append(str(b.after))
        elif b.before > 0:
            c.append('-B')
            c.append(str(b.before))
        elif b.inbetween > 0:
            c.append('-C')
            c.append(str(b.inbetween))
        #else:#force 250
            #c.append('-A')
            #c.append('250')
        c.append(self.texttofind)
        c.append(self.infile)
        try:
            print c
            contents = subprocess.check_output(c)
            return True, unwiki(contents)
        except subprocess.CalledProcessError as e:
            errorcode = e.returncode
            if errorcode == 1:
                return False, 'The search term was not found.'
            else:
                return False, 'Unknown error: ' + str(e.returncode) + ' in get_text()'
        except:
            return False, u'exception occurred'
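# Usage sketch for GetWikiText (the 'Query' container below is hypothetical; the real
# callers pass their own object exposing the same attribute names used in get_text()):
#   class Query: pass
#   q = Query()
#   q.text = 'water'; q.file = 'dump.xml'; q.fromonline = False
#   q.site = 'http://el.wiktionary.org/'; q.wants_as_title = True
#   q.howmany = 1; q.after = 0; q.before = 0; q.inbetween = 0
#   found, wikitext = GetWikiText().get_text(q)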
#generator=====================================================================
class XmlEntry:
    """
    Represents a reduced page.
    Redirects are not checked; the attribute exists for compatibility.
    """
    def __init__(self, title, ns, text, redirect):
        self.title = title
        self.ns = ns
        self.text = text
        self.isredirect = redirect
class WikiDump():
    def __init__(self, dumpfilenamepath):
        '''Constructor.
        First checks for existence of the dump file.
        Checks if the file has a titles file.
        Checks if the file is a titles file and has an xml file.
        self._ISOK holds "all OK" (the dump file existed).
        '''
        self._ISOK = False
        if not os.path.exists(dumpfilenamepath):
            return
        self.dumpfilenamepath = dumpfilenamepath
        if not dumpfilenamepath.endswith('.titles'):#it is not a titles file
            if not os.path.exists(dumpfilenamepath + '.titles'):#no titles file exists
                return
            else:#a titles file exists
                self.titlesfilename = dumpfilenamepath + '.titles'
                self._ISOK = True
                return
        else:#it is a titles file
            if not os.path.exists(dumpfilenamepath[:-len('.titles')]):#the xml file does not exist
                return
            else:#the xml file exists
                self.dumpfilenamepath = dumpfilenamepath[:-len('.titles')]
                self.titlesfilename = dumpfilenamepath
                self._ISOK = True
                return
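    # Note: WikiDump can be constructed either from the xml dump path or from the
    # '.titles' path; in both cases dumpfilenamepath/titlesfilename end up pointing at
    # the pair of files, and _ISOK is True only when the needed counterpart exists.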
    def parse(self):
        '''Yields articles from a dump (xml file).
        Uses a titles file.
        Does not replace '&gt;' etc. nor convert to utf-8,
        since the article may not be used
        (e.g. if only ns '10' is needed).
        '''
        fxml = open(self.dumpfilenamepath, 'r')
        ftitles = open(self.titlesfilename, 'r')
        fxmllength = os.stat(self.dumpfilenamepath).st_size
        start = long(u'0')
        title = u''
        textinpage = u''
        entryns = u''
        entrytitle = u''
        entrytext = u''
        for titlesline in ftitles:
            if start > 0:
                fxml.seek(start)
                nextstart = long(titlesline[:titlesline.find(':')])
                nexttitle = titlesline.split('</title>')[0].split('<title>')[1]
                textinpage = fxml.read(nextstart - start)
                #print title
                if '<text xml:space="preserve" />' in textinpage:
                    entryns = textinpage.split('<ns>')[1].split('</ns>')[0]
                    entrytitle = title
                    #can yield the entry here if needed...
                    #if entryns == '10':
                        #print u'The template: ', entrytitle, ' is empty...........'
                        #print ':', textsplited[0], ':'
                    title = nexttitle
                    start = nextstart
                    #do not yield it
                else:
                    textsplited = textinpage.split('<text xml:space="preserve">')
                    entryns = textsplited[0].split('<ns>')[1].split('</ns>')[0]
                    entrytitle = title
                    entrytext = textsplited[1].split('</text>')[0]
                    title = nexttitle
                    start = nextstart
                    yield XmlEntry(ns = entryns,
                                   title = entrytitle,
                                   text = entrytext,
                                   redirect = ''
                                   )
            else:
                #first line of the titles file
                start = long(titlesline[:titlesline.find(':')])
                title = titlesline.split('</title>')[0].split('<title>')[1]
                #print 'first title: ', title, '--------------'
        #last page: read from the last offset to the end of the xml file
        fxml.seek(start)
        try:
            textinpage = fxml.read(fxmllength - start)
        except OverflowError:#just in case
            print 'OverflowError... ', title, u'#', start, u'#'
            exit()
        fxml.close()
        ftitles.close()
        #yield the rest as is. TODO crop text
        if '<text xml:space="preserve" />' in textinpage:
            entryns = textinpage.split('<ns>')[1].split('</ns>')[0]
            entrytitle = title
            #do not yield it
        else:
            textsplited = textinpage.split('<text xml:space="preserve">')
            entryns = textsplited[0].split('<ns>')[1].split('</ns>')[0]
            entrytitle = title
            entrytext = textsplited[1].split('</text>')[0]
            yield XmlEntry(ns = entryns,
                           title = entrytitle,
                           text = entrytext,
                           redirect = ''
                           )
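# A minimal end-to-end sketch, kept behind the __main__ guard (the language/project
# arguments and the download directory '.' are examples only, not defaults of the
# module): fetch the latest dump, build the titles index, and list the titles of
# pages in the template namespace (ns 10).
if __name__ == '__main__':
    props = urloflatestdump('el', 'wiktionary')
    if not props.nonefound:
        ok, xmlpath = getandextractadump(props, '.')
        if ok and create_newer_titles_file(xmlpath):
            for entry in WikiDump(xmlpath).parse():
                if entry.ns == '10':
                    print entry.title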