#!/usr/bin/python
# -*- coding: utf-8 -*-
#Copyright Xoristzatziki of el.wiktionary.org
'''
Works for:
wikipedia
wikiversity
wikinews
wikisource
wikiquote
wikibooks
wiktionary
'''
#import time
import os, sys
import urllib2
#import pygtk
#import gtk
#import datetime
import subprocess
knownprojectdumpnames = ['wiki', 'wikiversity', 'wikinews', 'wikisource', 'wikiquote', 'wikibooks', 'wiktionary',]
basebackupurl = 'http://dumps.wikimedia.org/'
kindofdump = u'-pages-meta-current.xml'
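# Illustrative note (not part of the original logic): urloflatestdump() below combines
# these constants into a download URL of roughly this shape, where the language code,
# project and date are placeholders used only as an example:
#   http://dumps.wikimedia.org/<project-subdir>/<YYYYMMDD>/elwiktionary-<YYYYMMDD>-pages-meta-current.xml.bz2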
class LatestDumpProps():
    def __init__(self):
        self.url = u''
        self.zippedfilename = u''
        self.rawdate = u''
        self.nonefound = True
def urloflatestdump(wikilang, wikikind):
    '''Returns a LatestDumpProps with url, filename and string-of-date
    (left empty if the dump is still in progress or was not found).
    '''
    _wikitofind = wikilang + wikikind
    _ldp = LatestDumpProps()
    try:
        _response = urllib2.urlopen(basebackupurl + 'backup-index.html', None, 3)
        lines = _response.read()
        _response.close()
        for line in lines.splitlines():
            #print line
            if _wikitofind in line:
                #print line
                if '>Dump complete<' in line:
                    _backupsubdir = line.split('a href="')[1].split('">')[0]
                    _ldp.rawdate = _backupsubdir.split('/')[1]
                    #realdate = datetime.date(int(rawdate[:4]), int(rawdate[4:6]), int(rawdate[6:8]))
                    #print realdate
                    _ldp.zippedfilename = _wikitofind + '-' + _ldp.rawdate + kindofdump + '.bz2'
                    _ldp.url = basebackupurl + _backupsubdir + '/' + _ldp.zippedfilename
                    _ldp.nonefound = False
                    return _ldp
                else:
                    return _ldp
    #except IOError:
    except:
        return _ldp
    #wiki not found in the backup index at all
    return _ldp
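# A minimal usage sketch (assumes network access to dumps.wikimedia.org; 'el' and
# 'wiktionary' are example arguments, not defaults of the module):
#   props = urloflatestdump('el', 'wiktionary')
#   if not props.nonefound:
#       print props.url, props.zippedfilename, props.rawdate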
def getandextractadump(propsofdump, wheretosaveit):
    '''Gets and extracts the latest dump.
    Warning: overwrites any existing files.
    Back them up if you want to keep them.
    Can be used to force getting the latest dump.
    '''
    _ldp = propsofdump
    _saveitin = wheretosaveit
    #create file name and path of the bz2 file
    bzFilenamePath = os.path.join(_saveitin, _ldp.zippedfilename)
    #remove old bz2
    if os.path.exists(bzFilenamePath):
        try:
            os.remove(bzFilenamePath)
        except:#oops...
            print "Exception: ", str(sys.exc_info())
            return False, ''
    #else:
        #print 'File not found.'
    try:
        #prepare a new bz2 file
        with open(bzFilenamePath, 'w') as bzFile:
            #get the bz2 file from the internet
            bzFile.write(urllib2.urlopen(_ldp.url).read())
        print 'Got bz2 file:', bzFilenamePath
        #unzip the bz2 dump using bunzip2,
        #in the same place as the downloaded file,
        #with the same name (minus the .bz2 extension)
        xmlfilename = os.path.splitext(bzFilenamePath)[0]
        subprocess.call(['bunzip2', '-f', bzFilenamePath])
        #the source (.bz2 file) is deleted by bunzip2
        print 'Extracted:', xmlfilename
        return True, xmlfilename
    except IOError:
        return False, ''
    except:#oops...
        print "Exception: ", str(sys.exc_info())
        return False, ''
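# A minimal usage sketch (assumes 'props' came from urloflatestdump() above, that the
# 'bunzip2' binary is installed, and that '/tmp/dumps' is only an example directory):
#   ok, xmlpath = getandextractadump(props, '/tmp/dumps')
#   if ok:
#       print 'dump extracted to', xmlpath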
def titlesfilename(xmlfilename):
    return xmlfilename + '.titles'
def unwiki(whichtext):
    _text = whichtext.decode('utf-8')
    # unescape XML character entities
    _text = _text.replace('&gt;', '>')
    _text = _text.replace('&lt;', '<')
    _text = _text.replace('&quot;', '"')
    _text = _text.replace('&amp;', '&') #must be last
    return _text
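# Example of what unwiki() does (illustrative value only): the dump stores wikitext
# with XML entities escaped, so
#   unwiki('&lt;ref&gt;&quot;a &amp; b&quot;&lt;/ref&gt;')
# returns the unicode string u'<ref>"a & b"</ref>'.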
def get_site_from_dumpname(whichdumpname):
    #print 'inside'
    #print whichdumpname
    simpledumpname = os.path.basename(whichdumpname)#just in case
    #print simpledumpname
    simpledumpname = simpledumpname.split('-', 1)[0]
    #print simpledumpname
    for x in knownprojectdumpnames:
        #print x
        if simpledumpname.endswith(x):
            project = x
            if project == 'wiki':
                project = 'wikipedia'
            lang = simpledumpname[:-len(x)]
            #print project, lang
            return project, lang
    return '', ''
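# Example mapping (the file names are illustrative):
#   get_site_from_dumpname('elwiki-20230901-pages-meta-current.xml') -> ('wikipedia', 'el')
#   get_site_from_dumpname('elwiktionary-20230901-pages-meta-current.xml') -> ('wiktionary', 'el')
# An unrecognised name returns ('', '').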
def create_a_titles_file(xmlfilename, ftitles):
    try:
        with open(ftitles, 'w') as f:#using grep
            subprocess.call(['grep', '-b', '-E', '<title>.+</title>', xmlfilename], stdout=f)
        print '"titles" file created...', ftitles
        return True
    except:#oops...
        print "Exception: ", str(sys.exc_info())
        return False
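# The "titles" file produced above is the raw output of 'grep -b': one line per page,
# of the form '<byte-offset>:    <title>Page name</title>' (offset and title here are
# only illustrative). Those byte offsets into the xml dump are what get_using_titles()
# and WikiDump.parse() seek to.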
def create_newer_titles_file(xmlfilename, forcecreation = False):#TODO unused parameter
    '''Checks whether an up-to-date "titles" file exists.
    If it does not, a new one is created.
    If no source exists (i.e. no xml dump), returns False.
    '''
    if not os.path.exists(xmlfilename):
        #print 'no source file...'
        return False
    if xmlfilename.endswith('.titles'):
        #print 'source file is a titles file...'
        return False
    ftitles = titlesfilename(xmlfilename)
    #print ftitles
    if os.path.exists(ftitles):
        #print '"titles" file exists. Checking dates...'
        if os.path.getmtime(xmlfilename) > os.path.getmtime(ftitles):
            #print '"titles" file is old. Creating new...'
            return create_a_titles_file(xmlfilename, ftitles)
        else:
            #print '"titles" file is ok.'
            return True
    else:
        return create_a_titles_file(xmlfilename, ftitles)
    #print 'came here...'
    return False
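# Usage sketch ('dump.xml' is an example path to an extracted dump, not a real default):
#   if create_newer_titles_file('dump.xml'):
#       #the dump can now be searched through its 'dump.xml.titles' index
#       pass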
class GetWikiText:
    def __init__(self):
        #self.texttofind = whichtext
        #self.infile = infile
        #self.data = data
        #print self.infile
        pass
    def get_from_online(self):
        commandvars = ['curl', '--retry', '10', '-s', '-f']
        commandurl = self.site + 'w/api.php?format=xml&action=query&prop=revisions&titles='
        commandurl += self.texttofind.decode('utf-8') + '&rvprop=user|content'
        commandvars.append(commandurl)
        try:
            content = subprocess.check_output(commandvars)
            return True, unwiki(content)
        except subprocess.CalledProcessError as e:
            return False, 'Unknown error: ' + str(e.returncode) + ' in get_from_online()'
    def get_using_titles(self, titlesfile):
        lasttitle = False
        try:
            lines = subprocess.check_output(['grep', '-m', '1', '-A', '1', '>' + self.texttofind + '<', titlesfile])
            startlines = lines.splitlines()
            #print startlines
            start1 = long(startlines[0].split(':', 1)[0])
            if len(startlines) > 1:
                start2 = long(startlines[1].split(':', 1)[0])
            #print 'start'
            with open(self.infile, 'r') as f:
                f.seek(start1)
                #print f.tell()
                if len(startlines) > 1:
                    content = f.read(start2 - start1)
                else:
                    #print 'else'
                    content = f.read()
                return True, unwiki(content)
            #startline1 = startline.split('\n',1)
        except subprocess.CalledProcessError as e:
            errorcode = e.returncode
            if errorcode == 1:
                return False, 'The entry was not found.'
            else:
                return False, 'Unknown error: ' + str(e.returncode) + ' in get_using_titles()'
        except:
            return False, 'Unknown error in get_using_titles()'
        #return False,''# startline + '\n' + str(len(startline1))
    def get_text(self, data):
        #self.data = data
        self.texttofind = data.text
        self.infile = data.file
        self.fromonline = data.fromonline
        self.site = data.site
        if self.fromonline:
            return self.get_from_online()
        #print 'not online'
        #f = open(ftitles, 'w') #using grep
        #with open(ftitles, 'w')
        if not os.path.exists(self.infile):
            return False, 'file not found'
        if len(self.texttofind) < 1:
            return False, 'no text specified'
        b = data
        if b.wants_as_title:
            #print 'as title'
            titlesfile = titlesfilename(self.infile)
            if os.path.exists(titlesfile):
                return self.get_using_titles(titlesfile)
        #else do a normal search:
        #either infile is itself a titles file
        #or no titles file exists
        c = ['grep']
        if b.howmany > 0:
            c.append('-m')
            c.append(str(b.howmany))
        if b.after > 0:
            c.append('-A')
            c.append(str(b.after))
        elif b.before > 0:
            c.append('-B')
            c.append(str(b.before))
        elif b.inbetween > 0:
            c.append('-C')
            c.append(str(b.inbetween))
        #else:#force 250
            #c.append('-A')
            #c.append('250')
        c.append(self.texttofind)
        c.append(self.infile)
        try:
            print c
            contents = subprocess.check_output(c)
            return True, unwiki(contents)
        except subprocess.CalledProcessError as e:
            errorcode = e.returncode
            if errorcode == 1:
                return False, 'The search term was not found.'
            else:
                return False, 'Unknown error: ' + str(e.returncode) + ' in get_text()'
        except:
            return False, u'exception occurred'
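# Usage sketch for GetWikiText (the 'Query' container below is hypothetical; the real
# callers pass their own object exposing the same attribute names used in get_text()):
#   class Query: pass
#   q = Query()
#   q.text = 'water'; q.file = 'dump.xml'; q.fromonline = False
#   q.site = 'http://el.wiktionary.org/'; q.wants_as_title = True
#   q.howmany = 1; q.after = 0; q.before = 0; q.inbetween = 0
#   found, wikitext = GetWikiText().get_text(q)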
#generator=====================================================================
class XmlEntry:
    """
    Represents a reduced page.
    Redirects are not checked; the attribute exists for compatibility.
    """
    def __init__(self, title, ns, text, redirect):
        self.title = title
        self.ns = ns
        self.text = text
        self.isredirect = redirect
class WikiDump():
    def __init__(self, dumpfilenamepath):
        '''Constructor.
        First checks for existence of the dump file.
        Checks if the file has a titles file.
        Checks if the file is a titles file and has an xml file.
        self._ISOK holds "all OK" (the dump file existed).
        '''
        self._ISOK = False
        if not os.path.exists(dumpfilenamepath):
            return
        self.dumpfilenamepath = dumpfilenamepath
        if not dumpfilenamepath.endswith('.titles'):#it is not a titles file
            if not os.path.exists(dumpfilenamepath + '.titles'):#no titles file exists
                return
            else:#a titles file exists
                self.titlesfilename = dumpfilenamepath + '.titles'
                self._ISOK = True
                return
        else:#it is a titles file
            if not os.path.exists(dumpfilenamepath[:-len('.titles')]):#the xml file does not exist
                return
            else:#the xml file exists
                self.dumpfilenamepath = dumpfilenamepath[:-len('.titles')]
                self.titlesfilename = dumpfilenamepath
                self._ISOK = True
                return
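    # Note: WikiDump can be constructed either from the xml dump path or from the
    # '.titles' path; in both cases dumpfilenamepath/titlesfilename end up pointing at
    # the pair of files, and _ISOK is True only when the needed counterpart exists.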
    def parse(self):
        '''Yields articles from a dump (xml file).
        Uses a titles file.
        Does not replace '&gt;' etc. nor convert to utf-8,
        since the article may not be used
        (e.g. if only ns '10' is needed).
        '''
        fxml = open(self.dumpfilenamepath, 'r')
        ftitles = open(self.titlesfilename, 'r')
        fxmllength = os.stat(self.dumpfilenamepath).st_size
        start = long(u'0')
        title = u''
        textinpage = u''
        entryns = u''
        entrytitle = u''
        entrytext = u''
        for titlesline in ftitles:
            if start > 0:
                fxml.seek(start)
                nextstart = long(titlesline[:titlesline.find(':')])
                nexttitle = titlesline.split('</title>')[0].split('<title>')[1]
                textinpage = fxml.read(nextstart - start)
                #print title
                if '<text xml:space="preserve" />' in textinpage:
                    entryns = textinpage.split('<ns>')[1].split('</ns>')[0]
                    entrytitle = title
                    #can yield the entry here if needed...
                    #if entryns == '10':
                        #print u'The template: ', entrytitle, ' is empty...........'
                        #print ':', textsplited[0], ':'
                    title = nexttitle
                    start = nextstart
                    #do not yield it
                else:
                    textsplited = textinpage.split('<text xml:space="preserve">')
                    entryns = textsplited[0].split('<ns>')[1].split('</ns>')[0]
                    entrytitle = title
                    entrytext = textsplited[1].split('</text>')[0]
                    title = nexttitle
                    start = nextstart
                    yield XmlEntry(ns = entryns,
                                   title = entrytitle,
                                   text = entrytext,
                                   redirect = ''
                                   )
            else:
                #first line of the titles file
                start = long(titlesline[:titlesline.find(':')])
                title = titlesline.split('</title>')[0].split('<title>')[1]
                #print 'first title: ', title, '--------------'
        #last page: read from the last offset to the end of the xml file
        fxml.seek(start)
        try:
            textinpage = fxml.read(fxmllength - start)
        except OverflowError:#just in case
            print 'OverflowError... ', title, u'#', start, u'#'
            exit()
        fxml.close()
        ftitles.close()
        #yield the rest as is. TODO crop text
        if '<text xml:space="preserve" />' in textinpage:
            entryns = textinpage.split('<ns>')[1].split('</ns>')[0]
            entrytitle = title
            #do not yield it
        else:
            textsplited = textinpage.split('<text xml:space="preserve">')
            entryns = textsplited[0].split('<ns>')[1].split('</ns>')[0]
            entrytitle = title
            entrytext = textsplited[1].split('</text>')[0]
            yield XmlEntry(ns = entryns,
                           title = entrytitle,
                           text = entrytext,
                           redirect = ''
                           )
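# A minimal end-to-end sketch, kept behind the __main__ guard (the language/project
# arguments and the download directory '.' are examples only, not defaults of the
# module): fetch the latest dump, build the titles index, and list the titles of
# pages in the template namespace (ns 10).
if __name__ == '__main__':
    props = urloflatestdump('el', 'wiktionary')
    if not props.nonefound:
        ok, xmlpath = getandextractadump(props, '.')
        if ok and create_newer_titles_file(xmlpath):
            for entry in WikiDump(xmlpath).parse():
                if entry.ns == '10':
                    print entry.title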