#new approach
#Get latest pages and update local xml dump
#Step one:
#Just get changed lemmas and save the dictionary with json.
#1. Update timestamp file after read.
#2. Read new revisions of articles.
#3. Merge them with the old dump.
import re
import urllib.request
import os, glob
import codecs
import time
import datetime
import json
import xml.etree.cElementTree as XMLPARSER
#mywiktionarysite = 'http://el.wiktionary.org/'
#myxmlfile = 'last_full.xml'
#mytmpdir = ''#in case someone wants a special tmp path
#lasttimestampfile = '20140329000001'
def getnonZtime(whichZtime):
    """Return a timestamp as a compact ``YYYYMMDDHHMMSS`` digit string.

    Accepts an ISO-style timestamp such as ``'2014-03-29T00:00:01Z'`` and
    drops every separator. A timestamp that is already compact
    (``'20140329000001'``) is returned unchanged instead of being cut up by
    fixed character offsets.

    Args:
        whichZtime: the timestamp string to convert.

    Returns:
        The first 14 digits found in ``whichZtime`` (fewer when the input
        holds fewer digits, e.g. a date without a time part).
    """
    # Keep only ASCII digits; the 14-digit cap discards anything after the
    # seconds field, such as a trailing UTC offset.
    return re.sub(r'[^0-9]', '', whichZtime)[:14]
def getallchanged(siteurl,Ztimestampofcurrentdump):
    """Collect the pages changed since the current dump and save them as JSON.

    Calls getrecentchanges() repeatedly, starting at the current UTC time and
    continuing for as long as the next start timestamp is still newer than
    the dump's timestamp. The accumulated dictionary is then written with
    json.dump() to a file named ``latestchanges-<UTC time>`` in the current
    working directory. If a request fails, the loop stops early, the partial
    result is still written, and ``-stopedbyerrors`` is appended to the file
    name.

    Args:
        siteurl: base URL of the wiki; passed through to getrecentchanges().
        Ztimestampofcurrentdump: timestamp of the current dump, in a form
            that getnonZtime() can convert.

    Returns:
        None. The result is only written to disk.
    """
    allmyrecentchanges = {}
    # Timezone-aware replacement for the deprecated datetime.utcnow();
    # the formatted string is the same.
    timenow = datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%d%H%M%S')
    timefromage = getnonZtime(Ztimestampofcurrentdump)
    rcstart = timenow
    continuevalue = ''
    stopedbyerrors = False
    # NOTE(review): the comparison below is a plain string comparison; it
    # assumes both sides are same-width digit timestamps -- confirm.
    while rcstart > timefromage:
        #TODO check whether usage of continuevalue changed
        dictwithchanged, timetocontiuefrom = getrecentchanges(
            siteurl, rcstart, timefromage, continuevalue, allmyrecentchanges)
        # On error we will not get a dictionary back: flag the run and stop,
        # otherwise the same failing request would be retried forever.
        if not isinstance(dictwithchanged, dict):
            stopedbyerrors = True
            break
        allmyrecentchanges = dictwithchanged
        # The continue value is split on "|" and its first field becomes the
        # next start timestamp (presumably "<timestamp>|<id>" -- confirm).
        rcstart = timetocontiuefrom.split("|")[0]
    if stopedbyerrors:
        timenow = timenow + '-stopedbyerrors'
    with open('latestchanges-' + timenow, 'w') as f:
        json.dump(allmyrecentchanges, f)
def getrecentchanges(siteurl,rcstart,rcend,rccontinue,allmyrecentchanges,rclimit=500):
urldata = { 'action':'query',
params = urllib.parse.urlencode(urldata)
url = siteurl + 'w/api.php?%s' % params
headers = {}
headers['User-Agent'] = "Bot For recentchanges" #"Mozilla/5.0 (X11; U; Linux ia64) Gecko/20071127 Firefox/"
req = urllib.request.Request(url, headers = headers)
resp = urllib.request.urlopen(req)
respData = resp.read()
alldata = respData.decode('utf8')
#print("len of alldata...", len(alldata) )
root = XMLPARSER.fromstring(alldata)
rccontinuetime = ''
for b in root.iter('continue'):#TODO Constantly check for changes in wiki's xml creator
rccontinue = b.attrib['rccontinue']
for onerev in root.iter('rc'):
#print("found rc...")
#Δεν χρειάζεται να προσθέσω τα λήμματα πού ήδη έχω την τελευταία αλλαγή τους
if (not len(allmyrecentchanges)) or (onerev.attrib['title'] not in allmyrecentchanges):
allmyrecentchanges[onerev.attrib['title']] = {'timestamp':onerev.attrib['timestamp']}
return allmyrecentchanges,rccontinue
except Exception as e: