#!/usr/bin/python3
#new approach
#Get latest pages and update local xml dump
#v0.0.4
#Step one:
#Just get changed lemmas and save the dictionary with json.
#TODO:
#1. Update timestamp file after read.
#2. Read new revisions of articles.
#3. Merge them with the old dump.
import codecs
import datetime
import glob
import json
import os
import re
import time
import urllib.parse
import urllib.request
import xml.etree.cElementTree as XMLPARSER
#mywiktionarysite = 'http://el.wiktionary.org/'
#myxmlfile = 'last_full.xml'
#mytmpdir = ''#in case someone wants a special tmp path
#lasttimestampfile = '20140329000001'
def getnonZtime(whichZtime):
    """Strip the separators from an ISO-8601 'Z' timestamp.

    Turns e.g. '2014-03-29T00:00:01Z' into '20140329000001' by
    concatenating the fixed-position digit groups.
    """
    pieces = (
        whichZtime[0:4],    # year
        whichZtime[5:7],    # month
        whichZtime[8:10],   # day
        whichZtime[11:13],  # hour
        whichZtime[14:16],  # minute
        whichZtime[17:19],  # second
    )
    return ''.join(pieces)
def getallchanged(siteurl, Ztimestampofcurrentdump):
    """Collect all pages changed since the dump timestamp and save them as JSON.

    Repeatedly queries the wiki's recentchanges API, walking backwards from
    'now' until the dump's timestamp is reached, then writes the accumulated
    {title: {'timestamp': ...}} mapping to a file named 'latestchanges-<now>'
    (the filename gets a '-stopedbyerrors' suffix if fetching failed).

    siteurl -- base URL of the wiki, e.g. 'http://el.wiktionary.org/'
    Ztimestampofcurrentdump -- ISO 'Z' timestamp of the local dump
    """
    allmyrecentchanges = {}
    timenow = datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S')
    timefromage = getnonZtime(Ztimestampofcurrentdump)
    rcstart = timenow
    continuevalue = ''
    stopedbyerrors = False
    # Timestamps are zero-padded YYYYMMDDHHMMSS strings, so plain string
    # comparison orders them chronologically.
    while rcstart > timefromage:
        time.sleep(1)  # be polite to the API between batches
        # TODO check whether usage of continuevalue changed
        dictwithchanged, timetocontiuefrom = getrecentchanges(
            siteurl, rcstart, timefromage, continuevalue, allmyrecentchanges)
        print("returned...")
        # On error getrecentchanges returns the string 'ERROR', not a dict;
        # isinstance is the idiomatic check (was: type(x) == type(y)).
        if isinstance(dictwithchanged, dict):
            allmyrecentchanges = dictwithchanged
            # rccontinue tokens look like '<timestamp>|<id>'; the first
            # field is the timestamp to continue the walk from.
            rcstart = timetocontiuefrom.split("|")[0]
        else:
            print("ERROR")
            stopedbyerrors = True
            break
    if stopedbyerrors:
        timenow = timenow + '-stopedbyerrors'
    # Explicit encoding; json.dump escapes non-ASCII by default anyway.
    with open('latestchanges-' + timenow, 'w', encoding='utf-8') as f:
        json.dump(allmyrecentchanges, f)
    print("SAVED")
def getrecentchanges(siteurl, rcstart, rcend, rccontinue, allmyrecentchanges, rclimit=500):
    """Fetch one batch of recent changes from the MediaWiki API.

    Queries list=recentchanges between rcstart and rcend and records each
    title into allmyrecentchanges (mutated in place), keeping only the
    first timestamp seen per title.

    Returns (allmyrecentchanges, rccontinue) on success, where rccontinue
    is the API's continuation token (unchanged from the argument when the
    response carries none). Returns ('ERROR', '') on any failure so the
    caller can detect the problem and stop.
    """
    try:
        urldata = {
            'action': 'query',
            'list': 'recentchanges',
            'format': 'xml',
            'rcstart': rcstart,
            'rcend': rcend,
            'uselang': 'el',
            'continue': rccontinue,
            'rclimit': rclimit,
            'maxlag': 1,  # ask the server to refuse when replication lag is high
            'rcprop': 'timestamp|title',
        }
        params = urllib.parse.urlencode(urldata)
        url = siteurl + 'w/api.php?%s' % params
        headers = {'User-Agent': "Bot For recentchanges"}
        req = urllib.request.Request(url, headers=headers)
        # Context manager closes the HTTP response even if decoding fails
        # (the original leaked the connection).
        with urllib.request.urlopen(req) as resp:
            alldata = resp.read().decode('utf8')
        root = XMLPARSER.fromstring(alldata)
        # TODO Constantly check for changes in wiki's xml creator
        for b in root.iter('continue'):
            rccontinue = b.attrib['rccontinue']
        for onerev in root.iter('rc'):
            # No need to re-add pages whose latest change we already hold;
            # 'title not in dict' alone covers the empty-dict case too.
            if onerev.attrib['title'] not in allmyrecentchanges:
                allmyrecentchanges[onerev.attrib['title']] = {'timestamp': onerev.attrib['timestamp']}
        return allmyrecentchanges, rccontinue
    except Exception as e:
        # Best-effort protocol shared with getallchanged: report and signal.
        print(str(e))
        return ('ERROR', '')