User:Vanished user Xorisdtbdfgonugyfs/scripts/downloadns0
getallns0-2.py
#!/usr/bin/python3
# Created by Xoristzatziki for el.wiktionary.org
import os
import time
import gzip
import sqlite3
import urllib.request
from urllib.error import URLError, HTTPError
from bs4 import BeautifulSoup

version = '0.0.1'
justqueryingagentheaders = {'User-Agent': 'Xoristzatziki fishing only boat',
                            'version': version}
urlforactiveprojects = 'http://noc.wikimedia.org/conf/all.dblist'
#urlforclosedwiktionaries = 'http://el.wiktionary.org/w/Χρήστης:Xoristzatziki/closedwiktionarieslist'
urlofpagetitles = 'http://dumps.wikimedia.org/other/pagetitles/'
ourwiktionary = 'el'
# Page titles are not generated for closed wiktionaries,
# so a snapshot of them is read from a local directory.
# Assumption: titles never contain a tab character (\t).


def getalllemmasonallwiktionariesinns0(working_dir):
    '''
    Writes a tab-separated file with every title in our wiktionary,
    followed by the iso codes of the wiktionaries that have a page
    with the exact same title.
    '''
    wiktslist = getactivewiktionarieslist()
    closedwiktlist = getclosedwiktionarieslist(working_dir)
    # Build one INTEGER column per wiktionary language code.
    sql = 'CREATE TABLE iws (lemma text'
    for lang in wiktslist:
        sql = sql + ", '" + lang + "' INTEGER"
    sql = sql + ')'
    myconn = sqlite3.connect(':memory:')
    myconn.isolation_level = None
    myconn.row_factory = sqlite3.Row
    unreadwikts = []
    print('Got available wiktionaries list...')
    availabledates = getavailabledates()
    print('Got available dates...')
    lastdate = max(availabledates)
    print('max available:', lastdate)
    activetitles = {}  # or load them from a file extracted from an xml dump
    ok, tmplist = getlemmasforwiktionary(ourwiktionary, lastdate)
    if ok:
        myconn.execute(sql)
        myconn.executemany('insert into iws(lemma) values (?)',
                           [(x,) for x in tmplist])
        myconn.execute('CREATE INDEX "lemmandx" on iws (lemma ASC)')
        row = myconn.execute('SELECT COUNT(*) FROM iws').fetchone()
        print('rows', row[0])
    else:
        return ['ERROR']
    wiktslist.remove(ourwiktionary)
    wiktcounter = 1
    for wikt in wiktslist:
        if wikt in closedwiktlist:
            ok, tmplist = getclosedwiktionarylemmas(working_dir, wikt)
        else:
            ok, tmplist = getlemmasforwiktionary(wikt, lastdate)
        if ok:
            # Mark every lemma of this wiktionary that also exists in ours.
            myconn.executemany("update iws set '" + wikt + "' = 1 WHERE lemma = ?",
                               [(x,) for x in tmplist])
            wiktcounter += 1
        else:
            print(wikt + ' problem')
            unreadwikts.append(wikt)
        if wiktcounter > 3:  # stops after the first few wiktionaries (apparently a testing limit)
            break
    filename = os.path.join(working_dir, 'alllemmas.txt')
    print('Creating ' + filename + ' ...')
    with open(filename, 'wt') as f:
        for row in myconn.execute('select * from iws'):
            b = row['lemma'] + '\t'
            for lang in wiktslist:
                if row[lang] == 1:
                    b = b + lang + ' '
            f.write(b + '\n')
    #TODO: use unreadwikts in order to not touch iws that are included
    print('DONE')
    return 'DONE'


def getactivewiktionarieslist():
    print('getting available wiktionaries...')
    wiktslist = []
    req = urllib.request.Request(urlforactiveprojects,
                                 headers=justqueryingagentheaders)
    resp = urllib.request.urlopen(req)
    respData = resp.read()
    alldata = respData.decode('utf8')
    print('Got available wiktionaries page...')
    print('creating available wiktionaries list...')
    print(len(alldata.splitlines()))
    for line in alldata.splitlines():
        # Database names look like 'elwiktionary'; keep the language part.
        unsplitted = line.strip()
        splitted = unsplitted.split('wiktionary')
        if len(splitted) == 2:
            wiktslist.append(splitted[0])
    print(len(wiktslist))
    return wiktslist


def getclosedwiktionarieslist(working_dir):
    #TODO: Get list from an updated url
    print('getting closed wiktionaries...')
    wiktslist = []
    with open(os.path.join(working_dir, 'static', 'closedwiktionarieslist'),
              'rt') as f:
        for line in f.readlines():
            wiktslist.append(line.strip())
    return wiktslist


def getclosedwiktionarylemmas(working_dir, lang):
    try:
        with open(os.path.join(working_dir, 'static', lang), 'rt') as f:
            return True, [line.strip() for line in f.readlines()]
    except OSError:
        return False, 'Error reading local file'


def getavailabledates():
    print('getting available dates...')
    req = urllib.request.Request(urlofpagetitles,
                                 headers=justqueryingagentheaders)
    resp = urllib.request.urlopen(req)
    respData = resp.read()
    alldata = respData.decode('utf8')
    soup = BeautifulSoup(alldata, 'html.parser')
    print('Got available dates page...')
    dates = []
    print('finding available dates...')
    for link in soup.find_all('a'):
        # Dump directories are linked as 'YYYYMMDD/'.
        href = link.get('href')
        if href and href[:-1].isdigit():
            dates.append(href)
    return dates


def getlemmasforwiktionary(wiktlang, whichdate):
    #TODO: check whether the .gz is available and fall back to a previous date?
    time.sleep(1)
    url = (urlofpagetitles + whichdate + wiktlang + 'wiktionary-'
           + whichdate[:-1] + '-all-titles-in-ns-0.gz')
    print('requesting list of ' + url)
    try:
        #TODO: somehow add headers
        with urllib.request.urlopen(url) as response:
            with gzip.GzipFile(fileobj=response) as uncompressed:
                alltitlesinwikt = [line.decode('utf-8').strip()
                                   for line in uncompressed.readlines()]
    except HTTPError as e:
        print("The server couldn't fulfill the request.")
        print('Error code: ', e.code)
        return False, [e.code]
    except URLError as e:
        print('We failed to reach a server.')
        print('Reason: ', e.reason)
        return False, [e.reason]
    print('Got list from ' + url)
    return True, alltitlesinwikt


if __name__ == '__main__':
    # If run by itself, the file is created in the directory of the
    # script itself, not in the current working directory.
    realfile = os.path.realpath(__file__)
    realfile_dir = os.path.dirname(os.path.abspath(realfile))
    getalllemmasonallwiktionariesinns0(realfile_dir)
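
The script leaves a "#TODO: somehow add headers" inside getlemmasforwiktionary(). A minimal sketch of one way to close that TODO, assuming the dump server accepts the same headers used by the other requests: wrap the url in a urllib.request.Request before opening it. The name getlemmasforwiktionary_with_headers is hypothetical and not part of the script above.

def getlemmasforwiktionary_with_headers(wiktlang, whichdate):
    # Hypothetical variant; assumes the imports and module-level
    # constants (urlofpagetitles, justqueryingagentheaders) from
    # getallns0-2.py above.
    url = (urlofpagetitles + whichdate + wiktlang + 'wiktionary-'
           + whichdate[:-1] + '-all-titles-in-ns-0.gz')
    req = urllib.request.Request(url, headers=justqueryingagentheaders)
    try:
        with urllib.request.urlopen(req) as response:
            with gzip.GzipFile(fileobj=response) as uncompressed:
                return True, [line.decode('utf-8').strip()
                              for line in uncompressed.readlines()]
    except HTTPError as e:
        return False, [e.code]
    except URLError as e:
        return False, [e.reason]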
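
For downstream use, the alllemmas.txt written by the script (one lemma per line, a tab, then space-separated language codes) can be read back into a dictionary. A small sketch under that format; loadalllemmas is a hypothetical helper, not part of the original script.

def loadalllemmas(filename):
    # Hypothetical reader for the alllemmas.txt format written above.
    lemmas = {}
    with open(filename, 'rt') as f:
        for line in f:
            lemma, _, codes = line.rstrip('\n').partition('\t')
            lemmas[lemma] = codes.split()
    return lemmas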