# getallns0-2.py

#!/usr/bin/python3
#Created by Xoristzatziki for el.wiktionary.org

import os
import datetime
import time

import gzip
import sqlite3

import urllib.request
from urllib.error import URLError, HTTPError

import xml.etree.cElementTree as XMLPARSER
from bs4 import BeautifulSoup #as BeautifulSoup

# Script version, also advertised in the request headers below.
version = '0.0.1'
# Identifying headers sent with every HTTP request (read-only "fishing" bot).
justqueryingagentheaders = {'User-Agent': 'Xoristzatziki fishing only boat', 'version': version}
# Wikimedia list of all active project database names (one per line).
urlforactiveprojects = 'http://noc.wikimedia.org/conf/all.dblist'
#urlforclosedwiktionaries = 'http://el.wiktionary.org/w/Χρήστης:Xoristzatziki/closedwiktionarieslist'
# Index page of the per-date page-title dump directories.
urlofpagetitles = 'http://dumps.wikimedia.org/other/pagetitles/'
# Our home wiktionary language code (the redundant str() wrapper was dropped).
ourwiktionary = 'el'
#pagetitles are not generated for closed wiktionaries
#get a snapshot of them from a local directory

#assumption: the tab character (\t) never occurs in page titles
def getalllemmasonallwiktionariesinns0(working_dir):
    '''
    Collect all ns-0 titles of our wiktionary and mark, for every other
    wiktionary, which of those exact titles it also has.

    Writes <working_dir>/alllemmas.txt with one line per lemma:
    the lemma, a tab, then a space-separated list of language codes
    of the wiktionaries that share the title.

    Returns "DONE" on success, or ['ERROR'] if the title list of our
    own wiktionary could not be downloaded.
    '''
    wiktslist = getactivewiktionarieslist()
    closedwiktlist = getclosedwiktionarieslist(working_dir)
    # One INTEGER flag column per wiktionary.  Identifiers are double-quoted
    # (SQL-standard quoting; the single quotes used before only worked thanks
    # to SQLite's leniency).  The codes come from Wikimedia's own dblist,
    # not from untrusted input.
    sql = 'CREATE TABLE iws (lemma text'
    for lang in wiktslist:
        sql = sql + ', "' + lang + '" INTEGER'
    sql = sql + ')'
    myconn = sqlite3.connect(":memory:")
    myconn.isolation_level = None  # autocommit mode
    myconn.row_factory = sqlite3.Row  # allows row['lemma'] / row[lang] access

    unreadwikts = []
    print('Got available wiktionaries list...')
    availabledates = getavailabledates()
    print('Got available dates...')
    lastdate = max(availabledates)
    print('max available')
    print(lastdate)
    ok, tmplist = getlemmasforwiktionary(ourwiktionary, lastdate)
    if not ok:
        myconn.close()
        return ['ERROR']
    myconn.execute(sql)
    myconn.executemany("insert into iws(lemma) values (?)",
                       [(x,) for x in tmplist])
    myconn.execute('CREATE INDEX "lemmandx" on iws (lemma ASC)')
    # BUG FIX: previously the Cursor object itself was printed; fetch the
    # actual COUNT(*) value instead.
    rowcount = myconn.execute('SELECT COUNT(*) FROM iws').fetchone()[0]
    print('rows', rowcount)
    wiktslist.remove(ourwiktionary)
    wiktcounter = 1
    for wikt in wiktslist:
        if wikt in closedwiktlist:
            # pagetitles dumps are not generated for closed wiktionaries;
            # read the locally cached snapshot instead.
            ok, tmplist = getclosedwiktionarylemmas(working_dir, wikt)
        else:
            ok, tmplist = getlemmasforwiktionary(wikt, lastdate)
        if ok:
            myconn.executemany('update iws set "' + wikt + '" = 1 WHERE lemma = ?',
                               [(x,) for x in tmplist])
            wiktcounter += 1
        else:
            print(wikt + ' problem')
            unreadwikts.append(wikt)
        # NOTE(review): looks like a leftover debug limit — only the first few
        # wiktionaries are processed.  Kept to preserve current behavior;
        # remove this check to process every wiktionary.
        if wiktcounter > 3:
            break
    filename = os.path.join(working_dir, 'alllemmas.txt')
    print('Creating ' + filename + ' ...')
    with open(filename, 'wt') as f:
        for row in myconn.execute("select * from iws"):
            b = row['lemma'] + '\t'
            for lang in wiktslist:
                if row[lang] == 1:
                    b = b + lang + ' '
            f.write(b + '\n')
    myconn.close()
    #TODO:use unreadwikts in order to not touch iws that are included
    print("DONE")
    return "DONE"

def getactivewiktionarieslist():
    '''
    Download Wikimedia's all.dblist and return the language codes of
    every active wiktionary (lines of the form "<code>wiktionary").
    '''
    print('getting available wiktionaries...')
    request = urllib.request.Request(urlforactiveprojects,
                                     headers=justqueryingagentheaders)
    response = urllib.request.urlopen(request)
    raw = response.read()
    print('type(respData)')
    print(type(raw))
    text = raw.decode('utf8')
    print('Got available wiktionaries page...')
    print('creating available wiktionaries list...')
    lines = text.splitlines()
    print(len(lines))
    codes = []
    for entry in lines:
        # A wiktionary dbname splits into exactly two parts: the
        # language code and an empty trailing string.
        parts = entry.strip().split('wiktionary')
        if len(parts) == 2:
            codes.append(parts[0])
    print(len(codes))
    return codes

def getclosedwiktionarieslist(working_dir):
    '''
    Load the locally cached list of closed wiktionaries from
    <working_dir>/static/closedwiktionarieslist, one code per line.
    '''
    #TODO: Get list from updated url 
    print('getting available wiktionaries...')
    listpath = os.path.join(working_dir, 'static', 'closedwiktionarieslist')
    with open(listpath, 'rt') as f:
        return [line.strip() for line in f]

def getclosedwiktionarylemmas(working_dir, lang):
    '''
    Read cached titles of a closed wiktionary from <working_dir>/static/<lang>.

    Returns (True, [titles]) on success, (False, message) on failure.
    '''
    try:
        # BUG FIX: the file is opened in text mode, so iteration yields str;
        # the previous .decode('utf-8') raised AttributeError on every line
        # and the bare except silently reported every file as unreadable.
        with open(os.path.join(working_dir, 'static', lang), 'rt') as f:
            return True, [line.strip() for line in f]
    except OSError:  # narrowed from a bare except: only real I/O failures
        return False, 'Error reading local file'

def getavailabledates():
    '''
    Scrape the pagetitles index page and return the dump date directory
    names (strings like "20240101/", trailing slash included).
    '''
    print('getting available dates...')
    req = urllib.request.Request(urlofpagetitles, headers = justqueryingagentheaders)
    resp = urllib.request.urlopen(req)
    respData = resp.read()
    print('type(respData)')
    print(type(respData))
    alldata = respData.decode('utf8')
    # Explicit parser: BeautifulSoup without one emits a warning and may pick
    # different parsers (and results) depending on what is installed.
    soup = BeautifulSoup(alldata, 'html.parser')
    print('Got available dates page...')
    dates = []
    print('finding available dates...')
    for link in soup.find_all('a'):
        href = link.get('href')
        # BUG FIX: an <a> without href returned None and the slice crashed.
        if href and href[:-1].isdigit():
            dates.append(href)
    return dates
    
def getlemmasforwiktionary(wiktlang, whichdate):
    '''
    Download and decompress the "all-titles-in-ns-0" dump of one wiktionary.

    wiktlang  -- language code, e.g. 'el'
    whichdate -- date directory name with trailing slash, e.g. '20240101/'

    Returns (True, [titles]) on success, or (False, [error info]) on
    HTTP/URL failure.
    '''
    #TODO check if .gz is available and go to previous date?
    time.sleep(1)  # be polite to dumps.wikimedia.org between downloads
    url = urlofpagetitles + whichdate + wiktlang + 'wiktionary-' + whichdate[:-1] + '-all-titles-in-ns-0.gz'
    print('requesting list of ' + url)
    try:
        #TODO somehow add headers 
        with urllib.request.urlopen(url) as response:
            with gzip.GzipFile(fileobj=response) as uncompressed:
                alltitlesinwikt = [line.decode('utf-8').strip() for line in uncompressed]
    except HTTPError as e:  # must precede URLError (HTTPError subclasses it)
        print('The server couldn\'t fulfill the request.')
        print('Error code: ', e.code)
        return False, [e.code]
    except URLError as e:
        print('We failed to reach a server.')
        print('Reason: ', e.reason)
        # BUG FIX: URLError has no .code attribute — accessing it raised
        # AttributeError inside the handler; report the reason instead.
        return False, [e.reason]
    print('Got list from ' + url)
    return True, alltitlesinwikt
    
if __name__ == "__main__":
    # When run directly, the output file is created next to the script
    # itself — not in the current working directory.
    script_path = os.path.realpath(__file__)
    script_dir = os.path.dirname(os.path.abspath(script_path))
    getalllemmasonallwiktionariesinns0(script_dir)