#!/usr/bin/python
# -*- coding: utf-8 -*-
#
#v.0104
"""
Checks Translation tables in wiktionaries that use the schema:
{{translationtablestart|optionalexplanation}}
* {{isocode}} : {{templateforlanguage|isocode|translation}}
{{translationtablemiddle}}
{{translationtableend}}
Possibly using some template for redirecting translations
Creates sorted translation tables, plus one new table that collects all problematic
lines (if such a table does not already exist).
If a table is corrupted, only the problem is reported and that entry is skipped!
Can also be used to check for something in a dump.
"""
#TODO 3. get language from some conf file or, better, from dump's xml first tag .
import os
import sys
import simplexmlreader as xmlreader
import codecs
import argparse
#import clsfetchnewdump
from collections import OrderedDict
# Opened for writing (and therefore truncated) as soon as this module is
# loaded; main() closes it in its ``finally`` clause.
fproblematic=codecs.open('W_problematictable', 'w', 'utf-8')
# Default wiktionary language code; _getargs() uses it as the default for --lang.
mylang = u'el'
myproject = u'wiki'
#mydumpspath = u'/home/wiki/Λήψεις/'
# Path that main() passes to xmlreader.WikiDump.
mydumpspath = u'/media/FORMANY/wiki/dumps/'
# Status codes. findandchecktranslations() returns RETURNPROBLEMATIC when it
# gives up on a corrupted table.
RETURNPROBLEMATIC = 1
RETURNOK = 0
# Greek unaccented vowels and, at the same index, their accented forms
# (extractsomething4 maps one onto the other by position).
atona = u'αεηιουωϊϋ'#.split(',')
entona = u'άέήίόύώΐΰ'#.split(',')
#general function
def extract_lang_iso(wholeline):
    """Return the possible ISO code of a translation line.

    The code is taken from the first template of the line: the text that
    follows the first ``{{`` and ends at the next ``{{`` or at the first
    ``}}``, whichever comes first.

    Raises IndexError when no ``{{`` occurs before the first ``}}``.
    """
    before_first_close = wholeline.split('}}')[0]
    pieces = before_first_close.split('{{')
    return pieces[1]
def cleartranslationtable(whichtable):
    '''Delete, in place, every entry whose value is the empty string.

    These are the ISO codes without a translation (hidden or visible).

    whichtable: a dict or OrderedDict mapping ISO code -> table line.
    Returns the same (now pruned) mapping object; the order of the
    remaining entries is untouched.
    '''
    # Collect the keys first: deleting from a mapping while iterating over
    # it raises RuntimeError for a plain dict (and for OrderedDict on
    # Python 3).
    emptyisos = [iso for iso in whichtable if whichtable[iso] == '']
    for iso in emptyisos:
        del whichtable[iso]
    return whichtable
def writeunfinished(lemma):
    """Append *lemma* and a newline to the UTF-8 file ``W_unfinished``."""
    with codecs.open('W_unfinished', 'a', 'utf-8') as unfinished:
        record = lemma + u'\n'
        unfinished.write(record)
def extractsomething2(whichlemma, whichtext):
    """Ad-hoc extractor; its body is rewritten for each one-off query.

    Current query: if the page text contains the category
    u'Κατηγορία:Κατάλογοι που αφορούν την Κύπρο', append the page title
    to the file 'ΚατΚύπρ.txt'.

    whichlemma, whichtext: UTF-8 encoded byte strings (both are decoded).
    Returns 1 when the category was found, otherwise 0.
    """
    title = whichlemma.decode('utf-8')
    body = whichtext.decode('utf-8')
    if u'Κατηγορία:Κατάλογοι που αφορούν την Κύπρο' not in body:
        return 0
    with codecs.open('ΚατΚύπρ.txt', 'a', 'utf-8') as out:
        out.write(title + u'\n')
    return 1
def _proparoxytone_candidate(stem):
    """Build the possible proparoxytone masculine ("...μενος") of *stem*.

    The last three letters of *stem* are examined from the end; the first
    one that is listed in ``atona`` is replaced by the letter at the same
    index of ``entona`` and u'μενος' is appended.  Returns None when none
    of those letters is in ``atona``.
    """
    for offset in range(3):
        index = len(stem) - 1 - offset
        if index < 0:
            # Stem shorter than three letters: nothing left to examine.
            break
        vowel = atona.find(stem[index])
        if vowel != -1:
            # Keep everything after the vowel (the old slice started one
            # position too far and dropped a letter).
            return stem[:index] + entona[vowel] + stem[index + 1:] + u'μενος'
    return None


def extractsomething4(whichlemma, whichtext, lemmas):
    """Report feminine forms in "-μένη" that have no participle-form section.

    Entries that have an adjective-form section are reported as well; they
    are only flagged differently.

    For a title ending in u'μένη' whose text lacks
    u'==={{μορφή μετοχής|el}}===', one tab-separated line is appended to
    'θηλμετοχ.txt':

    * first field u'*1' when the text contains
      u'==={{μορφή επιθέτου|el}}===', else u'*0';
    * second field u'1' when the paroxytone masculine ("-μένος") is in
      *lemmas* (the proparoxytone one is then not looked up), u'2' when
      the proparoxytone masculine ("-μενος" with the stress moved back) is
      in *lemmas*, u'3' when neither is.  In the last case the masculine
      may really not exist, or may simply not have an entry yet;
    * then the linked title and, for u'1'/u'2', the masculine found.

    whichlemma, whichtext: UTF-8 encoded byte strings.
    lemmas: container of unicode titles supporting ``in``.
    Returns 1 when a line was written, otherwise 0.
    """
    title = whichlemma.decode('utf-8')
    text = whichtext.decode('utf-8')
    if not title.endswith(u'μένη'):
        return 0
    if u'==={{μορφή μετοχής|el}}===' in text:
        return 0

    if u'==={{μορφή επιθέτου|el}}===' in text:
        adjectiveflag = u'*1\t'
    else:
        adjectiveflag = u'*0\t'

    # "-μένη" -> "-μένος": the paroxytone masculine.
    paroxytone = title[:-1] + u'ος'
    # The masculine may instead be proparoxytone; None when no unaccented
    # vowel could be stressed, in which case there is nothing to look up.
    proparoxytone = _proparoxytone_candidate(title[:-4])

    if paroxytone in lemmas:
        report = adjectiveflag + u'1\t[[' + title + u']]\t' + paroxytone + u'\n'
    elif proparoxytone is not None and proparoxytone in lemmas:
        report = adjectiveflag + u'2\t[[' + title + u']]\t' + proparoxytone + u'\n'
    else:
        report = adjectiveflag + u'3\t[[' + title + u']]\n'

    with codecs.open('θηλμετοχ.txt', 'a', 'utf-8') as f:
        f.write(report)
    return 1
def extractsomething3(whichlemma, whichtext):
    """Ad-hoc extractor; its body is rewritten for each one-off query.

    Current query: split the page on language headers of the form
    ``=={{-xx-}}==`` and walk through the sections.

    * More than one piece: the header codes are walked (the collected
      mapping is not used further) and ``(0, '')`` is returned.
    * A single piece (no header terminator found): the title and the first
      line of the text are appended to the file named by ``fredirect``
      (text starts with a redirect keyword) or ``fother`` (anything else),
      and ``(0, '')`` is returned.
    * Any IndexError while walking the pieces gives
      ``(1, title + u'\\tindex error\\n')``.

    whichlemma, whichtext: UTF-8 encoded byte strings.
    NOTE(review): ``fredirect`` and ``fother`` are not defined in the part
    of the file visible here - confirm they exist before using this path.
    """
    title = whichlemma.decode('utf-8')
    text = whichtext.decode('utf-8')
    langs = {}
    pieces = text.split('-}}==\n')
    try:
        if len(pieces) > 1:
            currentlang = pieces[0].split('=={{-')[1]
            # The last piece is deliberately left out, as in the index
            # range 1 .. len-2.
            for piece in pieces[1:-1]:
                section = piece.split('\n=={{-')
                langs[currentlang] = section[0]
                currentlang = section[1]
            return 0, ''
        if len(pieces) == 0:
            return 1, title + u'\tno lang\n'
        if text.startswith(u'#ΑΝΑΚΑΤΕΥΘΥΝΣΗ') or text.startswith(u'#REDIRECT'):
            with codecs.open(fredirect, 'a', 'utf-8') as out:
                out.write(title + u'\t' + text.split('\n')[0] + u'\n')
        else:
            with codecs.open(fother, 'a', 'utf-8') as out:
                out.write(u'* [[' + title + u']]\t#' + text.split('\n')[0] + u'#\n')
        return 0, ''
    except IndexError:
        return 1, title + u'\tindex error\n'
def extractsomething(whichlemma, whichtext):
    """Ad-hoc extractor; its body is rewritten for each one-off query.

    Current query, for pages whose text contains u'{{język starogrecki':

    * if the text has no ``[[el:<title>]]`` link, append the title to
      'noiwel.txt' and print it;
    * if the text does not contain u'{{język nowogrecki', append the
      title to 'noelinpl.txt'.

    whichlemma, whichtext: UTF-8 encoded byte strings.
    Returns 1 for such a page, otherwise 0.
    """
    title = whichlemma.decode('utf-8')
    text = whichtext.decode('utf-8')
    if u'{{język starogrecki' not in text:
        return 0
    interwiki = u'[[el:' + title + u']]'
    if interwiki not in text:
        with codecs.open('noiwel.txt', 'a', 'utf-8') as out:
            out.write(title + u'\n')
        print(title)
    if u'{{język nowogrecki' not in text:
        with codecs.open('noelinpl.txt', 'a', 'utf-8') as out:
            out.write(title + u'\n')
    return 1
class LangStrings():
    '''Access to the translation-table strings of one wiktionary language.

    The strings and the ISO code -> language name mapping come from the
    module ``wiktlangdata.<whichlang>``; sort keys come from the module
    ``sorting``.  isosinorder() builds an ordered dict with the ISO codes
    of all known languages, for use with translation tables.
    '''

    def __init__(self, whichlang):
        # Import order kept: language data first, then the sorting module.
        self.langlangs = __import__('wiktlangdata.%s' % whichlang, fromlist=['wiktlangdata'])
        self.whichlang = whichlang
        self.langsort = __import__('sorting')
        self.sortednames = []
        self.sortedisos = []

    def isocodeisknown(self, isocode):
        '''Tell whether *isocode* is a key of the language-name mapping.'''
        knownnames = self.langlangs.langnames
        return isocode in knownnames

    def startoftablestring(self):
        '''Return ``startoftablestring`` of the language data module.'''
        data = self.langlangs
        return data.startoftablestring

    def middleoftablestring(self):
        '''Return ``middleoftablestring`` of the language data module.'''
        data = self.langlangs
        return data.middleoftablestring

    def endoftablestring(self):
        '''Return ``endoftablestring`` of the language data module.'''
        data = self.langlangs
        return data.endoftablestring

    def redirectionsstring(self):
        '''Return ``redirectionsstring`` of the language data module.'''
        data = self.langlangs
        return data.redirectionsstring

    def problemtablestring(self):
        '''Return ``problemtablestring`` of the language data module.'''
        data = self.langlangs
        return data.problemtablestring

    def isosinorder(self):
        '''Build an ordered dict of all known ISO codes with empty values.

        The codes are ordered by ``sorting.Word(whichlang, language name)``.
        The result is also kept as ``self._readydict`` (the very same
        object that is returned).  Time consuming.
        '''
        names = self.langlangs.langnames
        makekey = self.langsort.Word
        lang = self.whichlang

        def bylanguagename(pair):
            return makekey(lang, names[pair[0]])

        ordered = OrderedDict()
        for pair in sorted(names.items(), key=bylanguagename):
            ordered[pair[0]] = u''
        self._readydict = ordered
        return ordered
class ProcessTranslationsOfLemma():
    """Finds and checks the translation tables in the text of one entry.

    usingLangStrings must provide startoftablestring(),
    middleoftablestring(), endoftablestring(), redirectionsstring(),
    problemtablestring() and isocodeisknown(iso), as LangStrings does.
    """

    def __init__(self, whichlang, whichlemma, whichtext,usingLangStrings):
        """ Constructor

        whichlang: not used by this class.
        whichlemma, whichtext: UTF-8 encoded byte strings (title and text).
        usingLangStrings: provider of the table strings (see class doc).
        """
        self.ls = usingLangStrings
        self.lemma = whichlemma.decode('utf-8')
        _text = whichtext.decode('utf-8')
        # Unescape the XML entities.  '&amp;' must be last, otherwise
        # '&amp;lt;' would be turned into '<' instead of '&lt;'.
        for _entity, _char in ((u'&gt;', u'>'), (u'&lt;', u'<'),
                               (u'&quot;', u'"'), (u'&amp;', u'&')):
            _text = _text.replace(_entity, _char)
        _trstart = _text.find(self.ls.startoftablestring())
        self.notranslationsstart = False
        if _trstart >= 0:
            # Drop everything before the first table.  Index 0 is a valid
            # hit too (e.g. a text that consists of a table only).
            _text = _text[_trstart:]
        elif (_text.find(self.ls.middleoftablestring()) == -1
              and _text.find(self.ls.endoftablestring()) == -1):
            # No start, no middle, no end: nothing to check.  A middle or
            # end without a start is left for findandchecktranslations.
            self.notranslationsstart = True
        self.lines = _text.splitlines()
        self.tablescounter = 0 #will hold the number of translation tables found
        self.translations = {} #will hold which is the "title" of every translation table found
        self.alltraslationtables = {} #all finished tables, see findandchecktranslations
        self.onetranslationtable = {} #will hold contents of one found translation table
        #iso code as key and full line as value
        self.doubletranslation = False
        self.started = False
        self.tablewithproblems = {} #single table that will hold all problems found in one table
        self.problemsfoundintable = 0

    def _recordproblem(self, oneline):
        """Store oneline in the problems table under the next counter value."""
        self.problemsfoundintable += 1
        self.tablewithproblems[u'%d' % self.problemsfoundintable] = oneline

    def _reportcorrupted(self, logname, message=None):
        """Log the lemma of a corrupted entry and return RETURNPROBLEMATIC.

        The lemma is appended to the file logname; when message is given
        it is also printed together with the lemma.
        """
        if message is not None:
            print(u'%s   ( %s )' % (message, self.lemma))
        with codecs.open(logname, 'a', 'utf-8') as fproblem:
            fproblem.write(self.lemma + u'\n')
        return RETURNPROBLEMATIC

    def _processpossibletranslation(self, oneline):
        """Files a translation line under its ISO code.

        Assumes a line with hidden or normal translation.  A line whose
        ISO code is unknown, or already present in the current table
        (double translation), goes to the problems table instead.
        """
        langISO = extract_lang_iso(oneline)
        if self.ls.isocodeisknown(langISO) and langISO not in self.onetranslationtable:
            self.onetranslationtable[langISO] = oneline
        else:
            self._recordproblem(oneline)

    def _isdummyline(self, oneline):
        """True for the "middle of table" line and for blank lines."""
        return (oneline.startswith(self.ls.middleoftablestring())
                or oneline.strip() == '')

    def findandchecktranslations(self):
        """Checks translations of one entry for errors.

        Returns a dict with one item per finished table, keyed by the
        table number (as text, starting from u'1').  Each value is a tuple

            (title line, translations, problems, redirection)

        * translations: OrderedDict, ISO code -> full line, in the order
          the lines were found;
        * problems: None, or a tuple (problemtablestring(), dict) whose
          dict maps a counter (as text) to each problematic line;
        * redirection: None, or the single redirection line of the table.

        Returns an empty dict when the text has no table at all.
        If a table is corrupted (started twice, ended without start, two
        redirections, not ended) the lemma is appended to a log file and
        RETURNPROBLEMATIC is returned instead of a dict.
        """
        tablesfound = 0
        tabletitle = u''
        self.problemsfoundintable = 0
        self.alltraslationtables = {}
        self.tablewithproblems = {}
        self.onetranslationtable = OrderedDict()
        #only one line with redirection
        self.redirectionstr = u''
        redirectfound = False
        if self.notranslationsstart:
            return self.alltraslationtables
        tablestart = self.ls.startoftablestring()
        tableend = self.ls.endoftablestring()
        for oneline in self.lines:
            if oneline.startswith(tablestart):
                if self.started:
                    # Previous table was never closed.  Do not search that
                    # title until the problem is fixed.
                    return self._reportcorrupted('unfinished.txt')
                tablesfound += 1
                tabletitle = oneline
                self.problemsfoundintable = 0
                self.started = True
                self.onetranslationtable = OrderedDict()
                self.tablewithproblems = {}
                self.redirectionstr = u''
                redirectfound = False
            elif oneline.startswith(tableend):
                if not self.started:
                    return self._reportcorrupted(
                        'notstarted.txt', u'table not started..........')
                problems = None
                if len(self.tablewithproblems):
                    # Store the string itself, not the bound method.
                    problems = self.ls.problemtablestring(), self.tablewithproblems
                redirection = self.redirectionstr if redirectfound else None
                self.alltraslationtables[u'%d' % tablesfound] = (
                    tabletitle, self.onetranslationtable, problems, redirection)
                #not inside a table after here
                self.started = False
            elif self.started:
                #TODO change it to specific lang start
                if oneline.startswith(u'* {{') or oneline.startswith(u'<!-- * {{'):
                    #possible translation visible or hidden
                    self._processpossibletranslation(oneline)
                elif self._isdummyline(oneline):
                    #nothing to do. Do not add that line.
                    continue
                elif oneline.startswith(self.ls.redirectionsstring()):
                    if redirectfound:
                        return self._reportcorrupted(
                            'tworedirs.txt',
                            u'table with at least two redirections________________')
                    redirectfound = True
                    self.redirectionstr = oneline
                else:
                    #problematic line, add it to problematic table
                    self._recordproblem(oneline)
            #lines outside any table are ignored
        if self.started:
            #last table has not been closed
            return self._reportcorrupted('unfinished.txt')
        return self.alltraslationtables
def checktranslation(langofwiktionary, lemma, lemmatext, usingLangStrings):
    """Check the translation tables of one entry and print them re-sorted.

    A ProcessTranslationsOfLemma is built from the arguments and its
    findandchecktranslations() result is examined:

    * RETURNPROBLEMATIC: a one-line notice is printed;
    * a non-empty dict: for every table (keys in sorted() order) the title,
      the translation lines in the order of usingLangStrings._readydict,
      the "middle" template after half of the lines and the "end" template
      are printed, followed by the redirection line and the table of
      problematic lines when the table has them;
    * an empty dict: nothing is printed.

    usingLangStrings._readydict must already exist (isosinorder() creates
    it).  Nothing is returned.
    """
    #print "Working on..." , lemma
    b = usingLangStrings
    #get sorted isocodes
    #emptysortedisos = b.isosinorder()
    #exit()
    mycenter = b.middleoftablestring()
    myend = b.endoftablestring()
    m = ProcessTranslationsOfLemma(langofwiktionary, lemma, lemmatext, b)
    lemmaalltables = m.findandchecktranslations()
    if lemmaalltables == RETURNPROBLEMATIC:
        print 'problematic tables in ', lemma
    elif len(lemmaalltables):
        #print '============================ START OF TABLES FOR: ', lemma , ' =============================='
        for tablecounter, onetable in enumerate(sorted(lemmaalltables)):
            #print '===== Table num: ', tablecounter + 1, '====='
            # Each value is (title, translations, problems-or-None, redirection-or-None).
            tabletitle = lemmaalltables[onetable][0]
            unsortedtablewithtranslations = lemmaalltables[onetable][1]
            hasproblems = (lemmaalltables[onetable][2] != None)
            if hasproblems:
                tablewithproblemsstring = lemmaalltables[onetable][2][0]
                tablewithproblems = lemmaalltables[onetable][2][1]
            tablewithredirect = lemmaalltables[onetable][3]
            print 'Tablestart: '
            # Half of the lines, rounded up (relies on Python 2 integer
            # division rounding towards minus infinity).
            middleoftable = - (len(unsortedtablewithtranslations) / -2)
            linesprintedcounter = 0
            print tabletitle
            #sort table and insert Template for middle of table
            # Work on a copy of the pre-sorted, empty ISO dict so that the
            # shared _readydict itself is never filled in.
            sortedtable = OrderedDict(b._readydict)
            unsortedtablewithtranslations = cleartranslationtable(unsortedtablewithtranslations)
            print sortedtable
            print unsortedtablewithtranslations
            for x in unsortedtablewithtranslations:
                sortedtable[x] = unsortedtablewithtranslations[x]
            print 'sorted...'
            # Drop the ISO codes that have no line in this table.
            sortedtable = cleartranslationtable(sortedtable)
            # NOTE: an earlier, now disabled, version built lists of the
            # old and the new line order here and compared them, to report
            # whether the table was already sorted (the place of the
            # "middle" template was not checked).
            print tabletitle , ' is unsorted or with problems'
            linesprintedcounter = 0
            for x in sortedtable:
                linesprintedcounter += 1
                print sortedtable[x]
                if linesprintedcounter == middleoftable:
                    print mycenter
            print myend
            print 'Table end: ', tablecounter + 1, '---------'
            if tablewithredirect != None:
                #pass
                print 'Table with redirect start: ', tabletitle
                print tablewithredirect
                print myend
                print 'end table with redirect: ', tablecounter, '---------'
                #exit()
            if hasproblems:
                #pass
                middleoftable = - (len(tablewithproblems) / -2)
                linesprintedcounter = 0
                print 'Table with problems start: ????????'
                print b.problemtablestring()
                print tablewithproblems
                for tableline in tablewithproblems:
                    # Here the "middle" template goes before the line that
                    # follows the first half.
                    if linesprintedcounter == middleoftable:
                        print mycenter
                        #exit()
                    print tableline, tablewithproblems[tableline] #'.....', tableline, lemmaalltables[x][1][tableline]
                    linesprintedcounter += 1
                print myend
                print 'end table with problems. ', '????????'
                #exit()
        print '============================ END OF TABLES FOR: ', lemma , ' =============================='
    #else:
        #print '============================ no translation tables for: ', lemma
def main2(langofwiktionary):
    """Read the UTF-8 file ``translation.txt``; the text is not used further.

    Scratch entry point (for checking the class?).  *langofwiktionary*
    is not used.
    """
    with codecs.open('translation.txt', 'r', 'utf-8') as source:
        contents = source.read()
def main(passedargs):
    """Run the checks selected on the command line.

    passedargs is the tuple built by _getargs().  Items used here:
    [1] language of the wiktionary, [2] and [3] are handed on to
    xmlreader.WikiDump, [4] "single file" mode, [6] "extract something"
    mode, [7] name of another wiki project (empty means u'wiktionary').

    * Single file mode: the table in ``onetranslation.txt`` is checked.
    * Otherwise the dump is parsed and every entry of namespace '0' is
      either passed to extractsomething4 (extract mode) or to
      checktranslation.

    ``fproblematic`` is always closed at the end.
    """
    langofwiktionary = passedargs[1]
    try:
        usingLangStrings = LangStrings(langofwiktionary)
        # Also creates the empty sorted _readydict that checktranslation needs.
        usingLangStrings.isosinorder()
    except Exception as err:
        # Best effort on purpose: extract mode works without the language
        # data.  Say so instead of failing silently; KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        sys.stderr.write('could not load language data for %s: %r\n'
                         % (langofwiktionary, err))
        usingLangStrings = None
    try:
        if passedargs[4]:
            # just read a file with translations
            with open('onetranslation.txt', 'r') as f:
                onetransltext = f.read()
            checktranslation(langofwiktionary, 'dummy', onetransltext, usingLangStrings)
        else:
            if len(passedargs[7]):
                wikiproject = passedargs[7]
            else:
                wikiproject = u'wiktionary'
            dump = xmlreader.WikiDump(langofwiktionary, wikiproject, mydumpspath,
                                      passedargs[2], passedargs[3])
            if dump._ISOK:
                # Find the file that holds the bare titles.
                dumpfilename, titlesfilename = dump.workingdump_filenames()
                # Keep only the titles (without position etc.).  A set,
                # because extractsomething4 tests membership for every entry.
                with codecs.open(titlesfilename, 'r', 'utf-8') as ftemp:
                    onlylemmas = set(
                        line.split('<title>')[1].split('</title>')[0]
                        for line in ftemp)
                lemmascounter = 0
                found = 0
                for entry in dump.parse():
                    lemmascounter += 1
                    if entry.ns != '0':
                        continue
                    if passedargs[6]:
                        # Extraction may use the titles as well.
                        if extractsomething4(entry.title, entry.text, onlylemmas):
                            found += 1
                            print('%s / %s' % (found, lemmascounter))
                    else:
                        checktranslation(langofwiktionary, entry.title, entry.text, usingLangStrings)
            else:
                print('not all ok')
    finally:
        fproblematic.close()
        print("Έτοιμα!")
def getonearticle(articlename=u''):
    """Placeholder for fetching one article; only prints a notice and exits.

    articlename: accepted (and currently ignored) because _getargs() calls
    this function with the article name.

    Raises SystemExit.
    """
    print('came here to get one one article...')
    # sys.exit() instead of the interactive helper exit(), which only
    # exists when the site module has been loaded.
    sys.exit()
def _getargs():
    '''Parse the command line and return the settings tuple used by main().

    Options:
    -a, --articlename       article name (only looked at together with -o)
    -c, --checklatest       flag, returned as item 2
    -e, --extractsomething  flag, returned as item 6
    -g, --getlatest         flag, returned as item 3
    -n, --onlyone           check a single translation table stored in
                            a text file named onetranslation.txt
    -o, --online            with a non-empty article name: print a notice
                            and exit
    -s, --something         flag, returned as item 5
    -w, --wikiproject       other wikimedia project
    -l, --lang              language of the wiktionary (an iso code);
                            defaults to ``mylang``

    Returns the tuple (onlyone, lang, checklatest, getlatest, onlyone,
    something, extractsomething, wikiproject), where wikiproject is u''
    when the option was not given.
    '''
    _parser = argparse.ArgumentParser()
    _parser.add_argument("-a", "--articlename", default = u'')
    _flags = (
        ("-c", "--checklatest"),
        ("-e", "--extractsomething"),
        ("-g", "--getlatest"),
        ("-n", "--onlyone"),
        ("-o", "--online"),
        ("-s", "--something"),
    )
    for _short, _long in _flags:
        _parser.add_argument(_short, _long, action="store_true")
    _parser.add_argument("-w", "--wikiproject") #other wikimedia project
    # The default language is given at the start of this script.
    _parser.add_argument("-l", "--lang", default = mylang)

    _args = _parser.parse_args()
    _lang = _args.lang if len(_args.lang) else mylang

    #TODO report misuse (online without an article name)
    if _args.online and _args.articlename != u'':
        print('came here?')
        exit()
        # Not reached while the exit() above is in place.
        getonearticle(_args.articlename) #will save an xml file as 'onetranslation.txt'

    if _args.wikiproject is not None:
        whichwikiproject = _args.wikiproject
    else:
        whichwikiproject = u''

    return (_args.onlyone, _lang, _args.checklatest, _args.getlatest,
            _args.onlyone, _args.something, _args.extractsomething,
            whichwikiproject)
# Script entry point: parse the command line and run the selected checks.
if __name__=="__main__":
    g = _getargs()
    main(g)  # pass the whole settings tuple built by _getargs()