Χρήστης:Vanished user Xorisdtbdfgonugyfs/scripts/findanything
(Ανακατεύθυνση από Χρήστης:Xoristzatziki/scripts/findanything)
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
#v.0104
"""
Checks translation tables in wiktionaries that use the schema:

{{translationtablestart|optionalexplanation}}
* {{isocode}} : {{templateforlanguage|isocode|translation}}
{{translationtablemiddle}}
{{translationtableend}}

possibly using some template for redirecting translations.

Creates sorted translation tables plus one new table which includes all
problematic lines (if such a table does not already exist).
If a table is corrupted, only the problem is reported and processing of
that entry stops.

Can also be used to check/extract something else from a dump (see the
extractsomething* functions).

NOTE: this is Python 2 code (byte strings are decoded explicitly and the
``unicode`` builtin is used).  Every print is written as a single-argument
call so the output is the same as with the Python 2 print statement while
the syntax also parses on Python 3.
"""
#TODO 3. get language from some conf file or, better, from dump's xml first tag .
import os
import sys
import simplexmlreader as xmlreader
import codecs
import argparse
#import clsfetchnewdump
from collections import OrderedDict

# Opened at import time; main() closes it in its ``finally`` clause.
fproblematic = codecs.open('W_problematictable', 'w', 'utf-8')
mylang = u'el'
myproject = u'wiki'
#mydumpspath = u'/home/wiki/Λήψεις/'
mydumpspath = u'/media/FORMANY/wiki/dumps/'

# Return codes of ProcessTranslationsOfLemma.findandchecktranslations.
RETURNPROBLEMATIC = 1
RETURNOK = 0

# Greek vowels without / with accent; the two strings are index-aligned so
# that entona[i] is the accented form of atona[i].
atona = u'αεηιουωϊϋ'
entona = u'άέήίόύώΐΰ'


#general function
def extract_lang_iso(wholeline):
    """Extract the possible ISO code from a translation line.

    The ISO code is the content of the first template ("{{...}}") found
    in the line.  Raises IndexError if the text before the first "}}"
    contains no "{{".
    """
    return wholeline.split('}}')[0].split('{{')[1]


def cleartranslationtable(whichtable):
    """Delete every key whose value is the empty string.

    The dict is modified in place and also returned.  Keys are
    snapshotted first because deleting from a dict while iterating over
    it is not safe in general.
    """
    for isocode in list(whichtable):
        if whichtable[isocode] == '':
            del whichtable[isocode]
    return whichtable


def writeunfinished(lemma):
    """Append *lemma* (unicode) as one line to the file 'W_unfinished'."""
    with codecs.open('W_unfinished', 'a', 'utf-8') as f:
        f.write(lemma + u'\n')


def extractsomething2(whichlemma, whichtext):
    """Log titles of pages that are in a specific (Cyprus lists) category.

    This function is constantly changed to extract something.
    Both arguments are UTF-8 byte strings.  Returns 1 if the page text
    contains the category (the title is appended to 'ΚατΚύπρ.txt'),
    otherwise 0.
    """
    a = whichlemma.decode('utf-8')
    b = whichtext.decode('utf-8')
    if u'Κατηγορία:Κατάλογοι που αφορούν την Κύπρο' in b:
        with codecs.open('ΚατΚύπρ.txt', 'a', 'utf-8') as f:
            f.write(a + u'\n')
        return 1
    return 0


def extractsomething4(whichlemma, whichtext, lemmas):
    """Find feminine forms in "-μένη" lacking a "μορφή μετοχής" section.

    Of course this also returns the ones that do have a "μορφή επιθέτου"
    (adjective form) section; those are marked with "*1" instead of "*0"
    in the output.

    whichlemma, whichtext -- UTF-8 byte strings (title and page text).
    lemmas -- container of all known titles; only ``in`` is used on it.

    One line is appended to 'θηλμετοχ.txt' for every match, tagged:
      1 -- the paroxytone masculine ("-μένος") exists in *lemmas*
      2 -- the proparoxytone masculine ("-μενος") exists in *lemmas*
      3 -- neither exists
    Returns 1 when a line was written, otherwise 0.
    """
    a = whichlemma.decode('utf-8')
    b = whichtext.decode('utf-8')
    if not a.endswith(u'μένη'):
        return 0
    if u'==={{μορφή μετοχής|el}}===' in b:
        return 0

    exeiepitheto = u'*0\t'
    if u'==={{μορφή επιθέτου|el}}===' in b:
        exeiepitheto = u'*1\t'

    # The paroxytone masculine of the word.
    c = a[:len(a) - 1] + u'ος'
    # But the masculine may be proparoxytone: take the stem before "μένη"
    # and accent the last unaccented vowel among its last three letters.
    d = a[:len(a) - 4]
    for xcounter in range(3):
        # Which letter to check.
        index = len(d) - 1 - xcounter
        if index < 0:
            # Stem shorter than three letters: nothing left to inspect.
            # (Without this guard an empty slice would count as a vowel.)
            break
        position = atona.find(d[index])
        if position >= 0:
            # It is an (unaccented, of course) vowel: replace it with the
            # corresponding accented one and build the "probable
            # proparoxytone masculine".  Everything after the vowel is
            # kept; the original slice started one letter too late.
            d = d[:index] + entona[position] + d[index + 1:] + u'μενος'
            break

    if c in lemmas:
        # The paroxytone masculine exists; the proparoxytone one is then
        # not checked.
        with codecs.open('θηλμετοχ.txt', 'a', 'utf-8') as f:
            f.write(exeiepitheto + u'1\t[[' + a + u']]\t' + c + u'\n')
        return 1
    if d in lemmas:
        # The proparoxytone masculine exists.
        with codecs.open('θηλμετοχ.txt', 'a', 'utf-8') as f:
            f.write(exeiepitheto + u'2\t[[' + a + u']]\t' + d + u'\n')
        return 1
    # Neither the paroxytone nor the proparoxytone exists.  It may really
    # not exist (e.g. "Ωραία Κοιμωμένη"), or it may simply not have been
    # entered yet.  These could go to a separate file but they turn out
    # to be few.
    with codecs.open('θηλμετοχ.txt', 'a', 'utf-8') as f:
        f.write(exeiepitheto + u'3\t[[' + a + u']]\n')
    return 1


def extractsomething3(whichlemma, whichtext):
    """Split a page into language sections and report malformed pages.

    This function is constantly changed to extract something.
    Both arguments are UTF-8 byte strings.  Returns a tuple
    (status, message): (0, '') normally, or (1, text) when the page could
    not be split into "=={{-xx-}}==" sections.

    NOTE(review): ``fredirect`` and ``fother`` are not defined anywhere in
    the visible source, so the branch for pages without language headers
    raises NameError as written -- confirm the intended file names.
    """
    a = whichlemma.decode('utf-8')
    b = whichtext.decode('utf-8')
    langs = {}
    langssplit1 = b.split('-}}==\n')
    try:
        if len(langssplit1) > 1:
            prevlang = langssplit1[0].split('=={{-')[1]
            for xcounter in range(1, len(langssplit1) - 1):
                splitted = langssplit1[xcounter].split('\n=={{-')
                langs[prevlang] = splitted[0]
                # IndexError here means a header is malformed.
                prevlang = splitted[1]
            return 0, ''
        elif len(langssplit1) == 0:
            # NOTE(review): str.split with a separator never returns an
            # empty list, so this branch looks unreachable.
            return 1, a + u'\tno lang\n'
        else:
            if b.startswith(u'#ΑΝΑΚΑΤΕΥΘΥΝΣΗ') or b.startswith(u'#REDIRECT'):
                with codecs.open(fredirect, 'a', 'utf-8') as f:
                    f.write(a + u'\t' + b.split('\n')[0] + u'\n')
            else:
                with codecs.open(fother, 'a', 'utf-8') as f:
                    f.write(u'* [[' + a + u']]\t#' + b.split('\n')[0] + u'#\n')
            return 0, ''
    except IndexError:
        return 1, a + u'\tindex error\n'


def extractsomething(whichlemma, whichtext):
    """Check pages containing '{{język starogrecki'.

    This function is constantly changed to extract something.
    Both arguments are UTF-8 byte strings.  For a page containing
    '{{język starogrecki':
      * if it has no '[[el:<title>]]' link the title is appended to
        'noiwel.txt' and printed;
      * if it does not contain '{{język nowogrecki' the title is appended
        to 'noelinpl.txt'.
    Returns 1 for such a page, otherwise 0.
    """
    a = whichlemma.decode('utf-8')
    b = whichtext.decode('utf-8')
    if u'{{język starogrecki' not in b:
        return 0
    if u'[[el:' + a + u']]' not in b:
        with codecs.open('noiwel.txt', 'a', 'utf-8') as f:
            f.write(a + u'\n')
        print(a)
    if u'{{język nowogrecki' not in b:
        with codecs.open('noelinpl.txt', 'a', 'utf-8') as f:
            f.write(a + u'\n')
    return 1


class LangStrings(object):
    '''Reads the translation-table strings for one wiktionary language.

    The data comes from the module ``wiktlangdata.<whichlang>``; sorting
    of language names uses the module ``sorting`` (its ``Word`` is used as
    the sort key).  Can create a sorted dict with the iso codes of all
    known languages, to be used with translation tables.
    '''

    def __init__(self, whichlang):
        self.langlangs = __import__('wiktlangdata.%s' % whichlang,
                                    fromlist=['wiktlangdata'])
        self.whichlang = whichlang
        self.langsort = __import__('sorting')
        self.sortednames = []
        self.sortedisos = []
        # Filled by isosinorder(); defined here so that reading it before
        # that call yields an empty table instead of AttributeError.
        self._readydict = OrderedDict()

    def isocodeisknown(self, isocode):
        """Return True if *isocode* is a key of the language-names table."""
        return (isocode in self.langlangs.langnames)

    def startoftablestring(self):
        """Return the string that starts a translation table."""
        return self.langlangs.startoftablestring

    def middleoftablestring(self):
        """Return the string that marks the middle of a translation table."""
        return self.langlangs.middleoftablestring

    def endoftablestring(self):
        """Return the string that ends a translation table."""
        return self.langlangs.endoftablestring

    def redirectionsstring(self):
        """Return the string that starts a redirection line."""
        return self.langlangs.redirectionsstring

    def problemtablestring(self):
        """Return the string used for the table of problematic lines."""
        return self.langlangs.problemtablestring

    def isosinorder(self):
        '''Create an ordered dict of all known iso codes, sorted by the
        language name in the specified language, with an empty unicode
        string as the value of each one.

        The dict is also stored in ``self._readydict``.
        Time consuming....
        '''
        whichlang = self.whichlang
        makeword = self.langsort.Word
        b = OrderedDict(sorted(self.langlangs.langnames.items(),
                               key=lambda t: makeword(whichlang, t[1])))
        #clear values
        for x in b:
            b[x] = u''
        self._readydict = b
        #return empty dictionary with lang codes sorted by language names
        return b


class ProcessTranslationsOfLemma(object):
    """Parses and checks the translation tables of one entry."""

    def __init__(self, whichlang, whichlemma, whichtext, usingLangStrings):
        """Constructor.

        whichlang -- language of the wiktionary (not used here).
        whichlemma, whichtext -- UTF-8 byte strings (title and page text,
            the text still XML-escaped).
        usingLangStrings -- a LangStrings instance.
        """
        self.ls = usingLangStrings
        self.lemma = whichlemma.decode('utf-8')
        _text = whichtext.decode('utf-8')
        # Unescape XML character entities.  In the source as found, both
        # arguments of each replace were identical (a no-op); the
        # "unescape" / "must be last" comments show the entities were
        # intended.
        _text = _text.replace('&gt;', '>')
        _text = _text.replace('&lt;', '<')
        _text = _text.replace('&quot;', '"')
        _text = _text.replace('&amp;', '&') #must be last
        _trstart = _text.find(self.ls.startoftablestring())
        self.notranslationsstart = False
        if _trstart >= 0:
            # find() returns 0 for a table that opens the text, so the
            # test must include 0.
            _text = _text[_trstart:]
        elif (_text.find(self.ls.middleoftablestring()) > -1
              or _text.find(self.ls.endoftablestring()) > -1):
            #no start of tables but middle or end
            pass
        else:
            self.notranslationsstart = True
        self.lines = _text.splitlines()
        self.tablescounter = 0 #will hold the number of translation tables found
        self.translations = {} #will hold which is the "title" of every translation table found
        self.alltraslationtables = {}
        #will hold contents of one found translation table,
        #iso code as key and full line as value
        self.onetranslationtable = {}
        self.doubletranslation = False
        self.started = False
        self.tablewithproblems = {} #single table that will hold all problems found in all tables
        self.problemsfoundintable = 0

    def _recordproblem(self, oneline):
        """Store *oneline* in the problems table under the next counter."""
        self.problemsfoundintable += 1
        self.tablewithproblems[unicode(self.problemsfoundintable)] = oneline

    def _processpossibletranslation(self, oneline):
        """Check a translation line and file it under its ISO code.

        Assumes a line with a hidden or normal translation.  A line whose
        ISO code is unknown, or already present in the current table,
        goes to the problems table instead.
        """
        langISO = extract_lang_iso(oneline)
        if not self.ls.isocodeisknown(langISO):
            # unknown iso code
            self._recordproblem(oneline)
        elif langISO in self.onetranslationtable:
            # second line for the same language
            self._recordproblem(oneline)
        else:
            self.onetranslationtable[langISO] = oneline

    def _isdummyline(self, oneline):
        """Return True for the middle-of-table line or a blank line."""
        if oneline.startswith(self.ls.middleoftablestring()):#center of table
            return True
        return oneline.strip() == ''#empty line

    def findandchecktranslations(self):
        """Check the translations of one entry for errors.

        Processes the lines of the entry.  Returns RETURNPROBLEMATIC when
        the table structure is broken (table started twice, ended without
        start, never ended, or containing two redirection lines); the
        lemma is then appended to 'unfinished.txt', 'notstarted.txt' or
        'tworedirs.txt' respectively.

        Otherwise returns the dict ``self.alltraslationtables``.  Its keys
        are the table counter as unicode text; each value is a tuple
        (title line,
         OrderedDict iso code -> line,
         None or (problem table string, dict counter -> problematic line),
         None or the redirection line).
        """
        tablesfound = 0
        tabletitle = u''
        self.problemsfoundintable = 0
        self.alltraslationtables = {}
        self.tablewithproblems = {}
        self.onetranslationtable = OrderedDict()
        #only one line with redirection
        self.redirectionstr = u''
        redirectfound = False
        # Start outside any table even if a previous call stopped inside one.
        self.started = False
        if self.notranslationsstart:
            return self.alltraslationtables

        tablestart = self.ls.startoftablestring()
        tableend = self.ls.endoftablestring()
        redirection = self.ls.redirectionsstring()

        for oneline in self.lines:
            if oneline.startswith(tablestart):#start of a translation table
                if self.started:
                    #report problem and return.
                    #Do not search that title until problem is fixed
                    with codecs.open('unfinished.txt', 'a', 'utf-8') as fproblem:
                        fproblem.write(self.lemma + u'\n')
                    return RETURNPROBLEMATIC
                tablesfound += 1 #increase table counter
                tabletitle = oneline #table "title"
                self.problemsfoundintable = 0 #restart counting problems
                self.started = True #set that a table has begun
                self.onetranslationtable = OrderedDict() #clear translation table
                self.tablewithproblems = {} #clear table with problems for the above translation table
                self.redirectionstr = u''
                redirectfound = False
            #TODO
            #elif oneline.startswith(self.ls.problemtablestring):#start of a table with problems
            elif oneline.startswith(tableend):#found end of table
                if not self.started:
                    #report problem and return.
                    #Do not search that title until problem is fixed
                    print('table not started..........   ( %s )' % self.lemma)
                    with codecs.open('notstarted.txt', 'a', 'utf-8') as fproblem:
                        fproblem.write(self.lemma + u'\n')
                    return RETURNPROBLEMATIC
                #end of that translation table: add it to translation tables
                if len(self.tablewithproblems):
                    # Store the string itself; the original stored the
                    # bound method because the call parentheses were
                    # missing.
                    problems = (self.ls.problemtablestring(),
                                self.tablewithproblems)
                else:
                    problems = None
                redirectline = self.redirectionstr if redirectfound else None
                self.alltraslationtables[unicode(tablesfound)] = (
                    tabletitle, self.onetranslationtable, problems,
                    redirectline)
                #not inside a table after here
                self.started = False
            elif self.started:
                #TODO change it to specific lang start
                if oneline.startswith(u'* {{') or oneline.startswith(u'<!-- * {{'):
                    #possible translation visible or hidden
                    self._processpossibletranslation(oneline)
                elif self._isdummyline(oneline):
                    #nothing to do. Do not add that line.
                    continue
                elif oneline.startswith(redirection):
                    #line with redirection
                    if redirectfound:
                        print('table with at least two redirections________________   ( %s )' % self.lemma)
                        with codecs.open('tworedirs.txt', 'a', 'utf-8') as fproblem:
                            fproblem.write(self.lemma + u'\n')
                        return RETURNPROBLEMATIC
                    redirectfound = True
                    self.redirectionstr = oneline
                else:
                    #problematic line, add it to problematic table
                    self._recordproblem(oneline)
            # else: not inside a table, nothing to do

        #all lines searched
        if self.started:
            #last table has not been closed
            with codecs.open('unfinished.txt', 'a', 'utf-8') as fproblem:
                fproblem.write(self.lemma + u'\n')
            return RETURNPROBLEMATIC
        return self.alltraslationtables


def checktranslation(langofwiktionary, lemma, lemmatext, usingLangStrings):
    """Print the translation tables of one entry, sorted by language name.

    langofwiktionary -- language code, passed on to
        ProcessTranslationsOfLemma.
    lemma, lemmatext -- UTF-8 byte strings (title and page text).
    usingLangStrings -- a LangStrings instance on which isosinorder() has
        been called (its ``_readydict`` supplies the sort order).

    For every table found prints the sorted table, then the redirection
    line (if any) and the table of problematic lines (if any).  Prints a
    single message when the entry's tables are structurally broken and
    nothing when the entry has no tables.
    """
    b = usingLangStrings
    mycenter = b.middleoftablestring()
    myend = b.endoftablestring()
    m = ProcessTranslationsOfLemma(langofwiktionary, lemma, lemmatext, b)
    lemmaalltables = m.findandchecktranslations()
    if lemmaalltables == RETURNPROBLEMATIC:
        print('problematic tables in  %s' % lemma)
        return
    if not lemmaalltables:
        return

    # Keys are counters stored as text: sort them numerically so that
    # table "10" does not come before table "2".
    for tablecounter, onetable in enumerate(sorted(lemmaalltables, key=int)):
        (tabletitle, unsortedtablewithtranslations,
         problems, tablewithredirect) = lemmaalltables[onetable]

        print('Tablestart: ')
        # Ceiling of half the number of lines, written with floor division.
        middleoftable = -(len(unsortedtablewithtranslations) // -2)
        print(tabletitle)
        #sort table and insert Template for middle of table:
        #start from the empty, already sorted table of all iso codes,
        #fill in the lines found and drop the iso codes left empty.
        sortedtable = OrderedDict(b._readydict)
        unsortedtablewithtranslations = cleartranslationtable(
            unsortedtablewithtranslations)
        print(sortedtable)
        print(unsortedtablewithtranslations)
        for x in unsortedtablewithtranslations:
            sortedtable[x] = unsortedtablewithtranslations[x]
        print('sorted...')
        sortedtable = cleartranslationtable(sortedtable)

        print(u'%s  is unsorted or with problems' % tabletitle)
        linesprintedcounter = 0
        for x in sortedtable:
            linesprintedcounter += 1
            print(sortedtable[x])
            if linesprintedcounter == middleoftable:
                print(mycenter)
        print(myend)
        print('Table end:  %d ---------' % (tablecounter + 1))

        if tablewithredirect is not None:
            print(u'Table with redirect start:  %s' % tabletitle)
            print(tablewithredirect)
            print(myend)
            # Same 1-based numbering as the 'Table end' line above (the
            # original printed the 0-based counter here).
            print('end table with redirect:  %d ---------' % (tablecounter + 1))

        if problems is not None:
            problemtitle, tablewithproblems = problems
            middleoftable = -(len(tablewithproblems) // -2)
            linesprintedcounter = 0
            print('Table with problems start: ????????')
            print(problemtitle)
            print(tablewithproblems)
            # Keys are counters stored as text; print in order found.
            for tableline in sorted(tablewithproblems, key=int):
                if linesprintedcounter == middleoftable:
                    print(mycenter)
                print(u'%s %s' % (tableline, tablewithproblems[tableline]))
                linesprintedcounter += 1
            print(myend)
            print('end table with problems.  ????????')

    print('============================ END OF TABLES FOR:  %s  ==============================' % lemma)


def main2(langofwiktionary):
    """Read 'translation.txt' (UTF-8); the text is not used further."""
    with codecs.open('translation.txt', 'r', 'utf-8') as f:
        onetransltext = f.read()
    #for checking class?
def main(passedargs):
    """Run the checks selected on the command line.

    passedargs -- the tuple returned by _getargs():
        [1] language of the wiktionary
        [2] checklatest flag and [3] getlatest flag (passed to WikiDump)
        [4] onlyone flag: check only the text in 'onetranslation.txt'
        [6] extractsomething flag: run extractsomething4 on every entry of
            namespace '0' instead of checking translation tables
        [7] wiki project name ('' means u'wiktionary')

    Always closes the module-level ``fproblematic`` file.
    """
    langofwiktionary = passedargs[1]
    try:
        usingLangStrings = LangStrings(langofwiktionary)
        #next statement will also create an empty sorted _readydict
        usingLangStrings.isosinorder()
    except Exception as err:
        # Best effort: the extract mode does not need the language data.
        # Report instead of hiding the reason.
        sys.stderr.write('could not load language data for %r: %r\n'
                         % (langofwiktionary, err))
        usingLangStrings = None

    try:
        checkstables = passedargs[4] or not passedargs[6]
        if checkstables and usingLangStrings is None:
            # checktranslation() cannot work without the language data;
            # stop here instead of failing later with AttributeError.
            sys.stderr.write('cannot check translation tables without '
                             'language data\n')
            return

        if passedargs[4]:#just read a file with translations
            with open('onetranslation.txt', 'r') as f:
                onetransltext = f.read()
            checktranslation(langofwiktionary, 'dummy', onetransltext,
                             usingLangStrings)
            return_after = False
        else:
            if len(passedargs[7]):
                whichproject = passedargs[7]
            else:
                whichproject = u'wiktionary'
            dump = xmlreader.WikiDump(langofwiktionary, whichproject,
                                      mydumpspath, passedargs[2],
                                      passedargs[3])
            if dump._ISOK:
                #find the file that contains only the titles
                dumpfilename, titlesfilename = dump.workingdump_filenames()
                with codecs.open(titlesfilename, 'r', 'utf-8') as ftemp:
                    fulllines = ftemp.readlines()
                #keep only the titles (without the position etc).
                #A set, because extractsomething4 only tests membership and
                #does so for every entry of the dump.
                onlylemmas = set(
                    line.split('<title>')[1].split('</title>')[0]
                    for line in fulllines)
                lemmascounter = 0
                found = 0
                for entry in dump.parse():
                    lemmascounter += 1
                    if entry.ns != '0':
                        continue
                    if passedargs[6]:
                        #when extracting something the file with the
                        #titles can be used as well
                        if extractsomething4(entry.title, entry.text,
                                             onlylemmas):
                            found += 1
                            print('%s / %s' % (found, lemmascounter))
                    else:
                        checktranslation(langofwiktionary, entry.title,
                                         entry.text, usingLangStrings)
            else:
                print('not all ok')
    finally:
        fproblematic.close()
    print("Έτοιμα!")


def getonearticle(articlename=None):
    """Placeholder for fetching one article online; prints and exits.

    *articlename* is accepted (and ignored) because the intended caller
    in _getargs passes the article name.
    """
    print('came here to get one one article...')
    sys.exit()


def _getargs():
    '''Parse the command line and return the settings as a tuple.

    Parameters:
    -a, --articlename  article name, only consulted together with -o
    -c, --checklatest  if a newer dump exists, get it
    -e, --extractsomething  extract something instead of checking tables
    -g, --getlatest    get latest dump
    -l, --lang (=a language iso code)
                       language of the wiktionary, defaults to ``mylang``
    -n, --onlyone      check a single translation table in a text file
                       named onetranslation.txt
    -o, --online       with an article name: prints a message and exits
                       (fetching one article is not implemented);
                       without one the flag is ignored
    -s, --something    returned as is
    -w, --wikiproject  other wikimedia project

    Returns the tuple (onlyone, lang, checklatest, getlatest, onlyone,
    something, extractsomething, wikiproject); wikiproject is u'' when
    not given.
    '''
    _parser = argparse.ArgumentParser()
    #if user is online may want to check for newer dump.
    #the default xml filename is given at start of this python script.
    _parser.add_argument("-a", "--articlename", default = u'')
    _parser.add_argument("-c", "--checklatest", action="store_true")
    _parser.add_argument("-e", "--extractsomething", action="store_true")
    _parser.add_argument("-g", "--getlatest", action="store_true")
    _parser.add_argument("-n", "--onlyone", action="store_true")
    _parser.add_argument("-o", "--online", action="store_true")
    _parser.add_argument("-s", "--something", action="store_true")
    _parser.add_argument("-w", "--wikiproject") #other wikimedia project
    #language of wiktionary. Default is given at start of this python script.
    _parser.add_argument("-l", "--lang", default = mylang)
    #parse args:
    _args = _parser.parse_args()
    _num = _args.onlyone
    # An explicitly empty --lang falls back to the default language.
    _lang = _args.lang if len(_args.lang) else mylang
    if _args.online and _args.articlename != u'':
        print('came here?')
        sys.exit()
        #TODO call getonearticle(_args.articlename) here instead;
        #it will save an xml file as 'onetranslation.txt'
    #TODO report misuse when --online is given without --articlename
    whichwikiproject = u''
    if _args.wikiproject is not None:
        whichwikiproject = _args.wikiproject
    return (_num, _lang, _args.checklatest, _args.getlatest, _args.onlyone,
            _args.something, _args.extractsomething, whichwikiproject)


if __name__=="__main__":
    g = _getargs()
    main(g)#pass language and xml file name