Χρήστης:Vanished user Xorisdtbdfgonugyfs/scripts/findanything

#!/usr/bin/python
# -*- coding: utf-8  -*-
#
#v.0104
"""
Checks Translation tables in wiktionaries that use the schema:
{{translationtablestart|optionalexplanation}}
* {{isocode}} : {{templateforlanguage|isocode|translation}}
{{translationtablemiddle}}
{{translationtableend}}

Possibly using some template for redirecting translations

Creates sorted translation tables plus one new table that collects all
problematic lines (if such a table does not already exist).

If table is corrupted reports only the problem and returns!

Use to check something in a dump.
"""
#TODO 3. get language from some conf file or, better, from dump's xml first tag . 

import os
import sys
import simplexmlreader as xmlreader
import codecs
import argparse
#import clsfetchnewdump
from collections import OrderedDict

# Opened (and truncated) as soon as the module is loaded; main() closes it in
# its ``finally`` clause. Nothing in the visible code writes to it.
fproblematic=codecs.open('W_problematictable', 'w', 'utf-8')


# Default wiktionary language; used as the default of the --lang option.
mylang = u'el'
# NOTE(review): not referenced anywhere else in the visible code.
myproject = u'wiki'

# Directory that main() hands to xmlreader.WikiDump.
#mydumpspath = u'/home/wiki/Λήψεις/'
mydumpspath = u'/media/FORMANY/wiki/dumps/'


# Returned by ProcessTranslationsOfLemma.findandchecktranslations() instead of
# the tables dict when a lemma's tables are malformed.
RETURNPROBLEMATIC = 1
# NOTE(review): not referenced anywhere else in the visible code.
RETURNOK = 0 

# Parallel strings used by extractsomething4(): the character at index i of
# ``entona`` is the accented form of the character at index i of ``atona``.
atona = u'αεηιουωϊϋ'#.split(',')
entona = u'άέήίόύώΐΰ'#.split(',')
#general function
def extract_lang_iso(wholeline):
    """Return the name of the first template found in a translation line.

    In a line such as ``* {{en}} : {{t|en|word}}`` the first template name
    is the ISO code of the language (``en``).

    Only the text in front of the first ``}}`` is considered. An IndexError
    is raised when that text contains no ``{{``.
    """
    before_first_close = wholeline.split('}}')[0]
    pieces = before_first_close.split('{{')
    return pieces[1]

def cleartranslationtable(whichtable):
    '''
    Deletes all isos without translation (hidden or visible)
    from a sorted dict.

    The mapping is modified in place and the same object is returned.
    An entry counts as "without translation" when its value equals ''.
    '''
    # Collect the keys first: deleting entries while iterating over the
    # mapping itself raises RuntimeError on plain dicts and is
    # implementation-dependent on OrderedDict.
    emptykeys = [key for key in whichtable if whichtable[key] == '']
    for key in emptykeys:
        del whichtable[key]
    return whichtable

def writeunfinished(lemma):
    """Append ``lemma`` followed by a newline to the UTF-8 file ``W_unfinished``."""
    with codecs.open('W_unfinished', 'a', 'utf-8') as unfinishedlog:
        oneline = lemma + u'\n'
        unfinishedlog.write(oneline)
        
def extractsomething2(whichlemma, whichtext):
    """Record a lemma whose page text mentions one fixed category.

    This is one of several throw-away extractors that get edited for each
    new search. Both arguments are UTF-8 encoded byte strings.

    When the decoded text contains the category name, the decoded lemma is
    appended as one line to ``ΚατΚύπρ.txt`` and 1 is returned; otherwise
    nothing is written and 0 is returned.
    """
    title = whichlemma.decode('utf-8')
    body = whichtext.decode('utf-8')
    if u'Κατηγορία:Κατάλογοι που αφορούν την Κύπρο' not in body:
        return 0
    with codecs.open('ΚατΚύπρ.txt', 'a', 'utf-8') as out:
        out.write(title + u'\n')
    return 1

def extractsomething4(whichlemma, whichtext, lemmas):
    """Report Greek feminine forms in "-μένη" that lack a participle-form section.

    Pages that do have an adjective-form section are reported as well; they
    are only flagged differently.

    Parameters:
        whichlemma -- page title, UTF-8 encoded byte string
        whichtext  -- page text, UTF-8 encoded byte string
        lemmas     -- container of known titles (unicode); only ``in`` is used

    For a reported lemma one tab-separated line is appended to
    ``θηλμετοχ.txt``:

        <*1 or *0> <code> [[lemma]] [masculine]

    ``*1`` means the page has a ``==={{μορφή επιθέτου|el}}===`` section.
    The code is 1 when the paroxytone masculine ("-μένος") is a known lemma,
    2 when the candidate proparoxytone masculine ("-μενος", accent moved onto
    the stem) is a known lemma, and 3 when neither is known (it may really
    not exist, or simply not have an entry yet).

    Returns 1 when a line was written, otherwise 0.
    """
    a = whichlemma.decode('utf-8')
    b = whichtext.decode('utf-8')
    if not a.endswith(u'μένη'):
        return 0
    if u'==={{μορφή μετοχής|el}}===' in b:
        return 0

    if u'==={{μορφή επιθέτου|el}}===' in b:
        exeiepitheto = u'*1\t'
    else:
        exeiepitheto = u'*0\t'

    # the paroxytone masculine: "-μένη" -> "-μένος"
    paroxytone = a[:-1] + u'ος'

    # The masculine may instead be proparoxytone. Build that candidate by
    # accenting the last unaccented vowel found among the final three letters
    # of the stem (the part in front of "μένη").
    stem = a[:-4]
    proparoxytone = None
    for pos in range(len(stem) - 1, max(len(stem) - 4, -1), -1):
        vowelindex = atona.find(stem[pos])
        if vowelindex != -1:
            # Keep everything after the vowel: the old slice started one
            # position too far to the right and dropped a letter whenever the
            # vowel was not the last letter of the stem.
            proparoxytone = (stem[:pos] + entona[vowelindex]
                             + stem[pos + 1:] + u'μενος')
            break

    if paroxytone in lemmas:
        # when the paroxytone exists the proparoxytone is not looked up
        oneline = exeiepitheto + u'1\t[[' + a + u']]\t' + paroxytone + u'\n'
    elif proparoxytone is not None and proparoxytone in lemmas:
        oneline = exeiepitheto + u'2\t[[' + a + u']]\t' + proparoxytone + u'\n'
    else:
        # neither masculine form is a known lemma
        oneline = exeiepitheto + u'3\t[[' + a + u']]\n'
    with codecs.open('θηλμετοχ.txt', 'a', 'utf-8') as f:
        f.write(oneline)
    return 1
    

def extractsomething3(whichlemma, whichtext):#this function is constantly changed to extract something 
    """Split a page into language sections and classify pages that have none.

    Both arguments are UTF-8 encoded byte strings. Language section headers
    are recognised by the text ``-}}==`` followed by a newline.

    Returns a pair ``(status, message)``: ``(0, '')`` normally, or
    ``(1, <lemma + tab + reason + newline>)`` when an IndexError occurs while
    splitting.

    NOTE(review): ``fredirect`` and ``fother`` are not defined anywhere in the
    visible code, so the branch for pages without a language header raises
    NameError as written.
    NOTE(review): ``langs`` and ``prevtext`` are filled/assigned but never read
    or returned.
    """
    a = whichlemma.decode('utf-8')
    b = whichtext.decode('utf-8')
    #c = langsortdata.el.mappedletters
    langs = {}
    langssplit1 = b.split('-}}==\n')
    try:
        if len(langssplit1)>1:
            # at least one language header: the first language code is what
            # follows '=={{-' in the text before the first header end
            prevlang = langssplit1[0].split('=={{-')[1]
            prevtext = u''
            #print a
            #print langssplit1 
            #for text in langssplit1:            
            # the upper bound excludes the last piece, so the final section
            # is never stored in ``langs``
            for xcounter in range(1,len(langssplit1)-1):
                splitted = langssplit1[xcounter].split('\n=={{-')
                #print splitted
                langs[prevlang] = splitted [0] 
                prevlang = splitted[1]
            #for lang in langs:
                #print lang
            return 0, ''
        # str.split() always returns at least one element, so this branch
        # cannot be taken
        elif len(langssplit1) == 0:
            return 1, a + u'\tno lang\n'
        else:
            # no language header at all: log redirects and everything else
            # to two separate files, together with the first line of the page
            if b.startswith(u'#ΑΝΑΚΑΤΕΥΘΥΝΣΗ') or b.startswith(u'#REDIRECT'):            
                with codecs.open(fredirect, 'a', 'utf-8') as f:
                    f.write( a + u'\t' + b.split('\n')[0] + u'\n')
            else:
                with codecs.open(fother, 'a', 'utf-8') as f:
                    f.write( u'* [[' + a + u']]\t#' + b.split('\n')[0] + u'#\n')            
            return 0, ''
    except IndexError:
        return 1, a + u'\tindex error\n'

        
def extractsomething(whichlemma, whichtext):
    """Check pages that contain an ``{{język starogrecki`` template.

    This is one of several throw-away extractors that get edited for each
    new search. Both arguments are UTF-8 encoded byte strings.

    For such a page:
    * when the text has no ``[[el:<lemma>]]`` link, the lemma is appended to
      ``noiwel.txt`` and printed;
    * when the text has no ``{{język nowogrecki`` template, the lemma is
      appended to ``noelinpl.txt`` and 1 is returned.

    In every other case 0 is returned.
    """
    title = whichlemma.decode('utf-8')
    body = whichtext.decode('utf-8')
    if u'{{język starogrecki' not in body:
        return 0

    interwiki = u'[[el:' + title + u']]'
    if interwiki not in body:
        with codecs.open('noiwel.txt', 'a', 'utf-8') as out:
            out.write(title + u'\n')
        # single-argument form: prints the same under the print statement
        print(title)

    if u'{{język nowogrecki' in body:
        return 0
    with codecs.open('noelinpl.txt', 'a', 'utf-8') as out:
        out.write(title + u'\n')
    return 1
        
class LangStrings():
    '''Gives access to the translation-table strings of one wiktionary language.

        The strings and the table of known language names come from the
        module ``wiktlangdata.<language>``; collation comes from the module
        ``sorting``. Both are imported when the object is created.
    '''
    def __init__(self, whichlang):
        # language data module (provides langnames and the *string attributes)
        self.langlangs = __import__('wiktlangdata.%s' % whichlang, fromlist=['wiktlangdata'])
        self.whichlang = whichlang
        # collation module (provides Word)
        self.langsort = __import__('sorting')
        self.sortednames = []
        self.sortedisos = []

    def isocodeisknown(self, isocode):
        '''True when isocode is a key of the language-names table.'''
        known = self.langlangs.langnames
        return (isocode in known)

    def startoftablestring(self):
        '''String that opens a translation table.'''
        return self.langlangs.startoftablestring

    def middleoftablestring(self):
        '''String that marks the middle of a translation table.'''
        return self.langlangs.middleoftablestring

    def endoftablestring(self):
        '''String that closes a translation table.'''
        return self.langlangs.endoftablestring

    def redirectionsstring(self):
        '''String that starts a line redirecting to other translations.'''
        return self.langlangs.redirectionsstring

    def problemtablestring(self):
        '''String used for the table of problematic lines.'''
        return self.langlangs.problemtablestring

    def isosinorder(self):
        '''Builds an OrderedDict of every known iso code mapped to u''.

            Codes are ordered by their language name, compared through
            sorting.Word(language, name). The result is also kept in
            self._readydict (same object as the one returned).

            Time consuming....
        '''
        names = self.langlangs.langnames

        def collationkey(isocode):
            return self.langsort.Word(self.whichlang, names[isocode])

        orderedisos = sorted(names, key=collationkey)
        emptytable = OrderedDict((isocode, u'') for isocode in orderedisos)
        self._readydict = emptytable
        return emptytable
     
class ProcessTranslationsOfLemma():
    """Finds the translation tables of one lemma and collects their problems.

    The object is built from the raw (UTF-8, XML-escaped) title and text of a
    page plus a LangStrings-like object that supplies the table marker
    strings. findandchecktranslations() then walks the text line by line.
    """
    def __init__(self, whichlang, whichlemma, whichtext,usingLangStrings):
        """ Constructor

        whichlang        -- language code of the wiktionary (not used here)
        whichlemma       -- page title, UTF-8 encoded byte string
        whichtext        -- page text, UTF-8 encoded byte string
        usingLangStrings -- object providing startoftablestring(),
                            middleoftablestring(), endoftablestring(),
                            redirectionsstring(), problemtablestring() and
                            isocodeisknown()
        """
        self.ls = usingLangStrings

        self.lemma = whichlemma.decode('utf-8')
        _text = whichtext.decode('utf-8')
        # Unescape XML character entities. The entity names had been lost,
        # which turned these four calls into no-ops.
        _text = _text.replace('&gt;', '>')
        _text = _text.replace('&lt;', '<')
        _text = _text.replace('&quot;', '"')
        _text = _text.replace('&amp;', '&') #must be last
        _trstart = _text.find(self.ls.startoftablestring())
        self.notranslationsstart = False
        if _trstart >= 0:
            # find() returns 0 for a table at the very start of the text,
            # so only -1 means "no table start"
            _text = _text[_trstart:]
        elif (_text.find(self.ls.middleoftablestring()) == -1
              and _text.find(self.ls.endoftablestring()) == -1):
            # no start, middle or end marker at all: nothing to check
            self.notranslationsstart = True
        # (no start but a middle or end marker: the lines are still scanned
        # so that the stray marker gets reported)
        self.lines = _text.splitlines()
        self.tablescounter = 0 #will hold the number of translation tables found
        self.translations = {} #will hold which is the "title" of every translation table found
        self.alltraslationtables = {} #
        self.onetranslationtable = {} #will hold contents of one found traslation table
                                    #iso code as key and full line as value

        self.doubletranslation = False
        self.started = False
        self.tablewithproblems = {} #single table that will hold all problems found in all tables
        self.problemsfoundintable = 0

    def _addproblemline(self, oneline):
        """Stores oneline in the problems table under the next counter value."""
        self.problemsfoundintable += 1
        # u'%d' % n gives the same text as unicode(n)
        self.tablewithproblems[u'%d' % self.problemsfoundintable] = oneline

    def _reportlemma(self, filename):
        """Appends the lemma as one line to the UTF-8 file filename."""
        with codecs.open(filename, 'a', 'utf-8') as fproblem:
            fproblem.write(self.lemma + u'\n')

    def _processpossibletranslation(self, oneline):
        """Files a translation line under its ISO code or as a problem.

            Assumes a line with hidden or normal translation.
            A line whose ISO code is unknown, or whose ISO code already has a
            line in the current table, goes to the problems table.
        """
        langISO = extract_lang_iso(oneline)
        if self.ls.isocodeisknown(langISO) and langISO not in self.onetranslationtable:
            self.onetranslationtable[langISO] = oneline
        else:
            # unknown iso code, or a second line for the same iso code
            self._addproblemline(oneline)

    def _isdummyline(self, oneline):
        """True for the middle-of-table marker and for blank lines."""
        if oneline.startswith(self.ls.middleoftablestring()):#center of table
            return True
        return oneline.strip() == ''#empty line

    def findandchecktranslations(self):
        """Checks translations of one entry for errors.

            Returns a dict with one item per translation table found. Keys are
            the table numbers (1, 2, ...) as text. Each value is a tuple
                (title line,
                 OrderedDict iso code -> translation line,
                 (problem table string, dict counter -> problem line) or None,
                 redirection line or None).

            When the page has no table markers at all an empty dict is
            returned.

            When the tables are corrupted (a table starts inside another one,
            ends without having started, is never closed, or has two
            redirection lines) the lemma is appended to a log file and
            RETURNPROBLEMATIC is returned instead of the dict.
        """
        tablesfound = 0
        tabletitle = u''
        self.problemsfoundintable = 0
        self.alltraslationtables = {}
        self.tablewithproblems = {}
        self.onetranslationtable = OrderedDict()
        #only one line with redirection
        self.redirectionstr = u''
        # start every run outside of a table, even after an earlier run that
        # stopped in the middle of one
        self.started = False
        redirectfound = False
        if self.notranslationsstart:
            return self.alltraslationtables
        for oneline in self.lines:
            if oneline.startswith(self.ls.startoftablestring()):#start of a translation table
                if self.started:#report problem and return. Do not search that title untill problem is fixed
                    self._reportlemma('unfinished.txt')
                    return RETURNPROBLEMATIC
                tablesfound += 1 #increase table counter
                tabletitle = oneline #table "title"
                self.problemsfoundintable = 0 #restart counting problems
                self.started = True #set that a table has begun
                self.onetranslationtable = OrderedDict() #clear translation table
                self.tablewithproblems = {} #clear table with problems for the above translation table
                self.redirectionstr = u''
                redirectfound = False
            elif oneline.startswith(self.ls.endoftablestring()):#found end of table
                if not self.started:#report problem and return. Do not search that title untill problem is fixed
                    # one joined string: prints the same text as the
                    # multi-argument print statement did
                    print(u' '.join([u'table not started.......... ', u'  (', self.lemma, u')']))
                    self._reportlemma('notstarted.txt')
                    return RETURNPROBLEMATIC
                #end of that traslation table: add it to translation tables
                if len(self.tablewithproblems):
                    # call the method: the string is wanted here, not the
                    # bound method object
                    b = self.ls.problemtablestring(), self.tablewithproblems
                else:
                    b = None
                if redirectfound:
                    c = self.redirectionstr
                else:
                    c = None
                self.alltraslationtables[u'%d' % tablesfound] = tabletitle, self.onetranslationtable, b, c
                #not inside a table after here
                self.started = False
            elif self.started:
                #TODO change it to specific lang start
                if oneline.startswith(u'* {{') or oneline.startswith(u'<!-- * {{'):
                    #possible translation visible or hidden
                    self._processpossibletranslation(oneline)
                elif self._isdummyline(oneline):
                    #nothing to do. Do not add that line.
                    continue
                elif oneline.startswith(self.ls.redirectionsstring()):
                    #line with redirection
                    if redirectfound:
                        print(u' '.join([u'table with at least two redirections________________ ', u'  (', self.lemma, u')']))
                        self._reportlemma('tworedirs.txt')
                        return RETURNPROBLEMATIC
                    redirectfound = True
                    #add the line
                    self.redirectionstr = oneline
                else: #problematic line, add it to problematic table
                    self._addproblemline(oneline)
            #lines outside of any table are ignored

        #all lines searched
        if self.started:
            #last table has not been closed
            self._reportlemma('unfinished.txt')
            return RETURNPROBLEMATIC
        return self.alltraslationtables
        

def checktranslation(langofwiktionary, lemma, lemmatext, usingLangStrings):
    #print "Working on..." , lemma                      
    b = usingLangStrings
    #get sorted isocodes
    #emptysortedisos = b.isosinorder() 
    #exit() 
    mycenter = b.middleoftablestring()
    myend = b.endoftablestring()                
    m = ProcessTranslationsOfLemma(langofwiktionary, lemma, lemmatext, b)
    lemmaalltables = m.findandchecktranslations()
    if lemmaalltables == RETURNPROBLEMATIC:
        print 'problematic tables in ', lemma
    elif len(lemmaalltables):
        #print '============================ START OF TABLES FOR: ', lemma , ' =============================='
        for tablecounter, onetable in enumerate(sorted(lemmaalltables)):                                                   
            #print '===== Table num: ', tablecounter + 1, '====='
            tabletitle = lemmaalltables[onetable][0]
            unsortedtablewithtranslations = lemmaalltables[onetable][1]
            hasproblems = (lemmaalltables[onetable][2] != None)
            if hasproblems:
                tablewithproblemsstring = lemmaalltables[onetable][2][0]
                tablewithproblems = lemmaalltables[onetable][2][1]
            tablewithredirect = lemmaalltables[onetable][3]
            print 'Tablestart: '
            middleoftable = - (len(unsortedtablewithtranslations) / -2)
            linesprintedcounter = 0
            print tabletitle 
            #sort table and insert Template for middle of table
            sortedtable = OrderedDict(b._readydict)
            unsortedtablewithtranslations = cleartranslationtable(unsortedtablewithtranslations)
            print sortedtable            
            print unsortedtablewithtranslations            
            for x in unsortedtablewithtranslations:
                sortedtable[x] = unsortedtablewithtranslations[x]
            print 'sorted...'
            sortedtable = cleartranslationtable(sortedtable)
            #for x in sortedtable:
                #print sortedtable[x]
            #print 'end sorted..'
            #sortedtable = cleartranslationtable(sortedtable)
            #oldtable = []
            #newtable = []
            #linesprintedcounter = 0
            #for x in sortedtable:
                #if sortedtable[x] <> u'':
                    #linesprintedcounter += 1
                   # newtable.append(sortedtable[x])                                   
                    #if linesprintedcounter == middleoftable:
                        #newtable.append(mycenter)
            #for x in unsortedtablewithtranslations:
                #oldtable.append(sortedtable[x])                                   
            #if oldtable == newtable:
                #print '\tTable is sorted (place of "Template for center" is not checked)'
                #exit()
            #else:
                #print tabletitle , ' is unsorted or with problems'
                #linesprintedcounter = 0 
                #print newtable               
                #for x in newtable:
                    #linesprintedcounter += 1
                    #print x
                    #if linesprintedcounter == middleoftable:
                        #print mycenter                                               
                #print myend                                                   
            print tabletitle , ' is unsorted or with problems'
            linesprintedcounter = 0 
            for x in sortedtable:
                linesprintedcounter += 1
                print sortedtable[x]
                if linesprintedcounter == middleoftable:
                    print mycenter                                               
            print myend                                                   
            print 'Table end: ', tablecounter + 1, '---------'
            if tablewithredirect != None:
                #pass
                print 'Table with redirect start: ', tabletitle
                print tablewithredirect
                print myend
                print 'end table with redirect: ', tablecounter, '---------'
                #exit()
            if hasproblems:
                #pass
                middleoftable = - (len(tablewithproblems) / -2)
                linesprintedcounter = 0
                print 'Table with problems start: ????????'
                print b.problemtablestring()
                print tablewithproblems
                for tableline in tablewithproblems:
                    if  linesprintedcounter == middleoftable:
                        print mycenter 
                        #exit()                                                                    
                    print tableline, tablewithproblems[tableline] #'.....', tableline, lemmaalltables[x][1][tableline]
                    linesprintedcounter += 1 
                print myend
                print 'end table with problems. ', '????????'
                #exit()
                
        print '============================ END OF TABLES FOR: ', lemma , ' =============================='
    #else:
        #print '============================ no translation tables for: ',  lemma
              
def main2(langofwiktionary):
    """Reads the UTF-8 file ``translation.txt``; the text is not used further.

    The argument is not used.
    """
    source = codecs.open('translation.txt', 'r', 'utf-8')
    try:
        onetransltext = source.read()
    finally:
        source.close()

           
#for checking class?
def main(passedargs):
    langofwiktionary = passedargs[1]
    #if not passedargs[5]:
    try:
        usingLangStrings = LangStrings(langofwiktionary)
    #next statement will also create an empty sorted _readydict
        sortedisos =  usingLangStrings.isosinorder()
    except:
        usingLangStrings = None
    try:
        if passedargs[4]:#just read a file with translations
            #with codecs.open('onetranslation.txt', 'r', 'utf-8') as f:
            with open('onetranslation.txt', 'r') as f:
                onetransltext = f.read()
                checktranslation(langofwiktionary, 'dummy', onetransltext, usingLangStrings)
        else:
            #print passedargs
            #exit()
            if len(passedargs[7]):
                dump = xmlreader.WikiDump(langofwiktionary, passedargs[7], mydumpspath, passedargs[2], passedargs[3])
            else:
                dump = xmlreader.WikiDump(langofwiktionary, u'wiktionary', mydumpspath, passedargs[2], passedargs[3])
            #dump = xmlreader.XmlDump(passedargs[2])                        
            if dump._ISOK:
                #βρίσκουμε το αρχείο με τους τίτλους σκέτους
                dumpfilename, titlesfilename = dump.workingdump_filenames()
                with codecs.open(titlesfilename, 'r', 'utf-8') as ftemp:
                    fulllines = ftemp.readlines()
                #δημιουργούμε ένα list μόνο με τους τίτλους (χωρίς τη θέση κλπ)
                onlylemmas = []
                for line in fulllines:
                    onlylemmas.append(line.split('<title>')[1].split('</title>')[0])                    
                lemmascounter = 0
                found = 0 
                for entry in dump.parse(): 
                    #print entry.ns
                    lemmascounter += 1
                    if(entry.ns == '0'): 
                        #print entry.title                        
                        if passedargs[6]:#αν θέλει να βρει κάτι μπορεί να χρησιμοποιήσει και το αρχείο των τίτλων
                            foundsomething = extractsomething4(entry.title, entry.text, onlylemmas)
                            if foundsomething:
                                found += 1
                                print found, '/', lemmascounter
                            #with codecs.open('el-Κατηγορίες.txt', 'a', 'utf-8') as f:
                                #f.write( entry.title.decode('utf-8') + '\n')
                            #found += 1
                            
                        else:
                            checktranslation(langofwiktionary, entry.title, entry.text, usingLangStrings)
                #alltrnasls = sorted(usingLangStrings._readydict, key=lambda t: t[0]]))
                #for x in usingLangStrings._readydict:
                    #print usingLangStrings.langlangs.langnames[x], ':', x, ':', usingLangStrings._readydict[x]
            else:
                print 'not all ok'                          
    finally:
        fproblematic.close()
        print "Έτοιμα!"        

def getonearticle(articlename=None):
    """Placeholder for fetching one article: prints a notice and exits.

    articlename -- title of the article to fetch. It is accepted because the
                   call in _getargs() passes it; it is not used yet.

    Raises SystemExit.
    """
    print('came here to get one one article...')
    sys.exit()
        
def _getargs():
    '''Parses the command line and returns the options as a tuple.

    Options:
    -a, --articlename       text, default u''
    -c, --checklatest       flag
    -e, --extractsomething  flag
    -g, --getlatest         flag
    -n, --onlyone           flag
    -o, --online            flag
    -s, --something         flag
    -w, --wikiproject       text (other wikimedia project)
    -l, --lang              language iso code, defaults to mylang

    Returns, in this order:
        onlyone, lang, checklatest, getlatest, onlyone, something,
        extractsomething, wikiproject (u'' when not given)

    With --online and a non-empty --articlename the function prints a
    notice and exits.
    '''
    _parser = argparse.ArgumentParser()
    _parser.add_argument("-a", "--articlename",  default = u'')
    #plain on/off switches, declared in the same order as before
    for _short, _long in (("-c", "--checklatest"),
                          ("-e", "--extractsomething"),
                          ("-g", "--getlatest"),
                          ("-n", "--onlyone"),
                          ("-o", "--online"),
                          ("-s", "--something")):
        _parser.add_argument(_short, _long, action="store_true")
    _parser.add_argument("-w", "--wikiproject") #other wikimedia project
    #language of wiktionary. Default is given at start of this python script.
    _parser.add_argument("-l", "--lang",  default = mylang)

    _args = _parser.parse_args()

    #an empty --lang falls back to the default language
    _lang = _args.lang if len(_args.lang) else mylang

    #TODO report misuse when --online comes without an article name
    if _args.online and _args.articlename != u'':
        print('came here?')
        exit()
        #TODO intended next step (never reached so far):
        #getonearticle(_args.articlename), saving 'onetranslation.txt'

    if _args.wikiproject is not None:
        whichwikiproject = _args.wikiproject
    else:
        whichwikiproject = u''

    return (_args.onlyone, _lang, _args.checklatest, _args.getlatest,
            _args.onlyone, _args.something, _args.extractsomething,
            whichwikiproject)

if __name__=="__main__":
    #hand the tuple of parsed command-line options straight to main()
    main(_getargs())