Χρήστης:Flubot/update t
This program is a copy of the one Ariel Glenn wrote for checking translations (found here).
The difference is that, as its source for checking the translations, it uses the file all-lemmas-sorted.txt produced by the interwiki-links program (see Χρήστης:AtouBot/generate-iw-actionlist.sh). That file is placed in the iw_tmp subdirectory of the directory where we have installed generate-iw-actionlist.sh, and it must be copied into the pywikipedia directory together with this script.
Following Ariel's advice, I also made some changes to the structure of the wordList variable (a hash-type table).
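A minimal sketch of the data structure involved, assuming each line of all-lemmas-sorted.txt has the form lemma||language, which is how setupLemmaList below parses it (the words here are made up for illustration):

# Toy version of the hash that setupLemmaList builds (Python 2, like the script):
wordList = {}
for line in ['blot||sq', 'blot||en']:  # stand-ins for lines of all-lemmas-sorted.txt
    lemma, language = line.split('||', 1)
    wordList.setdefault(language, {})[lemma] = 1
# wordList == {'sq': {'blot': 1}, 'en': {'blot': 1}}
# The later question "does the en wiktionary have an entry 'blot'?" then
# becomes simply: wordList['en'].has_key('blot')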
update_t-v2.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This bot goes over multiple pages of the home wiki, checks all
references to {{τ}} and updates the noentry parameter according
to whether the translation has an entry on the remote wiki or
not.
Details of operation:
It does not touch {{τ}} inside HTML comments. Note that we rely
on the fact that the HTML comments which hide empty translation
templates do not span more than one line.
If a line is only partially commented out, e.g.
<!-- * {{en}} : {{τ|en|XXX}}
perhaps because someone commented out a block of several lines
and the comment closure is somewhere else, this code will leave
translations with the value XXX undisturbed but will modify any
others.
It will not touch {{τ}} located in regions of the page outside
of the translations section (Μεταφράσεις).
It converts references to {{t}} into {{τ}}.
It strips leading and trailing blanks from all parameter names
and values.
It may reorder the parameters in a template without making any
other changes. This is normal.
If a parameter is passed to {{τ}} with no value, it will be
removed by this code.
Text outside of {{τ}} or {{t}} will not be touched.
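For example (a hypothetical line; the word "blot" and the absence
of an en.wikt entry for it are assumptions made purely for
illustration), a line such as
* {{en}} : {{t|en| blot |iw=blot }}
would be rewritten as
* {{en}} : {{τ|en|blot|noentry=1|iw=blot}}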
Notes about running this script:
Make sure that the list of all lemmas on all projects is
located in the same directory and is called
all-lemmas-sorted.txt
This script understands various command-line arguments:
-start: used as -start:page_name, specifies that the robot should
        go alphabetically through all pages on the home wiki,
        starting at the named page.
-file:  used as -file:file_name, read a list of pages to treat
        from the named textfile. Page titles should be enclosed
        in [[double-squared brackets]].
-ref:   used as -ref:page_name, specifies that the robot should
        touch all pages referring to the named page.
-links: used as -links:page_name, specifies that the robot should
        touch all pages linked from the named page.
-cat:   used as -cat:category_name, specifies that the robot should
        touch all pages in the named category.
All other parameters will be regarded as parts of a page title; in
this case, the bot will only touch a single page.
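Example invocations (assuming the usual pywikipedia calling
convention; the page title below is hypothetical):
    python update_t-v2.py -start:a
    python update_t-v2.py -file:pages.txt
    python update_t-v2.py νερό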
"""
import wikipedia, wiktionary, pagegenerators, catlib
import sys
import re
class TranslationBot:
    def __init__(self, generator, acceptall=False):
        self.generator = generator
        self.acceptall = acceptall
        # quick test: does this text mention a translation template at all?
        self.transStartTemplateCompiler = re.compile(u"{{(τ|t)\|")
        # captures a whole {{τ|lang|...}} or {{t|lang|...}} template
        self.transTemplateCompiler = re.compile(u'({{(?:t|τ)\|[a-zA-Z\-]+\|[^\}]+}})')
        self.commentCompiler = re.compile(u"<!--(.*?)-->")
        self.setupLemmaList()
    def setupLemmaList(self):
        # create hash of all lemmas on all projects, keyed by language code;
        # each input line is expected to look like "lemma||language"
        linecompiler = re.compile(u"(.*)")
        self.wordList = {}
        fileAllProjectLemmas = open('all-lemmas-sorted.txt', 'r')
        line1 = fileAllProjectLemmas.readline()
        # the (.*) search strips the trailing newline, if any
        line = linecompiler.search(line1).group(1)
        count = 0
        print "starting phase 1: preparing lemma list"
        while (line1):
            lemma, language = line.split("||", 1)
            if not self.wordList.has_key(language):
                self.wordList[language] = {}
            self.wordList[language][lemma] = 1
            line1 = fileAllProjectLemmas.readline()
            line = linecompiler.search(line1).group(1)
            count = count + 1
            if not (count % 1000000):
                print "%s lemmas processed" % count
        print "done phase 1"
        fileAllProjectLemmas.close()
    def processSection(self, sectionText):
        sectionLines = re.split(u'\n', sectionText)
        newSectionLines = []
        for originalLine in sectionLines:
            # if it has a translation not in html comments...
            newLine = self.commentCompiler.sub(u'', originalLine)
            result = self.transStartTemplateCompiler.search(newLine)
            if (result):
                # ... update the translations in the line.
                newLine = self.processLine(originalLine)
                newSectionLines.append(newLine)
            else:
                newSectionLines.append(originalLine)
        newSection = u'\n'.join(newSectionLines)
        return newSection
    def processLine(self, line):
        # split on the non-greedy pattern so that several comments on
        # one line stay separate chunks
        textChunks = re.split(u'(<!--.*?-->)', line)
        newChunks = []
        for chunk in textChunks:
            # comment? don't process it, just add it to the
            # pieces to be strung back together
            # (compare four characters; '<!--' is four characters long)
            if chunk[0:4] == u'<!--':
                newChunks.append(chunk)
            else:
                newChunk = self.processChunk(chunk)
                newChunks.append(newChunk)
        newLine = u''.join(newChunks)
        return(newLine)
    def processChunk(self, chunk):
        translations = re.split(self.transTemplateCompiler, chunk)
        newTranslations = []
        for translation in translations:
            if re.match(self.transTemplateCompiler, translation):
                # this is a translation instead of a piece of
                # text in between translations, process it.
                newTranslation = self.processTranslation(translation)
                newTranslations.append(newTranslation)
            else:
                newTranslations.append(translation)
        newChunk = u''.join(newTranslations)
        return(newChunk)
    def processTranslation(self, translation):
        # at last, here we are with a translation.
        # format (with the parameters in arbitrary order):
        # {{τ|sq|iw=blot|link=blot|noentry=n|nowiki=n}}
        # possibly with stray blanks that we want to throw away
        # get the language code = langArg
        # get the translation itself = translArg
        # if there is anything else in there whatsoever, we will
        # echo it to the user and bail (return the string unchanged)
        # skip the {{τ| or {{t|
        # FIXME if there aren't two }} at the end we are hosed
        args = re.split(u'\|', translation[4:-2])
        if len(args) < 2:
            print "Something's broken: %s, skipping..." % translation
            return translation
        # get the iw, link parameters = iwArg, linkArg
        # skip noentry and nowiki args if present
        langArg = self.unistrip(args[0])
        if not re.match(u'[a-zA-Z\-]+', langArg):
            return translation
        translArg = False
        trArg = False
        iwArg = False
        linkArg = False
        for arg in args[1:]:
            if u'=' in arg:
                # split only on the first '=' so values containing '=' survive
                (name, value) = re.split(u'=', arg, 1)
                name = self.unistrip(name)
                value = self.unistrip(value)
                if name == u"iw":
                    iwArg = value
                elif name == u"link":
                    linkArg = value
                elif name == u"noentry":
                    pass
                elif name == u"nowiki":
                    pass
                elif name == u"tr":
                    trArg = value
                else:
                    print "Something's broken: %s, skipping..." % translation
                    return translation
            else:
                if translArg:
                    # problem, this is the second unnamed param and we
                    # already have a translation
                    print "Something's broken: %s, skipping..." % translation
                    return translation
                else:
                    translArg = self.unistrip(arg)
        # now check if we have entries, etc.
        # no translation actually in the template, whine and give up
        nowikiArg = False
        noentryArg = False
        if not translArg:
            print "Something's broken: %s, skipping..." % translation
            return translation
        if not self.wordList.has_key(langArg):
            #print u"%s not a wikt" % langArg
            nowikiArg = 1
        elif iwArg:
            tosearch = u'%s' % iwArg
            tosearch = tosearch.encode('utf-8')
            if not self.wordList[langArg].has_key(tosearch):
                #print u"%s not in %s.wikt" % (tosearch, langArg)
                noentryArg = 1
        elif translArg == u'XXX' or translArg == u'ΧΧΧ':
            # not actually a translation... probably a half-commented-out
            # translation or some such, skip
            return translation
        else:
            tosearch = u'%s' % translArg
            tosearch = tosearch.encode('utf-8')
            if not self.wordList[langArg].has_key(tosearch):
                noentryArg = 1
        # reassemble the whole thing, clear out any extra spaces
        text = u"{{τ|%s|" % langArg
        text = text + u"%s" % translArg
        if nowikiArg:
            text = text + u'|nowiki=1'
        if noentryArg:
            text = text + u'|noentry=1'
        if iwArg:
            text = text + u"|iw=%s" % iwArg
        if linkArg:
            text = text + u"|link=%s" % linkArg
        if trArg:
            text = text + u"|tr=%s" % trArg
        text = text + u"}}"
        # done!
        return text
    def unistrip(self, unicodeString):
        # strip leading/trailing whitespace, round-tripping through utf-8 bytes
        realString = unicodeString.encode('utf-8')
        realString = realString.strip()
        return unicode(realString, 'utf-8')
    def run(self):
        startOfTranslationSection = u"μεταφράσεις}}="
        sectionMarker = u'={{'
        for page in self.generator:
            try:
                wikipedia.output(u'Έλεγχος σελίδας: %s' % page.title())
                originalPage = page.get()
                pageSections = re.split(sectionMarker, originalPage)
                newPageSections = []
                for pageSection in pageSections:
                    # if it's a translation section
                    if pageSection[0:len(startOfTranslationSection)] == startOfTranslationSection:
                        pageSection = self.processSection(pageSection)
                    newPageSections.append(pageSection)
                newPage = sectionMarker.join(newPageSections)
                # we upload the text
                if (newPage == originalPage):
                    wikipedia.output(u'No changes were necessary in %s' % page.title())
                else:
                    wikipedia.output(u'ενημέρωση μεταφράσεων')
                    wikipedia.setAction(u'ενημέρωση των μεταφράσεων (παραμέτρων του προτύπου τ)')
                    wikipedia.output(u'>>> %s <<<' % page.title())
                    wikipedia.showDiff(originalPage, newPage)
                    if not self.acceptall:
                        choice = wikipedia.inputChoice(u'Θέλετε να δεχτείτε αυτές τις αλλαγές;', ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N')
                        if choice in ['a', 'A']:
                            self.acceptall = True
                    if self.acceptall or choice in ['y', 'Y']:
                        print "saving"
                        page.put(newPage)
            except wikipedia.NoPage:
                print u"Η σελίδα %s δεν υπάρχει;;!!" % page.aslink()
            except wikipedia.IsRedirectPage:
                pass
            except wikipedia.LockedPage:
                pass
def main():
    # page generator
    gen = None
    pageTitle = []
    for arg in wikipedia.handleArgs():
        if arg:
            if arg.startswith('-start:'):
                gen = pagegenerators.AllpagesPageGenerator(arg[7:])
            elif arg.startswith('-ref:'):
                referredPage = wikipedia.Page(wikipedia.getSite(), arg[5:])
                gen = pagegenerators.ReferringPageGenerator(referredPage)
            elif arg.startswith('-links:'):
                linkingPage = wikipedia.Page(wikipedia.getSite(), arg[7:])
                gen = pagegenerators.LinkedPageGenerator(linkingPage)
            elif arg.startswith('-file:'):
                gen = pagegenerators.TextfilePageGenerator(arg[6:])
            elif arg.startswith('-cat:'):
                cat = catlib.Category(wikipedia.getSite(), arg[5:])
                gen = pagegenerators.CategorizedPageGenerator(cat)
            else:
                pageTitle.append(arg)
    if pageTitle:
        page = wikipedia.Page(wikipedia.getSite(), ' '.join(pageTitle))
        gen = iter([page])
    if not gen:
        wikipedia.showHelp('touch')
    else:
        preloadingGen = pagegenerators.PreloadingGenerator(gen)
        bot = TranslationBot(preloadingGen)
        bot.run()
if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()