This program goes through an XML dump of Wiktionary entries, reads the stored interwikis of each page, and checks whether they need changes.
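The script needs two inputs: the XML dump itself (its path is hard-coded in main()) and a word list, all-lemmas-all-projects.txt. Judging from the parsing code (line.split(":", 1)), each line of the word list is expected to look like language:lemma, for example:

en:water
fr:eau

The two lines above are only illustrative; the actual list is built from the lemmas of all the Wiktionary projects.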

interwiki_dump.py

#!/usr/bin/python
# -*- coding: utf-8  -*-

import wikipedia, wiktionary, pagegenerators, catlib
import sys
import re, codecs
reload(sys)
sys.setdefaultencoding('utf8')
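# Note: the reload(sys)/setdefaultencoding('utf8') trick is a Python 2
# workaround for implicit str/unicode conversions; it does not exist in
# Python 3, so this script is tied to Python 2 and the old pywikipedia.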

#pattern = re.compile(u'\[\[([a-z\-]+)\:([^\]]+)\]\]')

class job_iws:
    def __init__(self, site, title):
        self.site = site
        self.title = title

    def treat_iws(self, keimeno):
        # build a marker string that is guaranteed not to occur in the text
        marker = '@@'
        while marker in keimeno:
            marker += '@'

        site = self.site
        try:
            interwiki = wikipedia.getLanguageLinks(keimeno, insite=site)

            # strip the interwikis from the text
            textnoiws = wikipedia.removeLanguageLinks(
                keimeno.replace(marker, '').strip(),
                site=self.site) + site.family.category_text_separator

            # weed out the bogus interwikis (title does not match the page)
            for key in interwiki.keys():
                if interwiki[key].title() != self.title:
                    #print "invalid iw", key, interwiki[key]
                    del interwiki[key]

            # look the lemma up in every project via wordList;
            # myiws is the new dict that will hold the interwikis
            myiws = {}
            for key in wordList.keys():
                if key != "el":
                    tosearch = u'%s\n' % self.title
                    tosearch = tosearch.encode('utf-8')
                    if tosearch in wordList[key]:
                        key1 = wikipedia.Site(key)
                        myiws[key1] = wikipedia.Page(key1, self.title)

            # keep existing interwikis that the word list misses,
            # as long as the target page really exists (live check)
            for key in interwiki.keys():
                if key not in myiws:
                    #print "problem in page %s - missing %s interwiki in new links" % (self.title, key)
                    if wikipedia.Page(key, self.title).exists():
                        myiws[key] = wikipedia.Page(key, self.title)

            if interwiki == myiws:
                return keimeno
            newiws = wikipedia.replaceLanguageLinks(textnoiws, myiws, site=self.site)
            #print newiws
            return newiws
        except wikipedia.NoSuchSite:
            return keimeno

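# Usage sketch for the class above (hypothetical title, for illustration):
#   job = job_iws(wikipedia.getSite(), u'water')
#   newtext = job.treat_iws(oldtext)
# treat_iws() returns the text unchanged when no interwiki change is needed.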

def main():
    #page_tag = re.compile('<page>')
    pageEnd = re.compile('^([^<]*)</text')
    title_tag = re.compile('<title>')
    title_content = re.compile('<title>([^:]+):(.+)<\/title>')
    title_capture = re.compile('<title>(.*)<\/title>')
    text_start = re.compile('<text xml:space="preserve"([^>]*)>(.*)$')
    redir = re.compile('<redirect />')

    # stream the XML dump line by line instead of loading it whole
    fin = codecs.open('../getrc/tmp2/el-june2011_3', 'r', 'utf-8')
    eof = 0
    arxiko = ""
    namespace = ""
    startcopying = 0
    while not eof:
        line = fin.readline()
        if line == "":
            eof = 1
#       elif page_tag.search(line):
#           namespace = ""
#           PageTitle = ""
#           arxiko = ""
#           startcopying = 0
        elif title_tag.search(line):
            # a title containing ':' is treated as belonging to another
            # namespace (a mainspace lemma with a colon would be skipped)
            result = title_content.search(line)
            if result:
                namespace = result.group(1)
                PageTitle = result.group(2)
            else:
                result = title_capture.search(line)
                if result:
                    namespace = "mainsp"
                    PageTitle = result.group(1)
                    arxiko = ""
                    startcopying = 0
        elif redir.search(line):
            namespace = "redirect"
        elif text_start.search(line) and (namespace == "mainsp"):
            startcopying = 1
            arxiko = arxiko + text_start.search(line).group(2)
        elif not pageEnd.search(line) and (startcopying == 1):
            arxiko = arxiko + line
        elif pageEnd.search(line):
            arxiko = arxiko + pageEnd.search(line).group(1)
            startcopying = 0
            if namespace == "mainsp":
                page = wikipedia.Page(wikipedia.getSite(), PageTitle)
                class_job_iws = job_iws(page.site(), page.title())

                teliko = class_job_iws.treat_iws(arxiko)
                if teliko != arxiko:
                    # report the pages whose interwikis need changes
                    print "[[%s]]" % (PageTitle)
                    #wikipedia.showDiff(arxiko, teliko)

    fin.close()


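# wordList maps a language code to a dict of that project's lemmas, used
# as a set: wordList[language][lemma] = 1. The lemmas keep their trailing
# newline, which is why treat_iws() appends '\n' before the lookup.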
wordList = {}
fileAllProjectLemmas = open('./all-lemmas-all-projects.txt', 'r')
line = fileAllProjectLemmas.readline()
count = 0
#print "starting phase 1: preparing lemma list"
while line:
    language, lemma = line.split(":", 1)
    if language not in wordList:
        wordList[language] = {}
    wordList[language][lemma] = 1
    line = fileAllProjectLemmas.readline()
#    count = count + 1
#    if not (count % 1000000):
#        print "%s lemmas processed" % count
#print "done phase 1"
fileAllProjectLemmas.close()

if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()
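
Assuming the old pywikipedia framework is installed and the two input files are in place, a run such as

python interwiki_dump.py

prints one [[title]] line per page whose interwikis would change; the dump path in main() and the word list path should be adjusted first.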