Χρήστης:AtouBot/getrcs.sh
See also: Χρήστης:AtouBot/getrcs.sh/docs
getrcs.sh
#!/bin/bash
# TODO
#
# check for available disk space at beginning of run
# of any of the three parts. We will need at least size($lastfull) bytes
# plus (finding out in a minute)
usage() {
echo "Usage: $0 startdate endate [configfile]"
echo "where startdate is latest date from which to get changes"
echo "and enddate is the earliest date, in the local timezone."
echo "The base date may be specified as either today, or lastrun,"
echo "where lastrun is the latest date you got changes from"
echo "during the previous run."
echo
echo "For example:"
echo "$0 today today-3d"
echo "$0 today-1h today-5h"
echo "$0 today lastrun"
echo "If you omit the d or h the increment is interpreted as days"
echo
echo "Alternatively you can specify absolute timestamps."
echo "These must be in the format yyyy-mm-ddThh:mm:ssZ"
echo "For example:"
echo "$0 2008-02-06T08:54:06Z 2008-01-23T08:00:00Z"
echo "In this case the times are interpreted as UTC times."
echo
echo "The optional configfile argument tells the script to use"
echo "the config file you specify instead of the default config.txt"
exit 1
}
if [ -z "$1" ] || [ -z "$2" ]; then
usage
fi
if [ ! -z "$3" ]; then
if [ -e "$3" ]; then
source "$3"
else
echo "Specified config file $3 does not exist."
usage
fi
else
source ./config.txt
fi
if [ ! -e "./sort.pl" ] || [ ! -e "./uniq.pl" ]; then
echo "You are missing one or both of the files sort.pl or uniq.pl which should be in the same directory"
echo "from which you run this command. Please put them in place and run this again."
exit 1
fi
usage_lastrun() {
echo "In order to use lastrun+-(d|h), you need to have the timestamp of the last run"
echo "stored in the file $lastrun in the current directory. To get the appropriate"
echo "timestamp, run"
echo 'date +%s -d "yyyy-mm-dd hh:mm:ss +0000" > $lastrun'
echo "Then run this script again."
exit 1
}
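# checkformat: parse a date argument of the form (today|lastrun)[+|-N[d|h]] or an
# absolute yyyy-mm-ddThh:mm:ssZ timestamp, and leave the corresponding epoch
# seconds in the global variable secs for the caller to pick up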
checkformat() {
local d
d="$1"
if [ -z "$d" ]; then
secs=`date +%s`
return 0
fi
hasZ=`echo $1 | grep Z`
if [ ! -z "$hasZ" ]; then
# format like: 2008-01-23T08:00:00Z
# convert to: 2008-01-23 08:00:00 +0000
reformatted=`echo $1 | sed -e 's/T/ /; s/Z/ +0000/;'`
secs=`date --date="$reformatted" +%s`
return 0
fi
minus=`echo "$d" | grep -e '-'`
plus=`echo "$d" | grep -e '+'`
if [ ! -z "$minus" ]; then
op="-"
elif [ ! -z "$plus" ]; then
op="+"
else
op=""
fi
if [ -z "$op" ]; then
basedate=$d
incr=0
incrtype="d"
else
basedate=`echo $d | awk -F"$op" '{ print $1 }'`
incr=`echo $d | awk -F"$op" '{ print $2 }'`
incrtype="d"
fi
if [ ! -z "$incr" ]; then
day=`echo "$incr" | grep 'd'`
hour=`echo "$incr" | grep 'h'`
if [ ! -z "$day" ]; then
incrtype="d"
elif [ ! -z "$hour" ]; then
incrtype="h"
fi
incr=`echo $incr | sed -e "s/$incrtype//"`
if [ -z "$incr" ]; then
incr='0'
fi
fi
case $basedate in
'today')
today=`date -u +"%Y-%m-%d %H:%M:%S +0000"`
secs=`date +%s -d "$today"`
;;
'lastrun')
if [ ! -e "$lastrun" ]; then
usage_lastrun
exit 1
fi
lastdaterun=`cat $lastrun`
testdate=`date -d @"$lastdaterun"`
if [ $? -ne 0 ]; then
usage_lastrun
fi
secs=`date +%s -d @"$lastdaterun"`
;;
*)
usage
;;
esac
case $incrtype in
'd')
incr=$(( $incr*86400 ))
;;
'h')
incr=$(( $incr*3600 ))
;;
*)
;;
esac
case $op in
'-')
secs=$(( $secs-$incr ))
;;
'+')
secs=$(( $secs+$incr ))
;;
'')
;;
*)
usage
esac
return 0
}
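# convert both command line arguments to epoch seconds; each call leaves its
# result in the global variable secs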
checkformat "$1"
startdatesecs=$secs
checkformat "$2"
enddatesecs=$secs
ext=`date +%m-%d-%Y -d @$startdatesecs`
globstartdate=`date -u -d @$startdatesecs +"%Y-%m-%dT%H:%M:%SZ"`
globenddate=`date -u -d @$enddatesecs +"%Y-%m-%dT%H:%M:%SZ"`
lastdaterun="$startdatesecs"
me=`basename $0`
mkdir -p $tmp
changes="$tmp/changes.$ext"
moves="$tmp/moves.$ext"
imports="$tmp/imports.$ext"
uploads="$tmp/uploads.$ext"
deletes="$tmp/deletes.$ext"
pages="$tmp/pages.$ext"
titles="$tmp/titles.$ext"
full="$tmp/full.$ext"
lastfull="$snapshotdir/$snapshot"
case $me in
'getchanges.sh')
do="changes"
;;
'getmoves.sh')
do="moves"
;;
'getimports.sh')
do="imports"
;;
'getuploads.sh')
do="uploads"
;;
'getdeletes.sh')
do="deletes"
;;
'getpages.sh')
do="pages"
;;
'domerges.sh')
do="merges"
;;
*)
rm -f $titles.* $pages.* $deletes.* $moves.* $imports.* $uploads.* $changes.*
do="all"
;;
esac
if [ "$do" != "all" ]; then
rm -f "$tmp/$do".*
fi
# recent changes
url1a="https://${wiki}/w/api.php?action=query&list=recentchanges&continue=&rclimit=500&rctype=new|edit&format=xml&rcstart=${globstartdate}&rcend=${globenddate}"
url2a="https://${wiki}/w/api.php?action=query&list=recentchanges&continue=&rclimit=500&rctype=new|edit&format=xml&rcstart=${globstartdate}&rcend=${globenddate}&rcnamespace=0"
url1=$url1a
url2=$url2a
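# url1a asks the API for new pages and edits in all namespaces (used for the
# *withusers snapshot types); url2a adds rcnamespace=0 so only the main
# namespace is fetched; results come back 500 at a time and are paged through
# with the rccontinue value returned by each request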
echo "getting recent changes from $globstartdate to $globenddate"
while [ 1 ]; do
if [ "$do" != "changes" ] && [ "$do" != "all" ]; then
break;
fi
# get the next lines from the recent changes log
case "$snapshottype" in
"fullwithusers" | "titleswithusers" )
curl --retry 10 -H "Expect:" -f $url1 > $changes.raw
;;
"full" | "titles")
curl --retry 10 -H "Expect:" -f $url2 > $changes.raw
;;
*)
echo "Unknown snapshot type. Please check your configuration file and"
echo "run this step again."
exit 1
esac
rc=$?
if [ $rc -ne 0 ]; then
echo "Error $rc from curl, unable to get recent changes, bailing"
exit 1
fi
#if [ -e "$changes.cmp" ]; then
# aredone=`cmp $changes.raw $changes.cmp`
# if [ -z "$aredone" ]; then
# break;
# fi
#fi
#cp $changes.raw $changes.cmp
cat $changes.raw >> $changes.raw.save
# get the titles
case "$snapshottype" in
"fullwithusers" | "full")
cat $changes.raw | sed -e 's/>/>\n/g;' | grep '<rc type=' | awk -F\" '{ print $6 }' >> $titles.txt
;;
"titleswithusers" | "titles" | *)
cat $changes.raw | sed -e 's/>/>\n/g;' | grep '<rc type=' | awk -F\" '{ print $16 " " $6 }' >> $titles.txt
;;
esac
# get the timestamp from the rccontinue line
nextstartdate=`cat $changes.raw | sed -e 's/>/>\n/g;' | grep 'rccontinue=' | awk -F\" '{ print $2 }'`
# if it is empty... we are done
if [[ -z "$nextstartdate" ]]; then
break
fi
url1="${url1a}&rccontinue=${nextstartdate}"
url2="${url2a}&rccontinue=${nextstartdate}"
sleep $logsecs
done
# moves
url1a="https://${wiki}/w/api.php?action=query&list=logevents&letype=move&continue=&lelimit=500&format=xml&lestart=${globstartdate}&leend=${globenddate}"
url1=$url1a
echo "getting moves from $globstartdate to $globenddate"
while [ 1 ]; do
if [ "$do" != "moves" ] && [ "$do" != "all" ]; then
break;
fi
# get the next lines from the move log
curl --retry 10 -H "Expect:" -f $url1 > $moves.raw
rc=$?
if [ $rc -ne 0 ]; then
echo "Error $rc from curl, unable to get moves, bailing"
exit 1
fi
#if [ -e "$moves.cmp" ]; then
# aredone=`cmp $moves.raw $moves.cmp`
# if [ -z "$aredone" ]; then
# break;
# fi
#fi
#cp $moves.raw $moves.cmp
cat $moves.raw >> $moves.raw.save
# get the titles
# only the full case has had its results checked; the rest are untested
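# note: the awk field numbers below select attribute values by their position in
# the <item ...> line, so they depend on the attribute order the API returns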
case "$snapshottype" in
"fullwithusers")
cat $moves.raw | sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' | grep '<item logid' | awk -F\" '{ print $6 }' >> $titles.txt
cat $moves.raw | sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' | grep '<item logid' | awk -F\" '{ print $24 }' >> $titles.txt
;;
"full")
cat $moves.raw | sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' | grep '<item logid' | awk -F\" '{ print $24 }' | grep -v ':' >> $titles.txt
cat $moves.raw | sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' | grep '<item logid' | grep -v 'suppressredirect' | awk -F\" '{ print $6 }' | grep -v ':' >> $titles.txt
cat $moves.raw | sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' | grep '<item logid' | grep 'suppressredirect' | grep 'ns="0"' | awk -F\" '{ print $18 " " $6 }' >> $deletes.xml
;;
"titleswithusers")
cat $moves.raw | sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' | grep '<item logid' | awk -F\" '{ print $16 " " $8 }' >> $titles.txt
cat $moves.raw | sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' | grep '<item logid' | awk -F\" '{ print $16 " " $22 }' >> $titles.txt
;;
"titles")
cat $moves.raw | sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' | grep '<item logid' | awk -F\" '{ print $16 " " $8 }' | grep -v ':' >> $titles.txt
cat $moves.raw | sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' | grep '<item logid' | awk -F\" '{ print $16 " " $22 }' | grep -v ':' >> $titles.txt
;;
*)
;;
esac
# get the timestamp
nextstartdate=`cat $moves.raw | sed -e 's/>/>\n/g;' | grep 'lecontinue=' | awk -F\" '{ print $2 }'`
# if it is empty... we are done
if [[ -z "$nextstartdate" ]]; then
break
fi
url1="${url1a}&lecontinue=${nextstartdate}"
sleep $logsecs
done
# imports
url1a="https://${wiki}/w/api.php?action=query&list=logevents&letype=import&continue=&lelimit=500&format=xml&lestart=${globstartdate}&leend=${globenddate}"
url1=$url1a
echo "getting moves from $globstartdate to $globenddate"
while [ 1 ]; do
if [ "$do" != "imports" ] && [ "$do" != "all" ]; then
break;
fi
# get the next lines from the import log
curl --retry 10 -H "Expect:" -f $url1 > $imports.raw
rc=$?
if [ $rc -ne 0 ]; then
echo "Error $rc from curl, unable to get imports, bailing"
exit 1
fi
#if [ -e "$imports.cmp" ]; then
# aredone=`cmp $imports.raw $imports.cmp`
# if [ -z "$aredone" ]; then
# break;
# fi
#fi
#cp $imports.raw $imports.cmp
cat $imports.raw >> $imports.raw.save
# get the titles
case "$snapshottype" in
"fullwithusers")
cat $imports.raw | sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' | grep '<item logid' | awk -F\" '{ print $6 }' >> $titles.txt
;;
"full")
cat $imports.raw | sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' | grep '<item logid' | awk -F\" '{ print $6 }' | grep -v ':' >> $titles.txt
;;
"titleswithusers")
cat $imports.raw | sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' | grep '<item logid' | awk -F\" '{ print $18 " " $6 }' >> $titles.txt
;;
"titles")
cat $imports.raw | sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' | grep '<item logid' | awk -F\" '{ print $18 " " $6 }' | grep -v ':' >> $titles.txt
;;
*)
;;
esac
# get the timestamp from the last line
nextstartdate=`cat $imports.raw | sed -e 's/>/>\n/g;' | grep 'lecontinue=' | awk -F\" '{ print $2 }'`
# if it is empty... we are done
if [[ -z "$nextstartdate" ]]; then
break
fi
url1="${url1a}&lecontinue=${nextstartdate}"
sleep $logsecs
done
# uploads
url1a="https://${wiki}/w/api.php?action=query&list=logevents&letype=upload&continue=&lelimit=500&format=xml&lestart=${globstartdate}&leend=${globenddate}"
url1=$url1a
echo getting uploads
while [ 1 ]; do
if [ "$do" != "uploads" ] && [ "$do" != "all" ]; then
break;
fi
# get the next lines from the upload log
curl --retry 10 -H "Expect:" -f $url1 > $uploads.raw
rc=$?
if [ $rc -ne 0 ]; then
echo "Error $rc from curl, unable to get uploads, bailing"
exit 1
fi
#if [ -e "$uploads.cmp" ]; then
# aredone=`cmp $uploads.raw $uploads.cmp`
# if [ -z "$aredone" ]; then
# break;
# fi
#fi
#cp $uploads.raw $uploads.cmp
cat $uploads.raw >> $uploads.raw.save
# get the titles
case "$snapshottype" in
"fullwithusers")
cat $uploads.raw | sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' | grep '<item logid' | awk -F\" '{ print $6 }' >> $titles.txt
;;
"full")
cat $uploads.raw | sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' | grep '<item logid' | awk -F\" '{ print $6 }'| grep -v ':' >> $titles.txt
;;
"titleswithusers")
cat $uploads.raw | sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' | grep '<item logid' | awk -F\" '{ print $18 " " $6 }' >> $titles.txt
;;
"titles")
cat $uploads.raw | sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' | grep '<item logid' | awk -F\" '{ print $18 " " $6 }'| grep -v ':' >> $titles.txt
;;
*)
;;
esac
# get the timestamp
nextstartdate=`cat $uploads.raw | sed -e 's/>/>\n/g;' | grep 'lecontinue=' | awk -F\" '{ print $2 }'`
# if it is empty... we are done
if [[ -z "$nextstartdate" ]]; then
break
fi
url1="${url1a}&lecontinue=${nextstartdate}"
sleep $logsecs
done
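# for the full snapshot types, sort and deduplicate the accumulated titles and
# fetch the current text of those pages through Special:Export, 500 titles per
# request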
if [ "$snapshottype" == "fullwithusers" ] || [ "$snapshottype" == "full" ]; then
# pages with the given titles
mv $titles.txt $titles.txt-temp
cat $titles.txt-temp | ./sort.pl | ./uniq.pl > $titles.txt
count=1
while [ 1 ]; do
if [ "$do" != "pages" ] && [ "$do" != "all" ]; then
break;
fi
echo "getting pages $count to $(( $count+499 ))"
# next 500
tail -n +$count $titles.txt | head -n 500 > $titles.500.txt
left=`cat $titles.500.txt | wc -l`
if [ $left == "0" ]; then
break;
fi
count=$(( $count+500 ))
curl --retry 10 -H "Expect:" -f -F "curonly=1" -F "wpDownload=1" -F "pages=<$titles.500.txt" "https://$wiki/w/index.php?title=Special:Export&action=submit" > $pages.xml-temp
rc=$?
if [ $rc -ne 0 ]; then
echo "Error $rc from curl, unable to get xml pages, bailing"
exit 1
fi
if [ -e "$pages.xml" ]; then
mv $pages.xml $pages.xml-old
fi
# put it in front of the older batch, and back into the same filename
# (so most recent revs are at the beginning)
if [ -e "$pages.xml-old" ]; then
cat $pages.xml-temp $pages.xml-old > $pages.xml
else
cat $pages.xml-temp > $pages.xml
fi
sleep $pagesecs
done
fi
# deletes
url1a="https://${wiki}/w/api.php?action=query&list=logevents&letype=delete&continue=&lelimit=500&format=xml&lestart=${globstartdate}&leend=${globenddate}"
url1=$url1a
echo getting deletes
while [ 1 ]; do
if [ "$do" != "deletes" ] && [ "$do" != "all" ]; then
break;
fi
# get next lines from delete log
curl --retry 10 -H "Expect:" -f $url1 > $deletes.raw
rc=$?
if [ $rc -ne 0 ]; then
echo "Error $rc from curl, unable to get deletes, bailing"
exit 1
fi
#if [ -e "$deletes.cmp" ]; then
# aredone=`cmp $deletes.raw $deletes.cmp`
# if [ -z "$aredone" ]; then
# break;
# fi
#fi
#cp $deletes.raw $deletes.cmp
cat $deletes.raw >> $deletes.raw.save
# create new batch of timestamp, title for each delete record
# we don't bother to filter these based on snapshot type
cat $deletes.raw | sed -e 's/>/>\n/g;' | grep '<item logid' | grep -v 'action="revision"' | awk -F\" '{ print $18 " " $6 }' | sed "s/&#039;/'/g" >> $deletes.xml
# get the timestamp from the last line
nextstartdate=`cat $deletes.raw | sed -e 's/>/>\n/g;' | grep 'lecontinue=' | awk -F\" '{ print $2 }'`
# if it is empty... we are done
if [[ -z "$nextstartdate" ]]; then
break
fi
url1="${url1a}&lecontinue=${nextstartdate}"
sleep $logsecs
done
if [ "$do" != "merges" ] && [ "$do" != "all" ]; then
echo "done!"
exit 0;
fi
# merges of new pages, changed pages, and deletes
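# the previous snapshot ($lastfull, possibly bzip2 compressed) is merged with what
# was fetched above: newer pages or titles from this run replace older copies,
# pages deleted since then are dropped, and the result is written back as the new
# $lastfull plus a dated copy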
if [ ! -e ./merge-pages-main-and-export.pl ] || [ ! -e ./merge-deletes.pl ]; then
echo "One or more of the required scripts for this file are missing:"
echo "merge-pages-main-and-export.pl or merge-deletes.pl."
echo "Please make sure that they are all in the directory from where you are giving the"
echo "command $0. "
exit 1
fi
if [ "$snapshottype" == "titleswithusers" ] || [ "$snapshottype" == "titles" ]; then
if [ ! -e "$lastfull" ] && [ ! -e "$lastfull.bz2" ]; then
echo "$lastfull{.bz2} does not exist. Please copy your last full incremental into this file"
echo "and run this script again as domerges.sh $1 $2 $3 in order to finish this last step."
echo "You can either compress it as a bz2 file or leave it uncompressed."
exit 1
fi
if [ -e "$lastfull.bz2" ]; then
compressed="true"
fi
# full xml files have this in their first line
if [ -z "$compressed" ]; then
isxml=`head -1 $lastfull | egrep '<page|<mediawiki'`
else
isxml=`bzcat $lastfull.bz2 | head -1 | egrep '<page|<mediawiki'`
fi
if [ ! -z "$isxml" ]; then
echo "generating titles from standard xml file..."
# we must get the ts and title from the xml file and stuff it somewhere.
if [ -z "$compressed" ]; then
cat "$lastfull" | ./full2titles.pl > "$lastfull.titles"
mv "$lastfull" "$lastfull.sav"
else
bzcat "$lastfull" | ./full2titles.pl > "$lastfull.titles"
mv "$lastfull.bz2" "$lastfull.bz2.sav"
fi
mv "$lastfull.titles" "$lastfull"
compressed=""
fi
# drop entries from the full titles list when the copy there is older, then append the rest
# (the newer titles from this run's logs) at the end.
echo rewriting full titles list
if [ -z "$compressed" ]; then
cat "$lastfull" | ./merge-pages-main-and-export-titles.pl $titles.txt > $full-titles.xml-temp
else
cat "$lastfull.bz2" | ./merge-pages-main-and-export-titles.pl $titles.txt > $full-titles.xml-temp
fi
# process the deletes
echo processing deletes
cat $full-titles.xml-temp | ./merge-deletes-titles.pl $deletes.xml > $full-titles.xml
echo copying files into place
# set up new full to be the next file we use
cp $full-titles.xml $lastfull
# don't do this until the end, in case of failure
echo "$lastdaterun" > "$lastrun"
cp $full-titles.xml full-titles.$ext.xml
echo new full titles list is now in place at full-titles.$ext.xml and $lastfull
else
# lose those pages from the full xml dump if the pages in full are older. then cat the rest
# (from the exports that are newer) on the end.
echo rewriting full dump
if [ -e "$lastfull" ]; then
cat "$lastfull" | ./merge-pages-main-and-export.pl $pages.xml | bzip2 > $full.xml.bz2-temp
elif [ -e "$lastfull.bz2" ]; then
bzcat "$lastfull.bz2" | ./merge-pages-main-and-export.pl $pages.xml | bzip2 > $full.xml.bz2-temp
else
echo ""$lastfull{.bz2}" does not exist. Please copy your last full incremental into this file"
echo "and run this script again as domerges.sh $1 $2 $3 in order to finish this last step."
echo "You can copy the uncompressed xml file or you can copy it as a bz2 file."
exit 1
fi
# process the deletes
echo processing deletes
bzcat $full.xml.bz2-temp | ./merge-deletes.pl $deletes.xml | bzip2 > $full.xml.bz2
echo copying files into place
# set up new full to be the next file we use
cp $full.xml.bz2 $lastfull.bz2
# don't do this until the end, in case of failure
echo "$lastdaterun" > last_run
cp $full.xml.bz2 full.$ext.xml.bz2
echo new full is now in place at full.$ext.xml.bz2 and $lastfull.bz2
fi
# done!
echo "done!"
exit 0
sort.pl
#!/usr/bin/perl
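# sort.pl: read UTF-8 lines from stdin and write them to stdout in sorted order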
binmode(STDOUT, ":utf8");
binmode(STDIN, ":utf8");
use encoding 'utf8';
use utf8;
print foreach sort <STDIN>;
uniq.pl
#!/usr/bin/perl
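# uniq.pl: read UTF-8 lines from stdin, dropping consecutive duplicates
# (expects sorted input, as produced by sort.pl)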
binmode(STDOUT, ":utf8");
binmode(STDIN, ":utf8");
use encoding 'utf8';
use utf8;
$prevline="";
while (<STDIN>) {
if ($_ ne $prevline) {
print;
$prevline=$_;
}
}
merge-pages-main-and-export.pl
#!/usr/bin/perl
binmode(STDOUT, ":utf8");
binmode(STDIN, ":utf8");
use encoding 'utf8';
use utf8;
# get the new titles
# read the old file
# when we find a page whose title is not in the list, we write it out
unless (@ARGV) {
die "Usage: $0 filename-of-exported-pages\n";
}
$filename=shift;
# hash of titles with timestamps from file...
open(FILE,'<',$filename) or die("can't open file $filename\n");
binmode(FILE, ":utf8");
while (<FILE>) {
if (/<title>(.*)<\/title>/) {
$temptitle=$1;
}
elsif (/<timestamp>(.*)<\/timestamp>/) {
$temptimestamp=$1;
$titles{$temptitle}=$temptimestamp;
}
}
close(FILE);
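# compareem: compare two MediaWiki timestamps (yyyy-mm-ddThh:mm:ssZ) by stripping
# the punctuation and comparing the remaining digits numerically; the result is
# negative when the first timestamp is older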
sub compareem {
my($ts1,$ts2) = @_;
$ts1 =~ s/[-:TZ]//g ;
$ts2 =~ s/[-:TZ]//g ;
return $ts1 <=> $ts2;
}
$text="";
while (<STDIN>) {
$text.=$_;
if (/<page>/) {
$text = $_;
$title="";
}
elsif (/<title>(.*)<\/title>/) {
$title=$1;
}
elsif (/<timestamp>(.*)<\/timestamp>/) {
$timestamp=$1;
}
elsif (/<\/page>/) {
$result = compareem($titles{$title}, $timestamp);
# compare our timestamp with the one from titles... if ours is later, we write it
if ($result < 0) {
print $text;
if ($titles{$title}) {
$wrote{$title}=1;
}
}
else {
$skipped{$title}=1;
}
}
}
# reopen file, we are going to read the stuff from it and skip the titles that we
# wrote already but write the rest
open(FILE,'<',$filename) or die("can't open file $filename\n");
binmode(FILE, ":utf8");
while (<FILE>) {
$text.=$_;
if (/<page>/) {
$text = $_;
$title="";
}
elsif (/<title>(.*)<\/title>/) {
$title=$1;
}
elsif (/<\/page>/) {
# full file had the page, but it was an older copy
if ($skipped{$title} > 0) {
print $text;
}
# full file didn't have the page.
elsif (!$wrote{$title}) {
print $text;
}
# full file had the page and it was newer...
}
}
close(FILE);
merge-deletes.pl
#!/usr/bin/perl
binmode(STDOUT, ":utf8");
binmode(STDIN, ":utf8");
use encoding 'utf8';
use utf8;
# get the deletions
# read the old file
# when we find a page whose title was not deleted later, we write it out
unless (@ARGV) {
die "Usage: $0 filename-of-deletions\n";
}
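# the deletions file holds one "timestamp title" pair per line, as written to
# deletes.xml by getrcs.sh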
$filename=shift;
# hash of titles with timestamps from file...
open(FILE,'<',$filename) or die("can't open file $filename\n");
binmode(FILE, ":utf8");
while (<FILE>) {
chomp;
($timestamp,$title) = split(/ /,$_,2);
$titles{$title}=$timestamp;
}
close(FILE);
sub compareem {
my($ts1,$ts2) = @_;
$ts1 =~ s/[-:TZ]//g ;
$ts2 =~ s/[-:TZ]//g ;
return $ts1 <=> $ts2;
}
$text="";
while (<STDIN>) {
$text.=$_;
if (/<page>/) {
$text = $_;
$title="";
}
elsif (/<title>(.*)<\/title>/) {
$title=$1;
}
elsif (/<timestamp>(.*)<\/timestamp>/) {
$timestamp=$1;
}
elsif (/<\/page>/) {
$result = compareem($titles{$title}, $timestamp);
# compare our timestamp with the one from titles... if ours is later, we write it
if ($result < 0) {
print $text;
if ($titles{$title}) {
$wrote{$title}=1;
}
}
else {
$skipped{$title}=1;
}
}
}
config.txt
# configuration file for wiki snapshots
# change me to your project
#wiki="en.wiktionary.org"
wiki="el.wiktionary.org"
#change me to name of export page
expurl='Special:Export'
#expurl='Ειδικό:Export'
#change me to type of snapshot
#one of: fullwithusers full titleswithusers titles
# fullwithusers means current copies of everything
# full means current copies of namespace 0
# titleswithusers means just the current titles of everything
# titles means just the current titles of namespace 0
snapshottype=fullwithusers
#how many seconds to sleep between log requests
logsecs=2
#how many seconds to sleep between requests of 500 pages
pagesecs=5
#work dir where all intermediate files will live
tmp="./tmp"
#name of file where snapshot will be stored
snapshot="last_full.xml"
#directory where snapshot will be stored
snapshotdir="."
#name of file where we will keep date of last run
lastrun="last_run"
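For reference, a minimal sketch of typical invocations once config.txt is in place (the symlink relies on the basename dispatch in getrcs.sh, and myconfig.txt is a hypothetical alternate configuration file):

# full run: fetch everything changed since the previously recorded run
./getrcs.sh today lastrun
# rerun only the final merge step, for example after a failure, using another config
ln -s getrcs.sh domerges.sh
./domerges.sh today lastrun myconfig.txt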