Χρήστης:ArielGlenn/findcateditsbyuser.sh
#!/bin/bash
# Get all changes to categories (optionally, by a specific user)
# for a certain time period.
# Produces: title list, page contents file, file with
# title and all lines that have links (you can look for iwikis in this file).
# Requirements: linux/unix environment, curl, grep, sed, awk, GNU date, bash.
#
# Arguments: $1 start date, $2 end date, $3 language code, $4 user (optional).
# Outputs (current directory): titles.<ext>.txt, pages.<ext>.xml,
# to-examine.<ext>.txt and last_run; scratch files go to ./tmp_user.

# File (in the current directory) holding the epoch timestamp of the last run.
readonly LAST_RUN_FILE='last_run'

# awk helper: attr(name) returns the value of XML attribute `name` found on
# the current line, or "" when the attribute is absent. Looking attributes up
# by name keeps the parsing independent of the order the API emits them in.
# NOTE: XML entities (&amp;, &quot;, ...) in the values are not decoded.
readonly AWK_ATTR='
function attr(name) {
  if (match($0, "[ \t]" name "=\"[^\"]*\""))
    return substr($0, RSTART + length(name) + 3, RLENGTH - length(name) - 4)
  return ""
}'

#######################################
# Print the help text to stderr and exit with status 1.
#######################################
usage() {
  cat >&2 <<EOF
Usage: $0 startdate enddate langcode user
where startdate is latest date from which to get changes
and enddate is the earliest date, in the local timezone.
The base date may be specified as either today, or lastrun,
where lastrun is the latest date you got changes from
during the previous run.

For example:
$0 today today-3d en
$0 today-1h today-5h en
$0 today lastrun en
If you omit the d or h the increment is interpreted as days

Alternatively you can specify absolute timestamps.
These must be in the format yyyy-mm-ddThh:mm:ssZ
For example:
$0 2008-02-06T08:54:06Z 2008-01-23T08:00:00Z en
In this case the times are interpreted as UTC times.

The third argument is the language code of the wiktionary
you want to check.

The last argument, user, is the username of the user for which
you want to download changes. If you specify no user, all changes will
be downloaded.
For example:
$0 today today-3d el ArielGlenn
EOF
  exit 1
}

#######################################
# Explain how to create the last-run timestamp file, then exit with status 1.
#######################################
usage_lastrun() {
  cat >&2 <<EOF
In order to use lastrun+-(d|h), you need to have the timestamp of the last run
stored in the file $LAST_RUN_FILE in the current directory. To get the appropriate
timestamp, run
date +%s -d "yyyy-mm-dd hh:mm:ss +0000" > $LAST_RUN_FILE
Then run this script again.
EOF
  exit 1
}

#######################################
# Convert a date specification into seconds since the epoch.
# Globals:   secs (written) - the resulting epoch seconds
# Arguments: $1 - empty (means "now"), an absolute yyyy-mm-ddThh:mm:ssZ
#                 timestamp, or today|lastrun optionally followed by
#                 +N / -N with an optional d (days, default) or h (hours)
# Returns:   0 on success; exits via usage/usage_lastrun on bad input
#######################################
checkformat() {
  local d="$1"
  local reformatted op basedate incr incrtype rest stored

  if [[ -z "$d" ]]; then
    secs=$(date +%s)
    return 0
  fi

  if [[ "$d" == *Z* ]]; then
    # format like:  2008-01-23T08:00:00Z
    # converted to: 2008-01-23 08:00:00 +0000
    reformatted=$(printf '%s\n' "$d" | sed -e 's/T/ /; s/Z/ +0000/;')
    secs=$(date --date="$reformatted" +%s) || usage
    return 0
  fi

  # A '-' anywhere takes precedence over '+', as in the original logic.
  case "$d" in
    *-*) op='-' ;;
    *+*) op='+' ;;
    *)   op=''  ;;
  esac

  incrtype='d'
  if [[ -z "$op" ]]; then
    basedate="$d"
    incr=0
  else
    # basedate is the text before the first operator, incr the text between
    # the first and (if any) second operator.
    basedate="${d%%"$op"*}"
    rest="${d#*"$op"}"
    incr="${rest%%"$op"*}"
  fi

  if [[ "$incr" == *d* ]]; then
    incrtype='d'
  elif [[ "$incr" == *h* ]]; then
    incrtype='h'
  fi
  incr="${incr/$incrtype/}"
  if [[ -z "$incr" ]]; then
    incr=0
  fi
  # Anything that is not a plain number would break the arithmetic below.
  if [[ ! "$incr" =~ ^[0-9]+$ ]]; then
    usage
  fi

  case "$basedate" in
    today)
      secs=$(date +%s)
      ;;
    lastrun)
      if [[ ! -e "$LAST_RUN_FILE" ]]; then
        usage_lastrun
      fi
      stored=$(cat "$LAST_RUN_FILE")
      # Make sure the stored value really is an epoch timestamp.
      date -d "@$stored" > /dev/null || usage_lastrun
      secs=$(date +%s -d "@$stored")
      ;;
    *)
      usage
      ;;
  esac

  # 10# forces base ten so that an increment such as "08" is not read as octal.
  case "$incrtype" in
    d) incr=$(( 10#$incr * 86400 )) ;;
    h) incr=$(( 10#$incr * 3600 )) ;;
  esac

  case "$op" in
    '-') secs=$(( secs - incr )) ;;
    '+') secs=$(( secs + incr )) ;;
  esac

  return 0
}

#######################################
# Print file $1 with a newline inserted after every '>' so that each XML
# element starts on its own line (needs GNU sed for \n in the replacement).
#######################################
split_elements() {
  sed -e 's/>/>\n/g;' "$1"
}

if [[ -z "${1:-}" || -z "${2:-}" || -z "${3:-}" ]]; then
  usage
fi

lc="$3"
user="${4:-}"
# Only spaces are escaped; other URL-special characters are passed through.
user="${user// /%20}"

tmp="./tmp_user"

checkformat "$1"
startdatesecs=$secs
checkformat "$2"
enddatesecs=$secs

ext=$(date +%m-%d-%Y -d "@$startdatesecs")
ext="${ext}-${lc}"

globstartdate=$(date -u -d "@$startdatesecs" +"%Y-%m-%dT%H:%M:%SZ")
globenddate=$(date -u -d "@$enddatesecs" +"%Y-%m-%dT%H:%M:%SZ")

mkdir -p "$tmp"
changes="$tmp/changes.$ext"
pages="$tmp/pages.$ext"
titles="$tmp/titles.$ext"

rm -f "$titles".* "$pages".* "$changes".*

# recent changes
rcstartdate="$globstartdate"
rcenddate="$globenddate"

if [[ -n "$user" ]]; then
  tag='<item '
  want_user=0
else
  tag='<rc '
  want_user=1
fi

while true; do
  echo "getting recent changes $rcstartdate to $rcenddate"

  # fetch the next batch of lines from the recent changes log
  # NOTE(review): https is used on the assumption that the Wikimedia sites
  # redirect plain http, which curl -f without -L would save as the redirect
  # body — confirm.
  if [[ -n "$user" ]]; then
    url="https://$lc.wiktionary.org/w/api.php?action=query&list=usercontribs&uclimit=500&format=xml&ucstart=$rcstartdate&ucend=$rcenddate&ucnamespace=14&ucprop=timestamp|title|comment&ucuser=$user"
  else
    url="https://$lc.wiktionary.org/w/api.php?action=query&list=recentchanges&rclimit=500&rctype=new|edit&format=xml&rcstart=$rcstartdate&rcend=$rcenddate&rcnamespace=14&rcprop=timestamp|title|comment|user"
  fi

  # Capture curl's status right away; testing $? later would overwrite it.
  curl_status=0
  curl --retry 10 -H "Expect:" -f "$url" > "$changes.raw" || curl_status=$?
  if (( curl_status != 0 )); then
    echo "Error $curl_status from curl, unable to get recent changes, bailing" >&2
    exit 1
  fi

  # The same answer twice in a row means there is nothing new left to fetch.
  if [[ -e "$changes.cmp" ]] && cmp -s "$changes.raw" "$changes.cmp"; then
    break
  fi
  cp "$changes.raw" "$changes.cmp"
  cat "$changes.raw" >> "$changes.raw.save"

  # get the titles (title only for a single user, title|user otherwise)
  split_elements "$changes.raw" \
    | awk -v tag="$tag" -v want_user="$want_user" "${AWK_ATTR}"'
        index($0, tag) {
          if (want_user) print attr("title") "|" attr("user")
          else print attr("title")
        }' >> "$titles.txt"

  # get the timestamp from the last entry
  nextstartdate=$(
    split_elements "$changes.raw" \
      | awk -v tag="$tag" "${AWK_ATTR}"'
          index($0, tag) { ts = attr("timestamp") }
          END { print ts }'
  )

  # if it is empty... we are done
  if [[ -z "$nextstartdate" ]]; then
    break
  fi
  rcstartdate="$nextstartdate"
  sleep 6
done

# export
cp "$titles.txt" "titles.$ext.txt"

# Keep only the title column for the page export below.
mv "$titles.txt" "$titles.txt-temp"
awk -F'|' '{ print $1 }' "$titles.txt-temp" > "$titles.txt"

total=$(wc -l < "$titles.txt")
if (( total == 0 )); then
  echo
  echo ">>> No titles for language code $lc.  Done!"
  echo
  exit 0
fi

count=1
while true; do
  echo "getting pages $count to $(( count + 499 ))"

  # next 500
  tail -n "+$count" "$titles.txt" | head -n 500 > "$titles.500.txt"
  left=$(wc -l < "$titles.500.txt")
  if (( left == 0 )); then
    break
  fi
  count=$(( count + 500 ))

  curl_status=0
  curl --retry 10 -H "Expect:" -f -F "curonly=1" -F "wpDownload=1" \
    -F "pages=<$titles.500.txt" \
    "https://$lc.wiktionary.org/w/index.php?title=Special:Export&action=submit" \
    > "$pages.xml-temp" || curl_status=$?
  if (( curl_status != 0 )); then
    echo "Error $curl_status from curl, unable to get xml pages, bailing" >&2
    exit 1
  fi

  if [[ -e "$pages.xml" ]]; then
    mv "$pages.xml" "$pages.xml-old"
  fi
  # put it in front of the older batch, and back into the same filename
  # (so most recent revs are at the beginning)
  if [[ -e "$pages.xml-old" ]]; then
    cat "$pages.xml-temp" "$pages.xml-old" > "$pages.xml"
  else
    cat "$pages.xml-temp" > "$pages.xml"
  fi
  sleep 6
done

cp "$pages.xml" "pages.$ext.xml"
grep -E '(title|\[\[)' "pages.$ext.xml" > "to-examine.$ext.txt"

# Remember the start of this run so that "lastrun" works next time.
printf '%s\n' "$startdatesecs" > "$LAST_RUN_FILE"

# done!
echo "Titles of changed/new articles are now in titles.$ext.txt."
echo "Export file is pages.$ext.xml."
echo "Text to examine is to-examine.$ext.txt.  Done!"
exit 0
---
#!/bin/bash
# Run ./findcatvandal.sh for the window "today" back to "today-7" once per
# language code, sequentially and in the order listed below.
# NOTE(review): findcatvandal.sh is not defined in the visible part of this
# page - confirm it exists alongside this script.
langcodes=(
  aa ab af ak als am an ang ar as ast av ay az ba be bg bh bi bm bn bo br bs
  ca ch chr co cr cs csb cy da
  de dv dz el en eo es et eu fa fi fj fo fr fy ga gd gl gn gu gv ha he hi hr
  hsb hu hy ia id ie ik io is it
  iu ja jbo jv ka kk kl km kn ko ks ku kw ky la lb li ln lo lt lv mg mh mi mk
  ml mn mo mr ms mt my na nah
  nds ne nl nn no oc om or pa pi pl ps pt qu rm rn ro roa-rup ru rw sa sc scn
  sd sg sh si simple sk sl sm
  sn so sq sr ss st su sv sw ta te tg th ti tk tl tn to tpi tr ts tt tw ug uk
  ur
  uz vi vo wa wo xh yi yo za zh-min-nan zh zu
)

for code in "${langcodes[@]}"; do
  ./findcatvandal.sh today today-7 "$code"
done