#!/bin/bash

# get all changes to categories (optionally, by a specific user)
# for a certain time period

# produces: title list, page contents file, file with 
# title and all lines that have links (you can look for iwikis in this file)

# requirements: linux/unix environment, curl, egrep, bash

#######################################
# Print the command-line help text and terminate the script.
# Globals:   none
# Arguments: none (uses $0 as the program name in the examples)
# Outputs:   the help text, on stdout
# Exits:     always, with status 1
#######################################
usage() {
  # Unquoted heredoc delimiter on purpose: $0 must expand in the examples.
  cat <<EOF
Usage: $0 startdate enddate langcode [user]
where startdate is the latest date from which to get changes
and enddate is the earliest date, in the local timezone.
The base date may be specified as either today, or lastrun,
where lastrun is the latest date you got changes from
during the previous run.

For example:
$0 today today-3d en
$0 today-1h today-5h en
$0 today lastrun en
If you omit the d or h the increment is interpreted as days

Alternatively you can specify absolute timestamps.
These must be in the format yyyy-mm-ddThh:mm:ssZ
For example:
$0 2008-02-06T08:54:06Z  2008-01-23T08:00:00Z en
In this case the times are interpreted as UTC times.

The third argument is the language code of the wiktionary
you want to check.

The last argument, user, is the username of the user for which
you want to download changes.  If you specify no user, all changes will
be downloaded.
For example:
$0 today today-3d el ArielGlenn
EOF
  exit 1
}

# The first three arguments (start date, end date, language code) are mandatory.
if [[ -z "$1" || -z "$2" || -z "$3" ]]; then
  usage
fi

lc="$3"

# Optional fourth argument: restrict the query to one user's contributions.
# An empty value means "changes by all users".
user="${4:-}"

# Percent-encode spaces so the name can be embedded in the API query string.
# Parameter expansion is used instead of `echo $user | sed` so the value is
# never word-split, glob-expanded, or misread by echo as an option.
user="${user// /%20}"

#######################################
# Explain how to create the state file needed by the "lastrun" base date,
# then terminate the script.
# Globals:   none
# Arguments: none
# Outputs:   instructions, on stdout
# Exits:     always, with status 1
#######################################
usage_lastrun() {
    # Name of the state file that checkformat reads and the main script writes.
    local state_file='last_run'

    echo "In order to use lastrun+-(d|h), you need to have the timestamp of the last run"
    echo "stored in the file $state_file in the current directory.  To get the appropriate"
    echo "timestamp, run"
    echo "date +%s -d \"yyyy-mm-dd hh:mm:ss +0000\" > $state_file"
    echo "Then run this script again."
    exit 1
}

#######################################
# Convert a date specification into seconds since the Unix epoch.
#
# Accepted forms of $1:
#   (empty)                 the current time
#   yyyy-mm-ddThh:mm:ssZ    absolute UTC timestamp (any value containing "Z")
#   today[(+|-)N[d|h]]      now, shifted by N days (the default unit) or hours
#   lastrun[(+|-)N[d|h]]    the epoch stored in ./last_run, shifted likewise
#
# Globals:   secs (written) - the result; callers read it after the call
# Arguments: $1 - date specification
# Returns:   0 on success. Invalid input terminates the script through
#            usage or usage_lastrun.
#######################################
checkformat() {
    local d="$1"
    local reformatted op basedate incr incrtype stamp

    # No argument: use the current time.
    if [[ -z "$d" ]]; then
        secs=$(date +%s)
        return 0
    fi

    if [[ "$d" == *Z* ]]; then
        # Input looks like:  2008-01-23T08:00:00Z
        # Converted to:      2008-01-23 08:00:00 +0000
        reformatted=${d/T/ }
        reformatted=${reformatted/Z/ +0000}
        # Bail out with the help text if date cannot parse the timestamp.
        secs=$(date --date="$reformatted" +%s) || usage
        return 0
    fi

    # Relative form: find the operator. A minus sign takes precedence over a plus sign.
    if [[ "$d" == *-* ]]; then
        op='-'
    elif [[ "$d" == *+* ]]; then
        op='+'
    else
        op=''
    fi

    # Split "<base><op><increment>"; only the text between the first and the
    # second operator (if any) counts as the increment.
    incrtype='d'
    if [[ -z "$op" ]]; then
        basedate=$d
        incr=0
    else
        basedate=${d%%"$op"*}
        incr=${d#*"$op"}
        incr=${incr%%"$op"*}
    fi

    # Work out the unit (days unless an "h" is present) and strip its letter.
    case "$incr" in
        *d*) incrtype='d' ;;
        *h*) incrtype='h' ;;
    esac
    incr=${incr/"$incrtype"/}
    if [[ -z "$incr" ]]; then
        incr=0
    fi

    # Reject anything that is not a plain number, then force base 10 so that
    # values with leading zeros (e.g. "08") are not treated as octal.
    [[ "$incr" =~ ^[0-9]+$ ]] || usage
    incr=$(( 10#$incr ))

    case "$basedate" in
        today)
            secs=$(date +%s)
            ;;
        lastrun)
            if [[ ! -e last_run ]]; then
                usage_lastrun
            fi
            stamp=$(cat last_run)
            # Make sure the stored value is something date accepts as an epoch.
            if ! date -d "@$stamp" > /dev/null; then
                usage_lastrun
            fi
            secs=$(date +%s -d "@$stamp")
            ;;
        *)
            usage
            ;;
    esac

    # Scale the increment to seconds.
    case "$incrtype" in
        d) incr=$(( incr * 86400 )) ;;
        h) incr=$(( incr * 3600 )) ;;
    esac

    # Apply the shift; an empty operator leaves the base date untouched.
    case "$op" in
        -) secs=$(( secs - incr )) ;;
        +) secs=$(( secs + incr )) ;;
    esac

    return 0
}

# Scratch directory for intermediate downloads, relative to the current directory.
tmp="./tmp_user"

# Resolve both date arguments to epoch seconds; checkformat leaves its result in $secs.
checkformat "$1"
startdatesecs=$secs
checkformat "$2"
enddatesecs=$secs

# Stop early if either conversion did not yield a number, instead of building
# file names and API queries from empty or garbage timestamps.
if ! [[ "$startdatesecs" =~ ^-?[0-9]+$ && "$enddatesecs" =~ ^-?[0-9]+$ ]]; then
  echo "Unable to convert the start/end dates to timestamps, bailing" >&2
  exit 1
fi

# Suffix shared by all output files: local start date (mm-dd-yyyy) plus language code.
ext=$(date +%m-%d-%Y -d "@$startdatesecs")
ext=${ext}-${lc}

# Range boundaries as UTC timestamps in yyyy-mm-ddThh:mm:ssZ form for the API queries.
globstartdate=$(date -u -d "@$startdatesecs" +"%Y-%m-%dT%H:%M:%SZ")
globenddate=$(date -u -d "@$enddatesecs" +"%Y-%m-%dT%H:%M:%SZ")

# Written to last_run once the whole run has succeeded.
lastdaterun="$startdatesecs"
me=$(basename "$0")


mkdir -p "$tmp"
changes="$tmp/changes.$ext"
pages="$tmp/pages.$ext"
titles="$tmp/titles.$ext"

# Remove leftovers from an earlier run with the same suffix. The trailing .*
# stays outside the quotes so that it still globs.
rm -f -- "$titles".* "$pages".* "$changes".*

# Recent changes: the paging loop starts from the full requested range.
rcstartdate=$globstartdate
rcenddate=$globenddate

# Page through the change log in batches of up to 500 entries. Each pass
# restarts from the timestamp of the last entry received; the loop ends when a
# response yields no entries or is byte-identical to the previous response.
# NOTE(review): restarting at the last timestamp presumably re-fetches the
# boundary entry, so the titles file may contain duplicates - confirm.
# NOTE(review): the URLs use https instead of http; Wikimedia sites are
# understood to redirect plain HTTP and curl is not told to follow redirects
# here - confirm.
while true; do

  echo "getting recent changes $rcstartdate to $rcenddate"
  # Fetch the next batch of lines from the change log (category namespace, 14).
  if [[ -n "$user" ]]; then
    curl --retry 10 -H "Expect:" -f "https://$lc.wiktionary.org/w/api.php?action=query&list=usercontribs&uclimit=500&format=xml&ucstart=$rcstartdate&ucend=$rcenddate&ucnamespace=14&ucprop=timestamp|title|comment&ucuser=$user" > "$changes.raw"
    curl_status=$?
  else
    curl --retry 10 -H "Expect:" -f "https://$lc.wiktionary.org/w/api.php?action=query&list=recentchanges&rclimit=500&rctype=new|edit&format=xml&rcstart=$rcstartdate&rcend=$rcenddate&rcnamespace=14&rcprop=timestamp|title|comment|user" > "$changes.raw"
    curl_status=$?
  fi

  # Report curl's own exit status; reading $? inside the message would only
  # give the status of the test command.
  if (( curl_status != 0 )); then
    echo "Error $curl_status from curl, unable to get recent changes, bailing"
    exit 1
  fi

  # An answer identical to the previous one means there is nothing new left.
  if [[ -e "$changes.cmp" ]] && cmp -s "$changes.raw" "$changes.cmp"; then
    break
  fi
  cp "$changes.raw" "$changes.cmp"
  cat "$changes.raw" >> "$changes.raw.save"

  # Extract the titles: put each XML element on its own line, keep the entry
  # elements, and pick attribute values by position.
  # NOTE(review): the field numbers ($6, $8, $10) assume a fixed attribute
  # order in the API output - verify against a real response.
  if [[ -n "$user" ]]; then
    # User mode: only the title is recorded.
    sed -e 's/>/>\n/g;' "$changes.raw" | grep '<item user=' | awk -F\" '{ print $6 }' >> "$titles.txt"
    # Take the timestamp from the last entry.
    nextstartdate=$(sed -e 's/>/>\n/g;' "$changes.raw" | grep '<item user=' | awk -F\" '{ print $10 }' | tail -n 1)
  else
    # All-users mode: record "title|user".
    sed -e 's/>/>\n/g;' "$changes.raw" | grep '<rc type=' | awk -F\" '{ print $6 "|" $8 }' >> "$titles.txt"
    # Take the timestamp from the last entry.
    nextstartdate=$(sed -e 's/>/>\n/g;' "$changes.raw" | grep '<rc type=' | awk -F\" '{ print $10 }' | tail -n 1)
  fi

  # No timestamp means the batch was empty: we are done.
  if [[ -z "$nextstartdate" ]]; then
    break
  fi

  rcstartdate="$nextstartdate"
  # Pause between requests.
  sleep 6
done

# If no batch produced any entry the titles file was never created. Report
# that and stop here, rather than letting cp/mv/awk fail on a missing file.
if [[ ! -e "$titles.txt" ]]; then
  echo
  echo ">>> No titles for language code $lc. Done!"
  echo
  exit 0
fi

# Keep the collected list (as written by the fetch loop) in the current directory.
cp "$titles.txt" "titles.$ext.txt"

# Reduce the working copy to the first |-separated field (the title) so it can
# be fed to the page export.
mv "$titles.txt" "$titles.txt-temp"
awk -F'|' '{ print $1 }' "$titles.txt-temp" > "$titles.txt"

total=$(wc -l < "$titles.txt")
if [[ "$total" -eq 0 ]]; then
  echo
  echo ">>> No titles for language code $lc. Done!"
  echo
  exit 0
fi

# Download the current revision of every collected title through
# Special:Export, 500 titles per request.
# NOTE(review): the URL uses https instead of http; Wikimedia sites are
# understood to redirect plain HTTP, which would break this POST since curl is
# not told to follow redirects - confirm.
count=1
while true; do
    echo "getting pages $count to $(( count + 499 ))"
    # Next 500 titles, starting at line $count.
    tail -n "+$count" "$titles.txt" | head -n 500 > "$titles.500.txt"
    left=$(wc -l < "$titles.500.txt")
    if [[ "$left" -eq 0 ]]; then
        break
    fi
    count=$(( count + 500 ))

    # "pages=<file" makes curl send the file's contents as the form field value.
    curl --retry 10 -H "Expect:" -f -F "curonly=1" -F "wpDownload=1" -F "pages=<$titles.500.txt" "https://$lc.wiktionary.org/w/index.php?title=Special:Export&action=submit" > "$pages.xml-temp"
    curl_status=$?

    # Report curl's own exit status; reading $? inside the message would only
    # give the status of the test command.
    if (( curl_status != 0 )); then
        echo "Error $curl_status from curl, unable to get xml pages, bailing"
        exit 1
    fi

    # Put the new batch in front of the older ones, under the same file name
    # (so most recent revs are at the beginning).
    if [[ -e "$pages.xml" ]]; then
        mv "$pages.xml" "$pages.xml-old"
        cat "$pages.xml-temp" "$pages.xml-old" > "$pages.xml"
    else
        cat "$pages.xml-temp" > "$pages.xml"
    fi
    # Pause between requests.
    sleep 6
done

# Publish the export in the current directory and extract the lines worth
# reviewing: those containing "title" or a wiki link opener "[[".
cp "$pages.xml" "pages.$ext.xml"
# grep -E replaces the obsolescent egrep command; the pattern is unchanged.
grep -E '(title|\[\[)' "pages.$ext.xml" > "to-examine.$ext.txt"

# Remember the start of this run so a later invocation can use "lastrun".
echo "$lastdaterun" > last_run

# done!
echo "Titles of changed/new articles are now in titles.$ext.txt."
echo "Export file is pages.$ext.xml."
echo "Text to examine is to-examine.$ext.txt. Done!"
exit 0

---

#!/bin/bash
# Language codes of the wikis to scan, in processing order.
wikis=(
  aa ab af ak als am an ang ar as ast av ay az ba be bg bh bi bm bn bo br bs ca ch chr co cr cs csb cy da
  de dv dz el en eo es et eu fa fi fj fo fr fy ga gd gl gn gu gv ha he hi hr hsb hu hy ia id ie ik io is it
  iu ja jbo jv ka kk kl km kn ko ks ku kw ky la lb li ln lo lt lv mg mh mi mk ml mn mo mr ms mt my na nah
  nds ne nl nn no oc om or pa pi pl ps pt qu rm rn ro roa-rup ru rw sa sc scn sd sg sh si simple sk sl sm
  sn so sq sr ss st su sv sw ta te tg th ti tk tl tn to tpi tr ts tt tw ug uk ur
  uz vi vo wa wo xh yi yo za zh-min-nan zh zu
)

# Run the category-change check for each wiki, covering the last seven days.
for code in "${wikis[@]}"; do
  ./findcatvandal.sh today today-7 "$code"
done