#!/bin/bash
# Print the command-line help text to stdout and terminate the script
# with exit status 1. Takes no arguments; $0 is shown as the program name.
usage() {
  local self=$0
  printf '%s\n' \
    "Usage: $self namespace" \
    "where namespace is the number of the namespace from which to retrieve titles" \
    "" \
    "For example:" \
    "$self 0 for the main namespace" \
    "$self 1 for the Talk namespace" \
    "$self 2 for the User namespace" \
    "$self 3 for the User talk namespace" \
    "$self 4 for the Βικιλεξικό namespace" \
    "$self 5 for the Βικιλεξικό talk namespace" \
    "$self 10 for the Template namespace" \
    "$self 11 for the Template talk namespace"
  exit 1
}
# --- Argument check and working-file setup ---------------------------------
# The namespace argument is mandatory; without it, show the help and exit.
if [ -z "$1" ]; then
  usage
fi

# Replace spaces with underscores. Parameter expansion is used instead of
# `echo "$1" | sed` because echo would treat an argument such as "-n" or
# "-e" as one of its own options and emit nothing.
namesp=${1// /_}

# All output goes under ./namespace_tmp, in files tagged with today's date
# (e.g. titles.January-05-2024.xml).
tmp="./namespace_tmp"
today=$(date +"%B-%d-%Y")
ext="$today"
if ! mkdir -p "$tmp"; then
  echo "unable to create directory $tmp, bailing" >&2
  exit 1
fi
titles="$tmp/titles.$ext"

# Paging state for the download loop: continuation title (empty = start from
# the beginning), batch size, and the 1-based index of the first title in the
# batch about to be fetched.
apfrom=""
step=500
count=1

# Discard any output left over from an earlier run on the same day, because
# the download loop appends to $titles.xml.
rm -f "$titles".*
# --- Download loop ----------------------------------------------------------
# Page through list=allpages for namespace $namesp, $step titles per request.
# Each response is split so that every XML tag sits on its own line, stored in
# $titles.xml.temp and appended to $titles.xml. The loop ends when a response
# carries no continuation title.
#
# NOTE(review): HTTPS is used because Wikimedia sites are believed to answer
# plain-HTTP requests with a redirect, which curl does not follow without -L.
# Confirm against the live site.
api_url="https://el.wiktionary.org/w/api.php"
while true; do
  echo "getting namespace $namesp titles $count to $(( count + step - 1 ))"
  # next 500 ($step)
  echo "$titles.xml.temp"

  # Let curl percent-encode every parameter value (-G turns the data into a
  # query string). Hand-encoding only ' ', '&' and '#' left characters such
  # as '+', '%' and '?' in continuation titles unescaped.
  query=(
    --data-urlencode "action=query"
    --data-urlencode "list=allpages"
    --data-urlencode "apnamespace=$namesp"
    --data-urlencode "aplimit=$step"
    --data-urlencode "format=xml"
  )
  if [ -n "$apfrom" ]; then
    query+=(--data-urlencode "apfrom=$apfrom")
  fi

  curl --retry 10 -H 'Expect:' -f -G "${query[@]}" "$api_url" \
    | sed -e 's/>/>\n/g;' > "$titles.xml.temp"
  # $? would be sed's status here; PIPESTATUS[0] is curl's. It must be read
  # immediately, before any other command overwrites it.
  curl_status=${PIPESTATUS[0]}
  if [ "$curl_status" -ne 0 ]; then
    echo "Error $curl_status from curl, unable to get xml pages, bailing" >&2
    exit 1
  fi
  cat "$titles.xml.temp" >> "$titles.xml"

  # get continue param
  # format: <allpages apfrom="βήξιμο" />
  # Only an <allpages ...> tag is accepted, so a page title that happens to
  # contain the text "apfrom" cannot be mistaken for the continuation. The
  # attribute value is XML-escaped, so entities are decoded (&amp; last)
  # before the title is sent back to the API.
  # NOTE(review): newer MediaWiki releases may return the continuation as
  # "apcontinue" rather than "apfrom" - confirm against the live API.
  apfrom=$(
    sed -n 's/^[[:space:]]*<allpages[^>]* apfrom="\([^"]*\)".*$/\1/p' "$titles.xml.temp" \
      | head -n 1 \
      | sed -e "s/&quot;/\"/g; s/&#0*39;/'/g; s/&apos;/'/g; s/&lt;/</g; s/&gt;/>/g; s/&amp;/\\&/g"
  )
  if [ -z "$apfrom" ]; then
    break
  fi

  # Pause between requests to go easy on the server.
  sleep 6
  count=$(( count + step ))
done
# --- Title extraction -------------------------------------------------------
# Every page entry in the collected XML looks like
#   <p pageid="37881" ns="0" title="βέρος" />
# Splitting such a line on double quotes puts the title in field 6; each
# title is written to $titles.txt wrapped as a wikilink: [[title]].
awk -F'"' '/<p / { print "[[" $6 "]]" }' "$titles.xml" > "$titles.txt"

# Report completion and finish successfully.
echo "done!"
exit 0