Re: Bug#893397: adapt the cron scripts to use http:// instead of ftp://
Hello
I've been doing some tests and this is what I have, for now:
* the current script is in:
https://anonscm.debian.org/cgit/debwww/cron.git/tree/parts/1ftpfiles
(and it works, not sure when/if it will stop working...).
* There, changing ftp:// to http:// is not enough, because we were getting
files using wildcards (which worked for ftp:// but does not work for
http://).
* Relevant code is a function "wgetfiles" which is called in 3 ways:
wgetfiles "" "" ftp://${ftpsite}/debian/doc/ 2 doc
wgetfiles emacsen-common emacsen-common_*.deb
wgetfiles dpkg dpkg-doc_1.9.21_all.deb
http://snapshot.debian.org/archive/debian/20050312T000000Z/pool/main 7
And the specific wget call inside the function wgetfiles is, currently:
wget --timeout=60 --quiet --recursive --timestamping --no-host-directories \
--cut-dirs=${cutdirs} --directory-prefix=${prefix} \
${ftpurl}/${initial}/${namesrc}/${namebin}
I've learned that with wget we can use wildcards with --recursive and -A
"pattern" and we should probably add --no-parent to avoid the recursive go to
other places.
Then I've transformed the wget call into this:
wget -e robots=off --no-parent --timeout=60 --recursive --timestamping
--no-host-directories \
--cut-dirs=${cutdirs} --directory-prefix=${prefix} \
--reject "*.html*" -A "${namebin}" ${ftpurl}/${initial}/${namesrc}/
(see also the complete diff and the 'new' 1ftpfiles, attached).
This seems to work (I've run the script locally and later checked that the
files were downloaded in /srv/www.debian.org/cron/ftpfiles), but it needs
improvements, because:
* I've worked around the robots.txt with "-e robots=off", but I guess this is
not the correct/elegant/respectful way?
* wget downloads all the files and then removes the ones that don't match the
pattern specified with -A. Maybe there is a more efficient way to do this?
Cheers
--
Laura Arjona Reina
https://wiki.debian.org/LauraArjona
diff --git a/parts/1ftpfiles b/parts/1ftpfiles
index f4fcade..3bd4229 100755
--- a/parts/1ftpfiles
+++ b/parts/1ftpfiles
@@ -14,7 +14,7 @@ wget --timeout=60 --quiet --timestamping http://${ftpsite}/debian/indices/Mainta
[ -d $webtopdir/webwml/english/devel/wnpp ] || mkdir -p $webtopdir/webwml/english/devel/wnpp
ln -sf $crondir/ftpfiles/Maintainers $webtopdir/webwml/english/devel/wnpp/Maintainers
-ftpurlmain=ftp://${ftpsite}/debian/pool/main
+ftpurlmain=http://${ftpsite}/debian/pool/main
wgetfiles()
{
namesrc=$1 # source package name: dpkg
@@ -24,9 +24,9 @@ cutdirs=${4:-5} # number of / in <directories> to drop + 3: default 5
prefix=${5:-pool} # download directory
echo -n " ${namesrc}"
initial=$(echo "${namesrc}"|sed -e "s/^\(.\).*$/\1/")
-wget --timeout=60 --quiet --recursive --timestamping --no-host-directories \
+wget -e robots=off --no-parent --timeout=60 --recursive --timestamping --no-host-directories \
--cut-dirs=${cutdirs} --directory-prefix=${prefix} \
- ${ftpurl}/${initial}/${namesrc}/${namebin}
+ --reject "*.html*" -A "${namebin}" ${ftpurl}/${initial}/${namesrc}/
}
# needed for 7doc_updates
@@ -34,7 +34,7 @@ wget --timeout=60 --quiet --recursive --timestamping --no-host-directories \
# Refresh $crondir/ftpfiles/doc
rm -rf $crondir/ftpfiles/doc
-wgetfiles "" "" ftp://${ftpsite}/debian/doc/ 2 doc
+wgetfiles "" "" http://${ftpsite}/debian/doc/ 2 doc
# Refresh $crondir/ftpfiles/pool
rm -rf $crondir/ftpfiles/pool
#!/bin/sh -e
# This script fetches some files from the Debian mirror (formerly over FTP,
# now over HTTP) and puts them in /srv/www.debian.org/cron/ftpfiles.

# common.sh defines $crondir and $webtopdir (set outside this file).
. "$(dirname "$0")"/../common.sh

[ -d "$crondir/ftpfiles" ] || mkdir -p "$crondir/ftpfiles"
cd "$crondir/ftpfiles"

# Mirror to download from.
ftpsite=ftp.de.debian.org

# needed for WNPP, webwml/english/devel/wnpp/wnpp.pl
wget --timeout=60 --quiet --timestamping "http://${ftpsite}/debian/indices/Maintainers"
[ -d "$webtopdir/webwml/english/devel/wnpp" ] || mkdir -p "$webtopdir/webwml/english/devel/wnpp"
ln -sf "$crondir/ftpfiles/Maintainers" "$webtopdir/webwml/english/devel/wnpp/Maintainers"

# Base URL for the pool/main hierarchy; default third argument of wgetfiles().
ftpurlmain=http://${ftpsite}/debian/pool/main
# Download binary package files matching a glob from the mirror.
#
#   $1 - source package name (e.g. dpkg); its first letter selects the
#        pool subdirectory (pool/main/d/dpkg/...)
#   $2 - binary package file glob (e.g. "dpkg-doc_*.deb"), passed to wget -A
#   $3 - base URL (default: $ftpurlmain)
#   $4 - number of leading path components to drop via --cut-dirs (default: 5)
#   $5 - local download directory prefix (default: pool)
wgetfiles()
{
namesrc=$1
namebin=$2
ftpurl=${3:-$ftpurlmain}
cutdirs=${4:-5}
prefix=${5:-pool}
echo -n " ${namesrc}"
# First letter of the source package name: the pool is laid out as
# pool/main/<initial>/<source>/ .
initial=$(echo "${namesrc}" | sed -e "s/^\(.\).*$/\1/")
# HTTP (unlike FTP) cannot expand wildcards server-side, so fetch the
# directory recursively and let wget filter: -A keeps files matching the
# package glob, --reject drops the index HTML. --no-parent keeps the
# recursion inside the package directory, and -e robots=off is needed
# because the mirror's robots.txt otherwise forbids recursive retrieval.
# --quiet restored (the FTP->HTTP rewrite dropped it): this runs from
# cron and must not emit wget progress output.
wget -e robots=off --no-parent --timeout=60 --quiet --recursive --timestamping --no-host-directories \
  --cut-dirs="${cutdirs}" --directory-prefix="${prefix}" \
  --reject "*.html*" -A "${namebin}" "${ftpurl}/${initial}/${namesrc}/"
}
# needed for 7doc_updates
# NOTE(review): this used to be FTP "because otherwise we get all those ugly
# HTML thingies"; the --reject "*.html*" in wgetfiles now filters those out.

# Refresh $crondir/ftpfiles/doc
rm -rf "$crondir/ftpfiles/doc"
wgetfiles "" "" "http://${ftpsite}/debian/doc/" 2 doc

# Refresh $crondir/ftpfiles/pool
rm -rf "$crondir/ftpfiles/pool"
# The glob patterns are quoted so the shell cannot expand them against files
# in the current directory; they must reach wget's -A option intact.
wgetfiles emacsen-common 'emacsen-common_*.deb'
wgetfiles build-essential 'build-essential_*_amd64.deb'
wgetfiles dpkg 'dpkg-doc_*.deb'
wgetfiles menu 'menu_*_amd64.deb'
wgetfiles java-policy 'java-policy_*.deb'
wgetfiles debian-policy 'debian-policy_*.deb'
wgetfiles python-defaults 'python_*_amd64.deb'
wgetfiles developers-reference 'developers-reference*.deb'
wgetfiles packaging-tutorial 'packaging-tutorial_*.deb'
wgetfiles refcard 'debian-refcard_*.deb'
wgetfiles debian-faq 'debian-faq*.deb' # including debian-faq-fr etc.
wgetfiles maint-guide 'maint-guide*.deb'
wgetfiles debian-reference 'debian-reference*.deb'
wgetfiles debmake-doc 'debmake-doc_*.deb'
wgetfiles apt 'apt-doc_*.deb'
wgetfiles aptitude 'aptitude-doc-*.deb'
wgetfiles kernel-handbook 'debian-kernel-handbook_*.deb'
wgetfiles debian-handbook 'debian-handbook_*.deb'
wgetfiles dbconfig-common 'dbconfig-common_*.deb'

# from obsolete dpkg-doc, pinned to an exact file on snapshot.debian.org
wgetfiles dpkg dpkg-doc_1.9.21_all.deb http://snapshot.debian.org/archive/debian/20050312T000000Z/pool/main 7

# Installation guide is taken from source package in lessoften-parts/1installation-guide
wgetfiles installation-guide 'installation-guide_*'

echo
echo "1ftpfiles finished (at $(date))"
echo
Reply to: