#!/usr/bin/perl -w

# TODO: I'd like to be able to tell it to get some extra files, by name.
# I'm thinking Contents files. It would be really nice if it could pull
# a whole directory -- think project/trace, or disks-i386...
# TODO: It would probably be cleaner and easier to learn if it took
# apt-style lines to tell where to mirror from and what portions to use.

=head1 NAME

debmirror - Debian partial mirror script, with ftp and package pool support

=head1 SYNOPSIS

debmirror mirrordir [options]

=head1 DESCRIPTION

This program downloads and maintains a partial local Debian mirror. It can
mirror any combination of architectures, distributions, and sections. Files
are transferred by ftp, and package pools are fully supported. It also does
locking and updates trace files.

To support package pools, this program mirrors in three steps.

=over 4

=item 1. download Packages and Sources files

First it downloads all Packages and Sources files for the subset of Debian it
was instructed to get.

=item 2. clean up unknown files

Any files and directories on the local mirror that are not in the list are
removed.

=item 3. download everything else

The Packages and Sources files are scanned, to build up a list of all the
files they refer to. A few other miscellaneous files are added to the list.
Then the program makes sure that each file in the list is present on the 
local mirror and is up-to-date, using file size (and optionally md5sum) checks.
Any necessary files are downloaded.

=back

=cut

sub usage {
        warn join(" ", @_)."\n" if @_;
        warn <<EOF;
Usage: $0 mirrordir [--debug] [--progress] [--source|--nosource]
          [--md5sums|--nomd5sums] [--passive] [--host=remotehost]
          [--user=remoteusername] [--method=ftp|rsync] [--timeout=seconds]
          [--root=directory] [--dist=foo[,bar,..] ...]
          [--section=foo[,bar,..] ...] [--arch=foo[,bar,..] ...]
          [--cleanup|--nocleanup] [--skippackages] [--adddir=directory]
	  [--ignore=regex] [--getcontents] [--exclude=regex] [--include=regex]
	  [--help]

For details, see man page.
EOF
        exit(1);
}

=head1 OPTIONS

=over 4

=item mirrordir

This required parameter specifies where the local mirror directory is. If the
directory does not exist, it will be created. Be careful; telling this 
program that your home directory is the mirrordir is guarenteed to replace
your home directory with a Debian mirror!

=item --debug

Enables verbose debug output, including ftp protocol dump.

=item --progress -p

Displays progress bars as files are downloaded.

=item --source

Include source in the mirror (default).

=item --nosource

Do not include source.

=item --md5sums -m

Use md5sums to determine if files on the local mirror that are the correct
size actually have the correct content. Not enabled by default, because
it is too paranoid, and too slow.

=item --passive

Download in passive mode.

=item --host=remotehost -h

Specify the remote host to mirror from. Defaults to ftp.debian.org,
you are strongly encouraged to find a closer mirror.

=item --user=remoteusername -u

Specify the remote user name to use to log to the remote host. Helpful when
dealing with braindamaged proxy servers. Defaults to anonymous.

=item --method=ftp|rsync -e

Specify the method to download files. Currentrly, supported methods are
ftp or rsync. To connect a rsync server, you need to put ':' prefix in
the root directory  (i.e. ":debian", which means host::debian).

=item --timeout=seconds -t

Specifies the timeout to use for network operations (either FTP or rsync).
Set this to a higher value if you experience failed downloads. Defaults
to 300 seconds.

=item --root=directory -r directory

Specifies the directory on the remote host that is the root of the Debian
archive. Defaults to "/debian", which will work for most mirrors. The root
directory has a dists/ subdirectory.

=item --dist=foo[,bar,..] -d foo

Specify the distribution (version) of Debian to mirror. This switch may be
used multiple times, and multiple distributions may be specified at once,
separated by commas. Defaults to mirroring unstable.

=item --section=foo[,bar,..] -s foo

Specify the section of Debian to mirror. Defaults to main,contrib,non-free.

Note that to mirror the debian-installer's part of the archive, you can
use 'main/debian-installer'

=item --arch=foo[,bar,..] -a foo

Specify the architectures to mirror. The default is --arch=i386.

=item --cleanup

Do clean up any unknown files and directories on the local mirror 
(see step 2 above). On by default.

=item --nocleanup

Do not clean up the local mirror after mirroring is complete.

=item --ignore=regex

Never delete any files whose filenames match the regex. May be used multiple times.

=item --exclude=regex

Never download any files whose filenames match the regex. May be used multiple times.

=item --include=regex

Don't exclude any files whose filenames match the regex. May be used multiple times.

=item --skippackages

Don't re-download Packages and Sources files. Useful if you know they are
up-to-date.

=item --adddir directory

Also download Packages and Sources files from the specified directory on
the remote host (the directory is relative to the root of the Debian
archive). For example, "--adddir dists/proposed-updates" will mirror the
proposed-updates directory.

=item --getcontents

Download Contents.arch.gz files.

=item --help

Display a usage summary.

=back

=head1 EXAMPLES

 debmirror /mirror/debian

Simply make a mirror in /mirror/debian, using all defaults.

 debmirror /mirror/debian --ignore=non-US/
 debmirror /mirror/debian/non-US -h non-us.debian.org -r /debian-non-US \
 	-s non-US/main,non-US/contrib,non-US/non-free

Make one full mirror, and suppliment it with a mirror of non-US, in a
directory inside.

 debmirror -a i386,sparc -s main -h ftp.kernel.org \
 	-d unstable -d testing /home/me/debian/mirror --nosource \
	--progress

Make a mirror of i386 and sparc binaries, main only, and include both unstable
and testing versions of Debian. Download from ftp.kernel.org.

 debmirror -e rsync -r :debian /home/me/debian/mirror

Make a mirror using rsync. rsync server is ftp.debian.org::debian.

=cut

use strict;
use Cwd;
use Net::FTP;
use Getopt::Long;
use File::Temp qw/ tempfile /;
use LockFile::Simple;
use Compress::Zlib;

# Yeah, I use too many global variables in this program.
my ($debug, $progress, $check_md5sums, $passive, $skippackages,
    $getcontents);
my (@dists, @sections, @arches, @extra_dirs, @ignores, @excludes, @includes);
my $cleanup=1;
my $do_source=1;
my $host="ftp.debian.org";
my $user="anonymous";
my $remoteroot="/debian";
my $download_method="ftp";
my $timeout=300;

# This hash holds all the files we know about, If the hash key is false,
# the file already exists in the mirror (or is locally created) and does not
# need to be downloaded, if it is true the file needs to be downloaded.
# Filenames should be relative to $mirrordir.
my %files;

my $help;
GetOptions(
	'debug'		=> \$debug,
	'progress|p'	=> \$progress,
	'verbose'	=> \$progress,
	'source!'	=> \$do_source,
	'md5sums|m'	=> \$check_md5sums,
	'passive!'	=> \$passive,
	'host|h=s'	=> \$host,
	'user|u=s'	=> \$user,
	'root|r=s' 	=> \$remoteroot,
	'dist|d=s'	=> \@dists,
	'section|s=s'	=> \@sections,
	'arch|a=s'	=> \@arches,
	'adddir=s'	=> \@extra_dirs,
	'cleanup!'	=> \$cleanup,
	'ignore=s' 	=> \@ignores,
	'exclude=s' 	=> \@excludes,
        'include=s'     => \@includes,
	'skippackages'	=> \$skippackages,
	'getcontents'	=> \$getcontents,
	'method|e=s'	=> \$download_method,
	'timeout|t=s'   => \$timeout,
	'help' => \$help,
) or usage;
usage if $help;

# This parameter is so important that it is the only required parameter.
my $mirrordir=shift or usage("mirrordir not specified");

# Post-process arrays. Allow commas to seperate values the user entered.
# If the user entered nothing, provide defaults.
@dists=split(/,/,join(',',@dists));
@dists=qw(unstable) unless @dists;
@sections=split(/,/,join(',',@sections));
@sections=qw(main contrib non-free) unless @sections;
@arches=split(/,/,join(',',@arches));
@arches=qw(i386) unless @arches;

# Display configuration.
debug("Mirroring to $mirrordir from $download_method://$user:$host/$remoteroot/");
debug("Arches: ".join(",", @arches));
debug("Dists: ".join(",", @dists));
debug("Sections: ".join(",", @sections));
debug("Including source.") if $do_source;
debug("Passive mode on.") if $passive;
debug("Checking md5sums.") if $check_md5sums;
debug("Will NOT clean up.") unless $cleanup;

my $md5;
if ($check_md5sums) {
	eval q{use Digest::MD5; $md5=Digest::MD5->new;};
	$md5=Digest::MD5->new;
}

# Set up mirror directory and resolve $mirrordir to a full path for
# locking and rsync
make_dir($mirrordir);
chdir($mirrordir) or die "chdir $mirrordir: $!";
$mirrordir = cwd();

# Handle the lock file. This is the same method used by official
# Debian push mirrors.
my $hostname=`hostname -f 2>/dev/null || hostname`;
chomp $hostname;
my $lockfile="Archive-Update-in-Progress-$hostname";
$files{$lockfile}=0;
my $lockmgr = LockFile::Simple->make(-format => "%f/$lockfile",
					-max => 4300, -delay => 10, -nfs => 1,
					-autoclean => 1, -warn => 0);
my $lock = $lockmgr->lock("$mirrordir")
	or die "$lockfile exists; aborting\n";
$SIG{INT}=sub { $lock->release; exit 1 };
$SIG{TERM}=sub { $lock->release; exit 1 };

# Register the trace file.
my $tracefile="project/trace/$hostname";
$files{$tracefile}=0;

# Start up ftp.
my $ftp;
my %opts = (Debug => $debug, Passive => $passive, Timeout => $timeout);

my $rsynctempfile;
END { unlink $rsynctempfile if $rsynctempfile }

INIT: {
	$_ = $download_method;

	/^ftp$/ && do {
		$ftp=Net::FTP->new($host, %opts) or die "$@\n";
		$ftp->login($user) or die "login failed"; # anonymous
		$ftp->binary or die "could not set binary mode";
		$ftp->cwd($remoteroot) or die "cwd to $remoteroot failed";
		$ftp->hash(*STDOUT,10240) if $progress;
		last INIT;
	};

	/^rsync$/ && do {
		$remoteroot = "$host:$remoteroot";
		if (! ($user eq 'anonymous')) {
			$remoteroot = "$user\@$remoteroot";
		}
		last INIT;
	};

        usage("unknown download method: $_");

}

# Get Packages and Sources files and other miscellany.
my (@package_files, @source_files);
foreach my $dist (@dists) {
	foreach my $section (@sections) {
		foreach my $arch (@arches) {
			get_packages("dists/$dist/$section/binary-$arch");
		}
		get_sources("dists/$dist/$section/source");
	}
	if ($getcontents) {
		foreach my $arch (@arches) {
			next if $arch=~/source/;
			remote_get("dists/$dist/Contents-$arch.gz");
			$files{"dists/$dist/Contents-$arch.gz"}=0;
		}
	}
	remote_get("dists/$dist/Release");
	remote_get("dists/$dist/Release.gpg");
	$files{"dists/$dist/Release"}=0;
	$files{"dists/$dist/Release.gpg"}=0;
}
foreach (@extra_dirs) {
	get_packages($_);
	get_sources($_);
}

# Sanity check. I once nuked a mirror because of this..
if (@arches && ! @package_files) {
	die "Failed to download any Packages files!\n";
}
if ($do_source && ! @source_files) {
	die "Failed to download any Sources files!\n";
}

# Parse Packages and Sources files and add to the file list everything therein.
{
	local $/="\n\n";
	my ($filename, $size, $md5sum, $directory, $exclude, $include,
	    $architecture);

	my %arches = map { $_ => 1 } (@arches, "all");
	
	$exclude =  "(".join("|", @excludes).")" if @excludes;
	$include =  "(".join("|", @includes).")" if @includes;
	foreach my $file (@package_files) {
		my $gunzf = gzopen($file, "rb") or die "$file: $!";
		my $line;
		my $res;
		my $loop = 1;
		while ($loop) {
		        my $buf = "";
		        while(($res = $gunzf->gzreadline($line) > 0)
			      && !($line =~ /^$/)) {
			    $buf = $buf . $line;
			}
			if ($res <= 0) {
                                $loop = 0;
                                next;
                        }
			$_ = $buf;    
			($filename)=m/^Filename:\s+(.*)/im;
			($architecture)=m/^Architecture:\s+(.*)/im;
			next if (!$arches{$architecture});
			if(!(defined($include) && ($filename=~/$include/o))) {
			    next if (defined($exclude) && $filename=~/$exclude/o);
			}
			($size)=m/^Size:\s+(\d+)/im;
			($md5sum)=m/^MD5sum:\s+([A-Za-z0-9]+)/im
				if $check_md5sums;
			$files{$filename}=check_file($filename, $size, $md5sum);
		}
		$gunzf->gzclose();
	}
	foreach my $file (@source_files) {
		my $gunzf = gzopen($file, "rb") or die "$file: $!";
		my $line;
		my $res;
		my $loop = 1;
		while ($loop) {
		        my $buf = "";
		        while(($res = $gunzf->gzreadline($line) > 0)
			      && !($line =~ /^$/)) {
			    $buf = $buf . $line;
			}
			if ($res <= 0) {
                                $loop = 0;
                                next;
                        }
			$_ = $buf;    
			($directory) = m/^Directory:\s+(.*)/im;
			while (m/^ ([A-Za-z0-9]{32} .*)/mg) {
				($md5sum, $size, $filename)=split(' ', $1, 3);
				$filename="$directory/$filename";
			if(!(defined($include) && ($filename=~/$include/o))) {					    next if (defined($exclude) && $filename=~/$exclude/o);
				}
				$files{$filename}=check_file($filename, $size, $md5sum);
			}
		}
		$gunzf->gzclose();
	}
}

if ($cleanup) {
	my $ignore;
	$ignore = "(".join("|", @ignores).")" if @ignores;
	# Remove all files in the mirror that we don't know about
	foreach my $file (`find . -type f`) {
		chomp $file;
		$file=~s:^\./::;
		unless (exists $files{$file} or (defined($ignore) && $file=~/$ignore/o)) {
			debug("deleting $file");
			unlink $file or die "unlink $file: $!";
		}
	}
	# Remove all empty directories. Not done as part of main cleanup
	# to prevent race problems with pool download code, which
	# makes directories.. Sort so they are removable in bottom-up
	# order.
	system("find . -type d ! -name . ! -name .. | sort -r | xargs rmdir 2>/dev/null");
}

# Download all files that we need to get.
DOWNLOAD: {
	$_ = $download_method;
	# Ftp method
	/^ftp$/ && do {
		my $dirname;
		foreach my $file (sort keys %files) {
			if ($files{$file}) {
				($dirname) = $file =~ m:(.*)/:;
				make_dir($dirname);
				ftp_get($file);
			}
		}
		last DOWNLOAD;
	};

	# Rsync method
	/^rsync$/ && do {
		my $fh;
		($fh, $rsynctempfile) = tempfile();
		my $opt="";
		my @result;
		$opt = "--progress" if $progress;
		$opt = "$opt -v" if $debug;
		foreach my $file (sort keys %files) {
			if ($files{$file}) {
				my $dirname;
				my @dir;
				($dirname) = $file =~ m:(.*/):;
                                @dir= split(/\//, $dirname);
                                for (0..$#dir) {
					push (@result, "" . join('/', @dir[0..$_]) . "/");
                                }
				push (@result, "$file");
			}
		}
		if (@result) {
			@result = sort(@result);
        		my $prev = "not equal to $result[0]";
        		@result = grep($_ ne $prev && ($prev = $_, 1), @result);
			for (@result) {
				print $fh "$_\n";
			}
		}
		system ("rsync -az --timeout=$timeout $opt $remoteroot --include-from=$rsynctempfile --exclude='*' $mirrordir");
		close $fh;
		last DOWNLOAD;
	};

}


# Finish up. Write out trace file.
if ($download_method eq 'ftp') { $ftp->quit; }
make_dir("project/trace");
open OUT, ">$tracefile" or die "$tracefile: $!";
print OUT `date -u`;
close OUT;
$lock->release;
exit;

# Pass this function a filename, a file size (bytes), and a md5sum (hex).
# It will return true if the file needs to be downloaded.
sub check_file {
	my ($filename, $size, $md5sum)=@_;
	if (-f $filename and $size == -s _) {
		if ($check_md5sums) {
			open HANDLE, $filename or
				die "$filename: $!";
			$md5->addfile(*HANDLE);
			if ($md5sum eq $md5->hexdigest) {
				return 0;
			}
		}
		else {
			# Assume it is ok, w/o md5 check.
			return 0;
		}
	}
	return 1;
}


sub remote_get {
	my $file=shift;
	METHOD: {
		$_ = $download_method;
		/^ftp$/ && do {
			return ftp_get($file);
		};

		/^rsync$/ && do {
			return rsync_get($file);
		};
	}
}

# Get a file via ftp, first displaying its filename if progress is on.
# I should just be able to subclass Net::Ftp and override the get method,
# but it's late.
sub ftp_get {
	my $oldautoflush = $|;
	$| = 1;
	my $file=shift;
	print "$file: " if $progress;
	my $ret=$ftp->get($file, $file);
	if ($ret) {
		my $mtime=$ftp->mdtm($file);
		utime($mtime, $mtime, $file) if defined $mtime;
	}
	else {
		warn "$file failed\n";
	}
	$| = $oldautoflush;
	return $ret;
}

sub rsync_get {
	my $file=shift;
	my $opt="";
	(my $dirname) = $file =~ m:(.*/):;
	my @dir= split(/\//, $dirname);
	for (0..$#dir) {
		$opt = "$opt --include=" . join('/', @dir[0..$_]) . "/";
	}
	$opt = "$opt --progress" if $progress;
	$opt = "$opt -v" if $debug;
	print "$file: " if $progress;
	system ("rsync -az --timeout=$timeout $opt $remoteroot --include=$file --exclude='*' $mirrordir");
	return 1 if $? == 0;
}

# Get Packages file in the passed subdirectory.
sub get_packages {
	my $subdir=shift;
	make_dir($subdir);
	if ($skippackages) {
		push @package_files, "$subdir/Packages.gz";
	}
	else {
		remote_get("$subdir/Packages.gz") and
			push @package_files, "$subdir/Packages.gz";
		remote_get("$subdir/Release"); # optional
	}
	$files{"$subdir/Packages.gz"}=0;
	$files{"$subdir/Release"}=0;
}

# Get Sources file
sub get_sources {
	my $subdir=shift;
	
	if ($do_source) {
		make_dir($subdir);
		if ($skippackages) {
			push @source_files, "$subdir/Sources.gz";
		}
		else {
			remote_get("$subdir/Sources.gz") and
				push @source_files, "$subdir/Sources.gz";
		}
		$files{"$subdir/Sources.gz"}=0;
	}
}

# Make a directory including all needed parents.
{
	my %seen;
	
	sub make_dir {
		my $dir=shift;
		
		my @parts=split('/', $dir);
		my $current='';
		foreach my $part (@parts) {
			$current.="$part/";
			if (! $seen{$current}) {
				mkdir ($current, 0755);
				$seen{$current}=1;
			}
		}
	}
}

sub debug {
	print $0.': '.join(' ', @_)."\n" if $debug;
}

=head1 COPYRIGHT

This program is copyright 2001 by Joey Hess <joeyh@debian.org>, under
the terms of the GNU GPL.

The author disclaims any responsibility for any mangling of your system,
unexpected bandwidth usage bills, meltdown of the Debian mirror network, 
etc, that this script may cause. See NO WARRANTY section of GPL.

=head1 AUTHOR

Joey Hess <joeyh@debian.org>

=head1 MOTTO

Waste bandwith -- put a partial mirror on your laptop today!

=cut

