[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Re: rsync and debian -- summary of issues



On Fri, 2002-04-12 at 01:47, Martin Pool wrote:
> On 11 Apr 2002, Robert Tiberius Johnson <rtjohnso@cs.berkeley.edu> wrote:
> > Also, in section 3.15, you say, "Fetching a series of deltas which
> > update the same area is less efficient than calculating the deltas
> > directly."  This is true, but by my computations, the number of packages
which change more than once in a month is 300, and these require about
30K of wasted bandwidth. 
> 
> Sorry, I don't understand what the 30kB number relates to.  Is that
> only for the updates to the Packages file?

Yep.  Sorry that wasn't clear before.

I implemented a local version of Goswin Brederlow's proposal to use the
reverse rsync algorithm over HTTP Range requests.  The code is included
below: please look it over to make sure I got the protocol right.

Here's what I found by running the program on some Packages[.gz] files:

Using gzip -9 --rsyncable Packages files from unstable main:
From	To	blocksize	diskspace	bandwidth
---------------------------------------------------------
Apr 6	Apr 7	128		118K		882K
Apr 6	Apr 7	256		59K		841K
Apr 6	Apr 7	512		29K		839K
Apr 6	Apr 7	1024		15K		875K
Apr 6	Apr 11	128		118K		1900K
Apr 6	Apr 11	256		59K		1853K
Apr 6	Apr 11	512		29K		1839K
Apr 6	Apr 11	1024		15K		1845K

Using uncompressed Packages files from unstable main:
From	To	blocksize	diskspace	bandwidth
---------------------------------------------------------
Apr 6	Apr 7	256		212K		323K
Apr 6	Apr 7	512		106K		270K
Apr 6	Apr 7	1024		53K		292K
Apr 6	Apr 11	128		429K		898K
Apr 6	Apr 11	256		215K		860K
Apr 6	Apr 11	512		107K		985K

For comparison, 5 days of bzip2 -9 compressed diffs take about 60K of
disk space, and updating after one day uses about 13K of bandwidth. 
Updating after 5 days uses 65K of bandwidth.

Best,
Rob

/* Mock-up version of the "checksum file" file synching protocol.
 * 
 * compile with "gcc gen_csums.c -o gen_csums -lcrypto"
 * (MD5 lives in libcrypto; list the library after the source file)
 * 
 * Usage: gen_csums <bsize> <csize> <oldfile> <newfile>
 *
 * using a checksum size (csize) of < 8 can cause collisions,
 * which will ruin file reconstruction.  This implementation only supports
 * csize of <= 8 to reduce memory usage.  So always use csize=8.
 *
 * <oldfile> is the file the client has.  <newfile> is the file
 * the server has.  The program produces the file of checksums (newfile.csums),
 * a file containing the extra data the client had to fetch from the server
 * (oldfile.data), and the result of patching oldfile using the
 * checksums and fetched data (oldfile.merged).
 *
 * It prints out the blocksize, the amount of data that had to
 * be transferred from the server, and whether reconstruction was 
 * successful.
 */


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <assert.h>
#include <openssl/md5.h>

/* Generate the checksum file for <ifilename> and write it to <cfilename>.
 *
 * Checksum-file layout (host byte order, int-sized fields -- this is a
 * local mock-up, not a portable wire format):
 *   - length of the input file (int)
 *   - block size (bsize)
 *   - truncated per-block checksum size (csize)
 *   - MD5 of the whole file (MD5_DIGEST_LENGTH bytes)
 *   - first csize bytes of the MD5 of each bsize-byte block
 *
 * The final block may be shorter than bsize; it is hashed over only the
 * bytes that exist (the original hashed a full bsize, reading past the
 * end of the buffer when the file size was not a multiple of bsize).
 */
void gen_csums (int bsize, int csize, char* ifilename, char* cfilename)
{
	int ifile, cfile;
	struct stat s;
	int flen;

	char* buf;
	unsigned char csumbuf[MD5_DIGEST_LENGTH];

	long i;

	ifile = open (ifilename, O_RDONLY);
	cfile = open (cfilename, O_WRONLY | O_CREAT | O_TRUNC, S_IREAD | S_IWRITE);
	if (ifile < 0 || cfile < 0)
	{
		perror ("gen_csums: open");
		exit (1);
	}
	fstat (ifile, &s);
	buf = malloc (s.st_size ? s.st_size : 1);
	if (buf == NULL)
	{
		perror ("gen_csums: malloc");
		exit (1);
	}
	read (ifile, buf, s.st_size);
	close (ifile);

	/* Write an int-sized length: merge_csums reads this header field
	 * into an int, so writing sizeof(off_t) bytes (as the original
	 * did) desynchronizes the header on platforms with 64-bit off_t. */
	flen = (int) s.st_size;
	write (cfile, &flen, sizeof (flen));
	write (cfile, &bsize, sizeof (bsize));
	write (cfile, &csize, sizeof (csize));
	MD5 ((unsigned char*) buf, s.st_size, csumbuf);
	write (cfile, csumbuf, sizeof (csumbuf));

	for (i = 0; i < s.st_size; i += bsize)
	{
		/* Clamp the last block so MD5 never reads past end of buf. */
		size_t blen = (s.st_size - i < bsize) ? (size_t)(s.st_size - i)
						      : (size_t) bsize;
		MD5 ((unsigned char*) (buf + i), blen, csumbuf);
		write (cfile, csumbuf, csize);
	}

	free (buf);
	close (cfile);
}

/* Merging code */

/* One candidate block from the client's old file: the byte offset where
 * the block starts and the truncated MD5 of the bytes there. */
struct block_info
{
	int offset;	/* byte offset of this block within the old file */
	char csum[8];	/* first csize bytes of the block's MD5 digest */
};

/* Truncated-checksum length in bytes; set from the checksum-file header
 * in merge_csums.  File scope because block_info_cmp needs it. */
int csize = 0;

/* Used for sorting the checksums, and doing binary search
 * on checksums */
int block_info_cmp (void* a, void* b)
{
	struct block_info* p1 = (struct block_info*)a;
	struct block_info* p2 = (struct block_info*)b;
	int r = memcmp (p1->csum, p2->csum, csize);
	return r;
}

/* Given:
 * 		cfilename - the checksum file
 * 		pfilename - the new file on the server
 * 		ifilename - our old file
 * 		ofilename - the merged file to be created
 * 		dfilename - the file extra data is to be logged in
 *
 * use the checksums to fill in as much of the merged file as possible.
 * Then use pfilename to fill in the rest.  At the end, dfilename will
 * contain a copy of all the extra data the client had to pull from
 * pfilename to fill in the holes (i.e. the bytes an HTTP Range request
 * would have to fetch).
 *
 * Returns 0 when the reconstructed file's MD5 matches the whole-file
 * checksum in the header, non-zero otherwise.
 */
int merge_csums (char* cfilename, char* pfilename, 
				  char* ifilename, char* ofilename,
				  char* dfilename)
{
	int bsize;

	int cfile, pfile, ifile, ofile, dfile;
	int ofilelen;
	long inblocks;
	struct stat s;

	char* buf;
	char* foundmap;
	char tc;
	struct block_info csum;
	struct block_info* csums;
	struct block_info* match;
	unsigned char totalcsum[MD5_DIGEST_LENGTH];
	unsigned char csumbuf[MD5_DIGEST_LENGTH];

	long i, j;

	cfile = open (cfilename, O_RDONLY);
	pfile = open (pfilename, O_RDONLY);
	ifile = open (ifilename, O_RDONLY);
	ofile = open (ofilename, O_WRONLY | O_CREAT | O_TRUNC, S_IREAD | S_IWRITE);
	dfile = open (dfilename, O_WRONLY | O_CREAT | O_TRUNC, S_IREAD | S_IWRITE);
	if (cfile < 0 || pfile < 0 || ifile < 0 || ofile < 0 || dfile < 0)
	{
		perror ("merge_csums: open");
		exit (1);
	}
	fstat (ifile, &s);

	/* Checksum-file header: new-file length, block size, truncated
	 * checksum size (stored in the file-scope csize that the
	 * comparator uses), and the server's whole-file MD5. */
	read (cfile, &ofilelen, sizeof (ofilelen));
	read (cfile, &bsize, sizeof (bsize));
	read (cfile, &csize, sizeof (csize));
	read (cfile, totalcsum, sizeof (totalcsum));

	/* One candidate checksum per byte offset (a sliding window of
	 * full blocks).  An old file smaller than one block contributes
	 * no candidates; the original computed a negative count here and
	 * handed it to malloc. */
	inblocks = (long) s.st_size - bsize + 1;
	if (inblocks < 0)
		inblocks = 0;

	foundmap = calloc (ofilelen > 0 ? ofilelen : 1, 1);

	/* Hash every bsize-byte window of the old file, then sort by
	 * hash so the lookup loop below can use bsearch. */
	csums = malloc ((inblocks > 0 ? inblocks : 1) * sizeof (struct block_info));
	buf = malloc (s.st_size ? s.st_size : 1);
	if (foundmap == NULL || csums == NULL || buf == NULL)
	{
		perror ("merge_csums: malloc");
		exit (1);
	}
	read (ifile, buf, s.st_size);
	for (i = 0; i < inblocks; i++)
	{
		csums[i].offset = (int) i;
		MD5 ((unsigned char*) (buf + i), bsize, csumbuf);
		memcpy (csums[i].csum, csumbuf, csize);
	}
	qsort (csums, inblocks, sizeof (struct block_info), block_info_cmp);

	/* For each csum in the cfile, look for it in our list of checksums */
	j = 0;
	while (read (cfile, csum.csum, csize) == csize)
	{
		/* The new file's final block may be shorter than bsize:
		 * clamp so we never write past ofilelen (the original
		 * memset a full bsize into foundmap here, a heap
		 * overflow whenever ofilelen % bsize != 0). */
		long blen = (long) ofilelen - j * bsize;
		if (blen > bsize)
			blen = bsize;

		match = bsearch (&csum, csums, inblocks, 
						  sizeof (struct block_info),
						  block_info_cmp);
		if (match != NULL && blen > 0)
		{
			/* If we find a match, copy the corresponding 
			 * position from our old file to the proper place
			 * in our new file.  Also, mark this range of
			 * bytes as "found". */
			lseek (ofile, j * bsize, SEEK_SET);
			write (ofile, buf + match->offset, blen);
			memset (foundmap + j * bsize, 1, blen);
		}
		
		j++;
	}

	/* Now find any holes in our output, and use the peekfile (pfile)
	 * to fill them in. These are the bytes we'd have to fetch from the
	 * server using http range requests.  Store them in the data file for 
	 * later reference, too. */
	for (i = 0; i < ofilelen; i++)
	{
		if (foundmap[i] == 0)
		{
			lseek (pfile, i, SEEK_SET);
			lseek (ofile, i, SEEK_SET);
			read (pfile, &tc, sizeof (tc));
			write (ofile, &tc, sizeof (tc));
			write (dfile, &tc, sizeof (tc));
		}
	}

	free (foundmap);
	free (csums);
	free (buf);
	
	close (cfile);
	close (pfile);
	close (ifile);
	close (ofile);
	close (dfile);

	/* Now checksum our newly reconstructed file and compare it
	 * to the server's checksum */
	ofile = open (ofilename, O_RDONLY);
	buf = malloc (ofilelen > 0 ? ofilelen : 1);
	if (buf == NULL)
	{
		perror ("merge_csums: malloc");
		exit (1);
	}
	read (ofile, buf, ofilelen);
	close (ofile);
	MD5 ((unsigned char*) buf, ofilelen, csumbuf);
	free (buf);

	return memcmp (csumbuf, totalcsum, MD5_DIGEST_LENGTH);
}

int main (int argc, char** argv)
{
	int bsize;
	int csize;	/* shadows the file-scope csize; passed to gen_csums explicitly */
	char* oldfilename;
	char* newfilename;
	char csumsfilename[1024];
	char datafilename[1024];
	char mergedfilename[1024];
	
	struct stat csums;
	struct stat data;
	
	int failed = 0;

	/* Validate argc before touching argv (the original dereferenced
	 * argv[1..4] unconditionally). */
	if (argc != 5)
	{
		fprintf (stderr, "Usage: %s <bsize> <csize> <oldfile> <newfile>\n",
				 argv[0]);
		return 1;
	}

	bsize = atoi (argv[1]);
	csize = atoi (argv[2]);
	oldfilename = argv[3];
	newfilename = argv[4];

	assert (bsize > 0);
	assert (csize > 0 && csize <= 8);
	
	/* snprintf: a long file name must not overflow the 1024-byte
	 * buffers (the original used sprintf). */
	snprintf (csumsfilename, sizeof (csumsfilename), "%s.csums", newfilename);
	snprintf (datafilename, sizeof (datafilename), "%s.data", oldfilename);
	snprintf (mergedfilename, sizeof (mergedfilename), "%s.merged", oldfilename);

	gen_csums (bsize, csize, newfilename, csumsfilename);
	failed = merge_csums (csumsfilename, newfilename, 
						   oldfilename, mergedfilename,
						   datafilename);

	stat (csumsfilename, &csums);
	stat (datafilename, &data);

	/* Estimate the total bandwidth as:
	 * csums file size + data file size.  st_size is off_t, so cast to
	 * long to match the printf format. */
	printf ("%d\t%ld\t%ld\t %s\n", bsize, (long) csums.st_size,
			(long) (csums.st_size + data.st_size),
			failed ? "failed" : "succeeded");

	return failed ? 1 : 0;
}


-- 
To UNSUBSCRIBE, email to debian-devel-request@lists.debian.org
with a subject of "unsubscribe". Trouble? Contact listmaster@lists.debian.org



Reply to: