Re: rsync and debian -- summary of issues
On Fri, 2002-04-12 at 01:47, Martin Pool wrote:
> On 11 Apr 2002, Robert Tiberius Johnson <rtjohnso@cs.berkeley.edu> wrote:
> > Also, in section 3.15, you say, "Fetching a series of deltas which
> > update the same area is less efficient than calculating the deltas
> > directly." This is true, but by my computations, the number of packages
> > which change more than once in a month is 300, which requires about 30K
> > of wasted bandwidth.
>
> Sorry, I don't understand what the 30kB number relates to. Is that
> only for the updates to the Packages file?
Yep. Sorry that wasn't clear before.
I implemented a local version of Goswin Brederlow's proposal to use the
reverse rsync algorithm over HTTP Range requests. The code is included
below: please look it over to make sure I got the protocol right.
Here's what I found by running the program on some Packages[.gz] files:
Using gzip -9 --rsyncable Packages files from unstable main:
From To blocksize diskspace bandwidth
---------------------------------------------------------
Apr 6 Apr 7 128 118K 882K
Apr 6 Apr 7 256 59K 841K
Apr 6 Apr 7 512 29K 839K
Apr 6 Apr 7 1024 15K 875K
Apr 6 Apr 11 128 118K 1900K
Apr 6 Apr 11 256 59K 1853K
Apr 6 Apr 11 512 29K 1839K
Apr 6 Apr 11 1024 15K 1845K
Using uncompressed Packages files from unstable main:
From To blocksize diskspace bandwidth
---------------------------------------------------------
Apr 6 Apr 7 256 212K 323K
Apr 6 Apr 7 512 106K 270K
Apr 6 Apr 7 1024 53K 292K
Apr 6 Apr 11 128 429K 898K
Apr 6 Apr 11 256 215K 860K
Apr 6 Apr 11 512 107K 985K
For comparison, 5 days of bzip2 -9 compressed diffs take about 60K of
disk space, and updating after one day uses about 13K of bandwidth.
Updating after 5 days uses 65K of bandwidth.
Best,
Rob
/* Mock-up version of the "checksum file" file synching protocol.
*
* compile with "gcc -lssl gen_csums.c -o gen_csums"
*
* Usage: gen_csums <bsize> <csize> <oldfile> <newfile>
*
* Using a checksum size (csize) of less than 8 bytes can cause collisions,
* which will ruin file reconstruction. This implementation only supports
* csize <= 8 to reduce memory usage, so always use csize = 8.
*
* <oldfile> is the file the client has. <newfile> is the file
* the server has. The program produces the file of checksums (newfile.csums),
* a file containing the extra data the client had to fetch from the server
* (oldfile.data), and the result of patching oldfile using the
* checksums and fetched data (oldfile.merged).
*
* It prints out the blocksize, the amount of data that had to
* be transferred from the server, and whether reconstruction was
* successful.
*/
#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include <openssl/md5.h>
/* Generate the checksum file for ifilename.
 *
 * Checksum-file layout: input-file length, block size (bsize), checksum
 * size (csize), whole-file MD5, then one csize-byte truncated MD5 per
 * bsize-byte block of the input.
 *
 * bsize     - block size in bytes
 * csize     - number of MD5 bytes kept per block (must be <= MD5_DIGEST_LENGTH)
 * ifilename - input file to checksum (the server's new file)
 * cfilename - checksum file to create
 */
void gen_csums (int bsize, int csize, char* ifilename, char* cfilename)
{
    int ifile, cfile;
    struct stat s;
    char* buf;
    unsigned char csumbuf[MD5_DIGEST_LENGTH];
    int i;
    int len;

    ifile = open (ifilename, O_RDONLY);
    cfile = open (cfilename, O_WRONLY | O_CREAT | O_TRUNC, S_IREAD | S_IWRITE);
    fstat (ifile, &s);
    buf = malloc (s.st_size);
    read (ifile, buf, s.st_size);
    close (ifile);

    /* Header: length, block size, checksum size, and a whole-file MD5
     * the client uses to verify its reconstruction. */
    write (cfile, &s.st_size, sizeof (s.st_size));
    write (cfile, &bsize, sizeof (bsize));
    write (cfile, &csize, sizeof (csize));
    MD5 ((unsigned char*) buf, s.st_size, csumbuf);
    write (cfile, csumbuf, sizeof (csumbuf));

    /* One truncated MD5 per block. Clamp the final block to the bytes
     * that actually exist: the original hashed a full bsize starting at
     * the last offset and read past the end of buf whenever the file
     * size was not a multiple of bsize. */
    for (i = 0; i < s.st_size; i += bsize)
    {
        len = (s.st_size - i < bsize) ? (int)(s.st_size - i) : bsize;
        MD5 ((unsigned char*) buf + i, len, csumbuf);
        write (cfile, csumbuf, csize);
    }
    free (buf);
    close (cfile);
}
/* Merging code */

struct block_info
{
    int offset;   /* byte offset of this block in the client's old file */
    char csum[8]; /* truncated MD5 of the block (first csize bytes used) */
};

/* Number of checksum bytes actually compared; set in merge_csums() from
 * the checksum-file header. */
int csize = 0;

/* Comparator for sorting checksums with qsort() and doing binary search
 * with bsearch(): orders block_info records by the first csize bytes of
 * the checksum. The parameters are const void* so the function-pointer
 * type matches what qsort()/bsearch() require; the original's
 * (void*, void*) signature was an incompatible pointer type (undefined
 * behavior, and a hard error under newer C standards). */
int block_info_cmp (const void* a, const void* b)
{
    const struct block_info* p1 = (const struct block_info*)a;
    const struct block_info* p2 = (const struct block_info*)b;
    return memcmp (p1->csum, p2->csum, csize);
}
/* Given:
 * cfilename - the checksum file
 * pfilename - the new file on the server
 * ifilename - our old file
 * ofilename - the merged file to be created
 * dfilename - the file extra data is to be logged in
 *
 * Use the checksums to fill in as much of the merged file as possible,
 * then use pfilename to fill in the rest. At the end, dfilename
 * contains a copy of all the extra data the client had to pull from
 * pfilename to fill in the holes.
 *
 * Returns 0 if the reconstructed file's MD5 matches the server's
 * whole-file checksum, non-zero otherwise.
 */
int merge_csums (char* cfilename, char* pfilename,
                 char* ifilename, char* ofilename,
                 char* dfilename)
{
    int bsize;
    int cfile, pfile, ifile, ofile, dfile;
    int ofilelen;
    int inblocks;
    int blocklen;
    struct stat s;
    char* buf;
    char* foundmap;
    char tc;
    struct block_info csum;
    struct block_info* csums;
    struct block_info* match;
    unsigned char totalcsum[MD5_DIGEST_LENGTH];
    unsigned char csumbuf[MD5_DIGEST_LENGTH];
    int i, j;

    cfile = open (cfilename, O_RDONLY);
    pfile = open (pfilename, O_RDONLY);
    ifile = open (ifilename, O_RDONLY);
    ofile = open (ofilename, O_WRONLY | O_CREAT | O_TRUNC, S_IREAD | S_IWRITE);
    dfile = open (dfilename, O_WRONLY | O_CREAT | O_TRUNC, S_IREAD | S_IWRITE);
    fstat (ifile, &s);

    /* Checksum-file header: new-file length, block size, checksum size,
     * whole-file MD5 (see gen_csums). Note this sets the global csize
     * used by block_info_cmp. */
    read (cfile, &ofilelen, sizeof (ofilelen));
    read (cfile, &bsize, sizeof (bsize));
    read (cfile, &csize, sizeof (csize));
    read (cfile, totalcsum, sizeof (totalcsum));

    /* Number of candidate block offsets in our old file. If the old
     * file is shorter than one block there are none; the original
     * computed a negative count here and handed it to malloc/qsort. */
    inblocks = (s.st_size >= bsize) ? (int)(s.st_size - bsize + 1) : 0;

    foundmap = malloc (ofilelen);
    memset (foundmap, 0, ofilelen);

    /* Hash every block-sized window of our local file, then sort the
     * (offset, checksum) records by checksum for binary search. */
    csums = malloc (inblocks * sizeof (struct block_info));
    buf = malloc (s.st_size);
    read (ifile, buf, s.st_size);
    for (i = 0; i < inblocks; i++)
    {
        csums[i].offset = i;
        MD5 ((unsigned char*) buf + i, bsize, csumbuf);
        memcpy (csums[i].csum, csumbuf, csize);
    }
    qsort (csums, inblocks, sizeof (struct block_info), block_info_cmp);

    /* For each csum in the cfile, look for it in our sorted list. */
    j = 0;
    while ((i = read (cfile, csum.csum, csize)) == csize)
    {
        if (j * bsize >= ofilelen)
            break; /* more checksums than the new file has blocks */
        /* The final block of the new file may be shorter than bsize;
         * clamp so we never write past ofilelen. The original memset
         * and write overran foundmap (heap overflow) and ofile whenever
         * ofilelen was not a multiple of bsize. */
        blocklen = (ofilelen - j * bsize < bsize) ? ofilelen - j * bsize : bsize;
        match = bsearch (&csum, csums, inblocks,
                         sizeof (struct block_info),
                         block_info_cmp);
        if (match != NULL && blocklen == bsize)
        {
            /* Copy the matching block from our old file into place and
             * mark that byte range as "found". A short final block is
             * never taken from a match (local checksums cover full
             * blocks only); it is fetched from the server below. */
            lseek (ofile, j * bsize, SEEK_SET);
            write (ofile, buf + match->offset, blocklen);
            memset (foundmap + j * bsize, 1, blocklen);
        }
        j++;
    }

    /* Now find any holes in our output, and use the peekfile (pfile)
     * to fill them in, one byte at a time. These are the bytes we'd
     * have to fetch from the server using HTTP Range requests; log them
     * to the data file for later reference. */
    for (i = 0; i < ofilelen; i++)
    {
        if (foundmap[i] == 0)
        {
            lseek (pfile, i, SEEK_SET);
            lseek (ofile, i, SEEK_SET);
            read (pfile, &tc, sizeof (tc));
            write (ofile, &tc, sizeof (tc));
            write (dfile, &tc, sizeof (tc));
        }
    }

    free (foundmap);
    free (csums);
    free (buf);
    close (cfile);
    close (pfile);
    close (ifile);
    close (ofile);
    close (dfile);

    /* Re-checksum the reconstructed file and compare it to the server's
     * whole-file checksum. */
    ofile = open (ofilename, O_RDONLY);
    buf = malloc (ofilelen);
    read (ofile, buf, ofilelen);
    close (ofile);
    MD5 ((unsigned char*) buf, ofilelen, csumbuf);
    free (buf);
    return memcmp (csumbuf, totalcsum, MD5_DIGEST_LENGTH);
}
/* Driver: generate checksums for <newfile>, reconstruct it from
 * <oldfile>, and print blocksize, checksum-file size, estimated total
 * bandwidth (csums + fetched data), and whether reconstruction matched
 * the server's whole-file checksum.
 *
 * Usage: gen_csums <bsize> <csize> <oldfile> <newfile>
 */
int main (int argc, char** argv)
{
    int bsize, csize;
    char* oldfilename;
    char* newfilename;
    char csumsfilename[1024];
    char datafilename[1024];
    char mergedfilename[1024];
    struct stat csums;
    struct stat data;
    int failed = 0;

    /* The original dereferenced argv[1..4] without checking argc. */
    if (argc != 5)
    {
        fprintf (stderr, "Usage: %s <bsize> <csize> <oldfile> <newfile>\n",
                 argv[0]);
        return 1;
    }
    bsize = atoi (argv[1]);
    csize = atoi (argv[2]);
    oldfilename = argv[3];
    newfilename = argv[4];
    /* block_info.csum holds at most 8 bytes; zero/negative csize would
     * break the protocol entirely. */
    assert (csize > 0 && csize <= 8);

    /* Bounded formatting: the file names come straight from the command
     * line, so the original's sprintf could overflow these buffers. */
    snprintf (csumsfilename, sizeof (csumsfilename), "%s.csums", newfilename);
    snprintf (datafilename, sizeof (datafilename), "%s.data", oldfilename);
    snprintf (mergedfilename, sizeof (mergedfilename), "%s.merged", oldfilename);

    gen_csums (bsize, csize, newfilename, csumsfilename);
    failed = merge_csums (csumsfilename, newfilename,
                          oldfilename, mergedfilename,
                          datafilename);
    stat (csumsfilename, &csums);
    stat (datafilename, &data);

    /* Estimate the total bandwidth as:
     * csums file size + data file size.
     * st_size is off_t, so cast for printf — the original's %d was
     * undefined behavior where off_t is 64-bit. */
    printf ("%d\t%ld\t%ld\t %s\n", bsize, (long) csums.st_size,
            (long) (csums.st_size + data.st_size),
            failed ? "failed" : "succeeded");
    return 0;
}
--
To UNSUBSCRIBE, email to debian-devel-request@lists.debian.org
with a subject of "unsubscribe". Trouble? Contact listmaster@lists.debian.org
Reply to: