[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Re: Bug#605009: serious performance regression with ext4



I did some experimenting, and I figured out what was going on.  You're
right, (c) doesn't quite work, because delayed allocation meant that
the writeout didn't take place until the fsync() for each file
happened.  I didn't see this at first; my apologies.

However, this *does* work:

    extract(a);
    sync_file_range(fd.a, 0, 0, SYNC_FILE_RANGE_WRITE); 
    extract(b.dpkg-new);
    sync_file_range(fd.b, 0, 0, SYNC_FILE_RANGE_WRITE); 
    extract(c.dpkg-new);
    sync_file_range(fd.c, 0, 0, SYNC_FILE_RANGE_WRITE); 

    sync_file_range(fd.a, 0, 0, SYNC_FILE_RANGE_WAIT_BEFORE); 
    sync_file_range(fd.b, 0, 0, SYNC_FILE_RANGE_WAIT_BEFORE); 
    sync_file_range(fd.c, 0, 0, SYNC_FILE_RANGE_WAIT_BEFORE); 

    fdatasync(a);
    fdatasync(b.dpkg-new);
    fdatasync(c.dpkg-new);

    rename(b.dpkg-new, b);
    rename(c.dpkg-new, c);

This assumes that files b and c existed beforehand, but a is a new file.

What's going on here?  sync_file_range() is a Linux specific system
call that has been around for a while.  It allows program to control
when writeback happens in a very low-level fashion.  The first set of
sync_file_range() system calls causes the system to start writing back
each file once it has finished being extracted.  It doesn't actually
wait for the write to finish; it just starts the writeback.

The second series of sync_file_range() calls, with the operation
SYNC_FILE_RANGE_WAIT_BEFORE, will block until the previously initiated
writeback has completed.  This basically ensures that the delayed
allocation has been resolved; that is, the data blocks have been
allocated and written, and the inode updated (in memory), but not
necessarily pushed out to disk.

The fdatasync() call will actually force the inode to disk.  In the
case of the ext4 file system, the first fdatasync() will actually push
all of the inodes to disk, and all of the subsequent fdatasync() calls
are in fact no-ops (assuming that files 'a', 'b', and 'c' are all on
the same file system).  But what it means is that it minimizes the
number of (heavyweight) jbd2 commits to a minimum.

It uses a linux-specific system call --- sync_file_range --- but the
result should be faster performance across the board for all file
systems.  So I don't consider this an ext4-specific hack, although it
probably does makes things faster for ext4 more than any other file
system.

I've attached the program I used to test and prove this mechanism, as
well as the kernel tracepoint script I used to debug why (c) wasn't
working, which might be of interest to folks on debian-kernel.
Basically it's a demonstration of how cool ftrace is.  :-)

But using this program on a file system composed of a 5400rpm laptop
drive running LVM and LUKS, I get:

mass-sync-tester -d:	dpkg current: time:  0.83/ 0.01/ 0.00

versus

mass-sync-tester -n:	dpkg fixed: time:  0.07/ 0.00/ 0.01

	       	 	       	      	   - Ted

/*
 * Mass sync tester
 */

#define _GNU_SOURCE

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/resource.h>
#include <getopt.h>
#include <errno.h>
#include <string.h>

void write_file(const char *name, int sync, int sync_range)
{
	int	fd, i, ret;
	char	buf[1024];

	fd = open(name, O_WRONLY|O_TRUNC|O_CREAT, 0666);
	if (fd < 0) {
		fprintf(stderr, "open(%s) in write_file: %s\n",
			name, strerror(errno));
		exit(1);
	}
	memset(buf, 0, sizeof(buf));
	for (i=0; i < 16; i++) {
		ret = write(fd, buf, sizeof(buf));
		if (ret < 0) {
			fprintf(stderr, "writing %s: %s\n",
				name, strerror(errno));
			exit(1);
		}
	}
	if (sync) {
		ret = fsync(fd);
		if (ret < 0) {
			fprintf(stderr, "fsyncing %s in write_file: %s\n",
				name, strerror(errno));
			exit(1);
		}
	}
	if (sync_range) {
		ret = sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
		if (ret < 0) {
			fprintf(stderr, "sync_file_range %s in write_file: %s\n",
				name, strerror(errno));
			exit(1);
		}
	}
	ret = close(fd);
	if (ret < 0) {
		fprintf(stderr, "closing %s in write_file: %s\n",
			name, strerror(errno));
		exit(1);
	}
}

void rename_file(const char *src, const char *dest)
{
	int ret;

	ret = rename(src, dest);
	if (ret) {
		fprintf(stderr, "renaming %s to %s: %s\n", src, dest,
			strerror(errno));
		exit(1);
	}
}

void sync_file(const char *name)
{
	int	fd, i, ret;

	fd = open(name, O_RDONLY|O_NOATIME, 0666);
	if (fd < 0) {
		fprintf(stderr, "open(%s) in sync_file: %s\n",
			name, strerror(errno));
		exit(1);
	}
	ret = fsync(fd);
	if (ret < 0) {
		fprintf(stderr, "fsyncing %s in sync_file: %s\n",
			name, strerror(errno));
		exit(1);
	}
	ret = close(fd);
	if (ret < 0) {
		fprintf(stderr, "closing %s in sync_file: %s\n",
			name, strerror(errno));
		exit(1);
	}
}

void datasync_file(const char *name)
{
	int	fd, i, ret;

	fd = open(name, O_RDONLY|O_NOATIME, 0666);
	if (fd < 0) {
		fprintf(stderr, "open(%s) in datasync_file: %s\n",
			name, strerror(errno));
		exit(1);
	}
	ret = fdatasync(fd);
	if (ret < 0) {
		fprintf(stderr, "fsyncing %s in datasync_file: %s\n",
			name, strerror(errno));
		exit(1);
	}
	ret = close(fd);
	if (ret < 0) {
		fprintf(stderr, "closing %s in datasync_file: %s\n",
			name, strerror(errno));
		exit(1);
	}
}

void sync_range(const char *name)
{
	int	fd, i, ret;

	fd = open(name, O_RDONLY|O_NOATIME, 0666);
	if (fd < 0) {
		fprintf(stderr, "open(%s) in sync_range: %s\n",
			name, strerror(errno));
		exit(1);
	}
	ret = sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WAIT_BEFORE);
	if (ret < 0) {
		fprintf(stderr, "sync_file_range %s in sync_range: %s\n",
			name, strerror(errno));
		exit(1);
	}
	ret = close(fd);
	if (ret < 0) {
		fprintf(stderr, "closing %s in sync_range: %s\n",
			name, strerror(errno));
		exit(1);
	}
}

/*
 * Begin resource tracking code
 */

struct resource_track {
	struct timeval time_start;
	struct timeval user_start;
	struct timeval system_start;
};

void init_resource_track(struct resource_track *track)
{
	struct rusage r;

	gettimeofday(&track->time_start, 0);
	getrusage(RUSAGE_SELF, &r);
	track->user_start = r.ru_utime;
	track->system_start = r.ru_stime;
}

static inline float timeval_subtract(struct timeval *tv1,
				       struct timeval *tv2)
{
	return ((tv1->tv_sec - tv2->tv_sec) +
		((float) (tv1->tv_usec - tv2->tv_usec)) / 1000000);
}

void print_resource_track(const char *desc, struct resource_track *track)
{
	struct rusage r;
	struct timeval time_end;

	gettimeofday(&time_end, 0);

	if (desc)
		printf("%s: ", desc);

	getrusage(RUSAGE_SELF, &r);

	printf("time: %5.2f/%5.2f/%5.2f\n",
	       timeval_subtract(&time_end, &track->time_start),
	       timeval_subtract(&r.ru_utime, &track->user_start),
	       timeval_subtract(&r.ru_stime, &track->system_start));
}


/*
 * End resource tracking code
 */

const char *setup[] =
{ "exist_1", "exist_2", "exist_3", "exist_4", "exist_5",
  "exist_6", "exist_7", "exist_8", "exist_9", "exist_10", 0
};

const char *pkg_list[] =
{ "exist_1", "new_1", "exist_2", "exist_3", "new_2", "new_3",
  "exist_4", "exist_5", "exist_6", "new_4", "new_5", "new_6",
  "exist_7", "exist_8", "exist_9", "exist_10", "new_7", "new_8",
  "new_9", "new_10", 0
};

/*
 * Do the setup for a test
 */
void do_setup(const char *dirname)
{
	int ret, i;

	ret = mkdir(dirname, 0700);
	if (ret < 0) {
		fprintf(stderr, "mkdir(%s) in do_setup: %s\n",
			dirname, strerror(errno));
		exit(1);
	}
	ret = chdir(dirname);
	if (ret < 0) {
		fprintf(stderr, "chdir(%s) in do_setup: %s\n",
			dirname, strerror(errno));
		exit(1);
	}
	for (i=0; setup[i]; i++) {
		write_file(setup[i], 1, 0);
		sync_file(setup[i]);
	}
}

void do_cleanup(const char *dirname)
{
	char buf[1024];
	int ret;

	ret = chdir("..");
	if (ret < 0) {
		fprintf(stderr, "chdir(..) in do_cleanup: %s\n",
			dirname, strerror(errno));
		exit(1);
	}
	sprintf(buf, "/bin/rm -rf %s\n", dirname);
	ret = system(buf);
	if (ret < 0) {
		fprintf(stderr, "system clal in do_cleanup: %s\n",
			strerror(errno));
		exit(1);
	}
}


void do_dpkg_method()
{
	const char *dirname = "test_pkg_dir";
	struct resource_track rtrack;
	char buf[1024];
	int i;

	do_setup(dirname);
	init_resource_track(&rtrack);

	for (i=0; pkg_list[i]; i++) {
		sprintf(buf, "%s.dpkg-new", pkg_list[i]);
		write_file(buf, 0, 0);
	}
	for (i=0; pkg_list[i]; i++) {
		sprintf(buf, "%s.dpkg-new", pkg_list[i]);
		sync_file(buf);
		rename(buf, pkg_list[i]);
	}
	print_resource_track("dpkg current", &rtrack);
	do_cleanup(dirname);
}

void do_fixed_method()
{
	const char *dirname = "fixed_pkg_dir";
	struct resource_track rtrack;
	char buf[1024];
	int i;

	do_setup(dirname);
	init_resource_track(&rtrack);

	for (i=0; pkg_list[i]; i++) {
		sprintf(buf, "%s.dpkg-new", pkg_list[i]);
		write_file(buf, 0, 1);
	}
	for (i=0; pkg_list[i]; i++) {
		sprintf(buf, "%s.dpkg-new", pkg_list[i]);
		sync_range(buf);
	}
	for (i=0; pkg_list[i]; i++) {
		sprintf(buf, "%s.dpkg-new", pkg_list[i]);
		datasync_file(buf);
	}
	for (i=0; pkg_list[i]; i++) {
		sprintf(buf, "%s.dpkg-new", pkg_list[i]);
		rename(buf, pkg_list[i]);
	}
	print_resource_track("dpkg fixed", &rtrack);
	do_cleanup(dirname);
}

main(int argc, char **argv)
{
	int	opt;
	int	d_flag = 0, n_flag = 0;

	while ((opt = getopt(argc, argv, "dn")) != -1) {
		switch(opt) {
		case 'd':
			d_flag = 1;
			break;
		case 'n':
			n_flag = 1;
			break;
		default:
			fprintf(stderr, "Bad option: %c\n", opt);
			exit(1);
		}
	}
	if (d_flag)
		do_dpkg_method();
	if (n_flag)
		do_fixed_method();
}
#!/bin/sh
cd /sys/kernel/debug/tracing
echo blk > current_tracer
echo 1 > /sys/block/dm-5/trace/enable
echo 1 > events/ext4/ext4_sync_file/enable
echo 1 > events/ext4/ext4_da_writepages/enable
echo 1 > events/ext4/ext4_mark_inode_dirty/enable
echo 1 > events/jbd2/jbd2_run_stats/enable
echo 1 > events/jbd2/jbd2_start_commit/enable
echo 1 > events/jbd2/jbd2_end_commit/enable
(cd /kbuild; /home/tytso/src/mass-sync-tester -n)
cat trace > /tmp/trace
echo 0 > events/jbd2/jbd2_start_commit/enable
echo 0 > events/jbd2/jbd2_end_commit/enable
echo 0 > events/jbd2/jbd2_run_stats/enable
echo 0 > events/ext4/ext4_sync_file/enable
echo 0 > events/ext4/ext4_da_writepages/enable
echo 0 > events/ext4/ext4_mark_inode_dirty/enable
echo 0 > /sys/block/dm-5/trace/enable
echo nop > current_tracer

Reply to: