[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Re: libc-i686



Hi Jeff,

> > > > Any chance on a libc-p4?
> > > Any additional optimisations need to have benchmarks proving that
> > > they're worth the hassle.  Every one that we add carries the cost of
> > > additional troubleshooting, etc.
> > > Do you have data that suggests that there's a notable improvement to be
> > > had?
> > For memcpy I found an 8,4% speed improvement when copying 100MB using
> > all 4 execution pipelines instead of 2 currently used.
> 
> Sorry about the lag, I'm catching up on emails from when I was away.  Is
> that 8,4% over libc6-i686 or over the regular libc6?

I did not use libc6 itself; I pulled the 'WORD_COPY_FWD' macro out of glibc.
See attached source.


Folkert van Heusden

-- 
Temperature outside: (unavailable: the server refused access to /temp.php), temperature living room: 24.7
----------------------------------------------------------------------
Phone: +31-6-41278122, PGP-key: 1F28D8AE, www.vanheusden.com
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <sys/time.h>
#include <sys/resource.h>

/*
 * org_WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes)
 *
 * Baseline forward copy used as the reference in the benchmark.  Copies
 * NBYTES rounded down to a multiple of 32 from SRC_BP to DST_BP, 32 bytes
 * per loop iteration, shuttling the data through %eax/%edx only.
 *
 *   dst_bp       lvalue pointer to the destination; on exit it points just
 *                past the last byte written.
 *   src_bp       lvalue pointer to the source; advanced the same way.
 *   nbytes_left  lvalue that receives the number of bytes NOT copied
 *                (the remainder below 32, or NBYTES itself if NBYTES < 32).
 *   nbytes       byte count.  The loop is controlled by js/jns, so the
 *                count is interpreted as signed: a value with the top bit
 *                set copies nothing.
 *
 * The loads from the destination ("alloc dest line") only touch the
 * destination before it is written; their values are discarded.
 *
 * "memory" is listed as clobbered because the asm stores through %0, and
 * lea/sub/add carry no size suffix so the operand registers decide the
 * width (the encoding on i386 is unchanged).
 */
#define org_WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes)          \
  do                                                                    \
    {                                                                   \
      __asm__ __volatile__ (                                            \
                    "sub        $32,%2\n"                               \
                    "js         2f\n"                                   \
                    "movl       0(%0),%%edx\n"  /* alloc dest line */   \
                    "1:\n"                                              \
                    "movl       28(%0),%%eax\n" /* alloc dest line */   \
                    "sub        $32,%2\n"       /* decr loop count */   \
                    "movl       0(%1),%%eax\n"  /* U pipe */            \
                    "movl       4(%1),%%edx\n"  /* V pipe */            \
                    "movl       %%eax,0(%0)\n"  /* U pipe */            \
                    "movl       %%edx,4(%0)\n"  /* V pipe */            \
                    "movl       8(%1),%%eax\n"                          \
                    "movl       12(%1),%%edx\n"                         \
                    "movl       %%eax,8(%0)\n"                          \
                    "movl       %%edx,12(%0)\n"                         \
                    "movl       16(%1),%%eax\n"                         \
                    "movl       20(%1),%%edx\n"                         \
                    "movl       %%eax,16(%0)\n"                         \
                    "movl       %%edx,20(%0)\n"                         \
                    "movl       24(%1),%%eax\n"                         \
                    "movl       28(%1),%%edx\n"                         \
                    "movl       %%eax,24(%0)\n"                         \
                    "movl       %%edx,28(%0)\n"                         \
                    "lea        32(%1),%1\n"    /* update src ptr */    \
                    "lea        32(%0),%0\n"    /* update dst ptr */    \
                    "jns        1b\n"           /* flags from the sub */ \
                    "2: add     $32,%2" :                               \
                    "=r" (dst_bp), "=r" (src_bp), "=r" (nbytes_left) :  \
                    "0" (dst_bp), "1" (src_bp), "2" (nbytes) :          \
                    "ax", "dx", "memory", "cc");                        \
    } while (0)

/*
 * WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes)
 *
 * Experimental variant of the baseline copy loop: each 32-byte block is
 * moved as two groups of four loads followed by four stores, using
 * %eax, %edx, %ebx and %ecx as scratch registers.
 *
 *   dst_bp       lvalue pointer to the destination; on exit it points just
 *                past the last byte written.
 *   src_bp       lvalue pointer to the source; advanced the same way.
 *   nbytes_left  lvalue that receives the number of bytes NOT copied
 *                (the remainder below 32, or NBYTES itself if NBYTES < 32).
 *   nbytes       byte count.  The loop is controlled by js/jns, so the
 *                count is interpreted as signed: a value with the top bit
 *                set copies nothing.
 *
 * All four scratch registers are named in the clobber list.  Leaving out
 * "bx"/"cx" would allow the compiler to place one of the operands, or an
 * unrelated live value, in a register the asm overwrites.  "memory" is
 * listed because the asm stores through %0.
 *
 * NOTE(review): four clobbers plus three register operands leave no spare
 * general register on 32-bit x86; a build that keeps a frame pointer
 * (e.g. -O0) may be rejected by the compiler -- confirm on the target.
 */
#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes)              \
  do                                                                    \
    {                                                                   \
      __asm__ __volatile__ (                                            \
                    "sub        $32,%2\n"                               \
                    "js         2f\n"                                   \
                    "movl       0(%0),%%edx\n"  /* alloc dest line */   \
                    "1:\n"                                              \
                    "movl       28(%0),%%eax\n" /* alloc dest line */   \
                    "sub        $32,%2\n"       /* decr loop count */   \
                    "movl       0(%1),%%eax\n"  /* load bytes 0..15 */  \
                    "movl       4(%1),%%edx\n"                          \
                    "movl       8(%1),%%ebx\n"                          \
                    "movl       12(%1),%%ecx\n"                         \
                    "movl       %%eax,0(%0)\n"  /* store bytes 0..15 */ \
                    "movl       %%edx,4(%0)\n"                          \
                    "movl       %%ebx,8(%0)\n"                          \
                    "movl       %%ecx,12(%0)\n"                         \
                    "movl       16(%1),%%eax\n" /* load bytes 16..31 */ \
                    "movl       20(%1),%%edx\n"                         \
                    "movl       24(%1),%%ecx\n"                         \
                    "movl       28(%1),%%ebx\n"                         \
                    "movl       %%eax,16(%0)\n" /* store bytes 16..31 */ \
                    "movl       %%edx,20(%0)\n"                         \
                    "movl       %%ecx,24(%0)\n"                         \
                    "movl       %%ebx,28(%0)\n"                         \
                    "lea        32(%1),%1\n"    /* update src ptr */    \
                    "lea        32(%0),%0\n"    /* update dst ptr */    \
                    "jns        1b\n"           /* flags from the sub */ \
                    "2: add     $32,%2" :                               \
                    "=r" (dst_bp), "=r" (src_bp), "=r" (nbytes_left) :  \
                    "0" (dst_bp), "1" (src_bp), "2" (nbytes) :          \
                    "ax", "dx", "bx", "cx", "memory", "cc");            \
    } while (0)

/*
 * Copy n bytes from src to dest with the x86 string instructions.
 *
 * The two low bits of n are peeled off first (one movsb if bit 0 is set,
 * one movsw if bit 1 is set); the remaining n / 4 doublewords are moved
 * with "rep movsl".  The regions are copied front to back.
 *
 * The string instructions advance the destination/source index registers
 * and the shifts and "rep" consume the count register, so all three are
 * declared as read-write operands rather than as plain inputs.  The shift
 * has no size suffix so it operates on the full width of n.
 */
void my_memcpy(void *dest, const void *src, size_t n)
{
	__asm__ __volatile__(
		"	cld			\n"	/* copy upwards */
		"	shr	$1, %2		\n"	/* CF = bit 0 of n */
		"	jnc	1f		\n"
		"	movsb			\n"
		"1:	shr	$1, %2		\n"	/* CF = bit 1 of n */
		"	jnc	2f		\n"
		"	movsw			\n"
		"2:	rep movsl		\n"	/* count is now n / 4 */
		: "+D" (dest), "+S" (src), "+c" (n)
		: /* no input-only operands */
		: "memory", "cc");
}

/* Size in bytes of each benchmark buffer, and the amount copied per pass. */
#define BYTES (250 * (1 << 20))
/* Number of copy passes in each timed loop. */
#define N (10)

/*
 * Return the CPU time consumed so far by this process, in seconds:
 * user time plus system time as reported by getrusage(RUSAGE_SELF).
 *
 * If getrusage() fails, a message is printed and 0.0 is returned instead
 * of computing a result from an uninitialised structure.
 */
double get_cpu_usage(void)
{
        struct rusage usage;
        double user_sec, sys_sec;

        if (getrusage(RUSAGE_SELF, &usage) == -1)
        {
                printf("getrusage error\n");
                return 0.0;
        }

        user_sec = (double)usage.ru_utime.tv_sec + (double)usage.ru_utime.tv_usec / 1000000.0;
        sys_sec = (double)usage.ru_stime.tv_sec + (double)usage.ru_stime.tv_usec / 1000000.0;

        return user_sec + sys_sec;
}


/*
 * Benchmark driver.  Allocates two BYTES-sized buffers and times N full
 * copies from the first into the second with each of the three routines,
 * printing the CPU seconds used by each on its own line, in this order:
 * org_WORD_COPY_FWD, WORD_COPY_FWD, my_memcpy.
 *
 * Returns 0 on success, 1 if the buffers cannot be allocated.
 */
int main(int argc, char *argv[])
{
	char *pp1 = malloc(BYTES), *pp2 = malloc(BYTES);
	char *p2, *p1;
	unsigned int nbytes;
	unsigned int loop;
	double start;

	(void)argc;
	(void)argv;

	if (pp1 == NULL || pp2 == NULL)
	{
		fprintf(stderr, "cannot allocate 2 x %d bytes\n", BYTES);
		free(pp1);
		free(pp2);
		return 1;
	}

	/*
	 * Touch every page of both buffers before timing anything, so the
	 * first timed loop is not charged for faulting the memory in.
	 */
	memset(pp1, 0x55, BYTES);
	memset(pp2, 0, BYTES);

	start = get_cpu_usage();
	for(loop=0; loop<N; loop++)
	{
		/* the macro advances the pointers and overwrites the count */
		p2 = pp2;
		p1 = pp1;
		nbytes = BYTES;
		org_WORD_COPY_FWD(p2, p1, nbytes, nbytes);
	}
	printf("%f\n", get_cpu_usage() - start);

	start = get_cpu_usage();
	for(loop=0; loop<N; loop++)
	{
		p2 = pp2;
		p1 = pp1;
		nbytes = BYTES;
		WORD_COPY_FWD(p2, p1, nbytes, nbytes);
	}
	printf("%f\n", get_cpu_usage() - start);

	start = get_cpu_usage();
	for(loop=0; loop<N; loop++)
	{
		p2 = pp2;
		p1 = pp1;
		nbytes = BYTES;
		my_memcpy(p2, p1, nbytes);
	}
	printf("%f\n", get_cpu_usage() - start);

	free(pp2);
	free(pp1);

	return 0;
}

Reply to: