Re: libc-i686
Hi Jeff,
> > > > Any chance on a libc-p4?
> > > Any additional optimisations need to have benchmarks proving that
> > > they're worth the hassle. Every one that we add carries the cost of
> > > additional troubleshooting, etc.
> > > Do you have data that suggests that there's a notable improvement to be
> > > had?
> > For memcpy I found an 8,4% speed improvement when copying 100MB using
> > all 4 execution pipelines instead of 2 currently used.
>
> Sorry about the lag, I'm catching up on emails from when I was away. Is
> that 8,4% over libc6-i686 or over the regular libc6?
I did not use libc6 but pulled the 'WORD_COPY_FWD' macro out of glibc.
See attached source.
Folkert van Heusden
--
Temperature outside: Forbidden You don't have permission to access /temp.php on this server., temperature livingroom: 24.7
----------------------------------------------------------------------
Phone: +31-6-41278122, PGP-key: 1F28D8AE, www.vanheusden.com
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <sys/resource.h>
/*
 * org_WORD_COPY_FWD - baseline forward copy, 32 bytes per loop iteration.
 *
 * Copies every whole 32-byte chunk contained in `nbytes` from src_bp to
 * dst_bp, moving two 32-bit words at a time through EAX and EDX.
 *
 *   dst_bp, src_bp  pointer lvalues; on exit they point just past the
 *                   bytes that were copied.
 *   nbytes_left     lvalue that receives the number of bytes NOT copied
 *                   (the remainder below 32).
 *   nbytes          number of bytes requested.  The loop is driven by the
 *                   sign flag, so a count below 32, or one with the top
 *                   bit set, copies nothing.
 *
 * The destination is read before it is written (the "alloc dest line"
 * loads), so dst_bp must point at readable memory.
 *
 * Written for 32-bit x86: it uses 32-bit moves and `leal` on the pointer
 * operands.  The asm stores through dst_bp and changes the flags, so
 * "memory" and "cc" are listed as clobbers in addition to the two scratch
 * registers.
 */
#define org_WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes) \
do \
{ \
	asm volatile ("subl $32,%2\n" \
		"js 2f\n"              /* fewer than 32 bytes: skip loop */ \
		"movl 0(%0),%%edx\n"   /* alloc dest line */ \
		"1:\n" \
		"movl 28(%0),%%eax\n"  /* alloc dest line */ \
		"subl $32,%2\n"        /* decr loop count */ \
		"movl 0(%1),%%eax\n"   /* U pipe */ \
		"movl 4(%1),%%edx\n"   /* V pipe */ \
		"movl %%eax,0(%0)\n"   /* U pipe */ \
		"movl %%edx,4(%0)\n"   /* V pipe */ \
		"movl 8(%1),%%eax\n" \
		"movl 12(%1),%%edx\n" \
		"movl %%eax,8(%0)\n" \
		"movl %%edx,12(%0)\n" \
		"movl 16(%1),%%eax\n" \
		"movl 20(%1),%%edx\n" \
		"movl %%eax,16(%0)\n" \
		"movl %%edx,20(%0)\n" \
		"movl 24(%1),%%eax\n" \
		"movl 28(%1),%%edx\n" \
		"movl %%eax,24(%0)\n" \
		"movl %%edx,28(%0)\n" \
		"leal 32(%1),%1\n"     /* update src ptr (flags untouched) */ \
		"leal 32(%0),%0\n"     /* update dst ptr (flags untouched) */ \
		"jns 1b\n"             /* flags still come from the subl above */ \
		"2: addl $32,%2" :     /* undo the final over-subtraction */ \
		"=r" (dst_bp), "=r" (src_bp), "=r" (nbytes_left) : \
		"0" (dst_bp), "1" (src_bp), "2" (nbytes) : \
		"ax", "dx", "memory", "cc"); \
} while (0)
/*
 * WORD_COPY_FWD - experimental forward copy, 32 bytes per loop iteration,
 * using four scratch registers instead of two.
 *
 * Each iteration performs two groups of "four loads, then four stores"
 * through EAX, EDX, EBX and ECX.
 *
 *   dst_bp, src_bp  pointer lvalues; on exit they point just past the
 *                   bytes that were copied.
 *   nbytes_left     lvalue that receives the number of bytes NOT copied
 *                   (the remainder below 32).
 *   nbytes          number of bytes requested.  The loop is driven by the
 *                   sign flag, so a count below 32, or one with the top
 *                   bit set, copies nothing.
 *
 * The destination is read before it is written (the "alloc dest line"
 * loads), so dst_bp must point at readable memory.
 *
 * Written for 32-bit x86: it uses 32-bit moves and `leal` on the pointer
 * operands.
 *
 * All four scratch registers must appear in the clobber list; otherwise the
 * compiler is free to keep dst_bp, src_bp, the byte count or any other live
 * value in EBX/ECX, which the loop then overwrites.  "memory" and "cc" are
 * listed because the asm stores through dst_bp and changes the flags.
 */
#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes) \
do \
{ \
	asm volatile ("subl $32,%2\n" \
		"js 2f\n"              /* fewer than 32 bytes: skip loop */ \
		"movl 0(%0),%%edx\n"   /* alloc dest line */ \
		"1:\n" \
		"movl 28(%0),%%eax\n"  /* alloc dest line */ \
		"subl $32,%2\n"        /* decr loop count */ \
		"movl 0(%1),%%eax\n"   /* load bytes 0..15 */ \
		"movl 4(%1),%%edx\n" \
		"movl 8(%1),%%ebx\n" \
		"movl 12(%1),%%ecx\n" \
		"movl %%eax,0(%0)\n"   /* store bytes 0..15 */ \
		"movl %%edx,4(%0)\n" \
		"movl %%ebx,8(%0)\n" \
		"movl %%ecx,12(%0)\n" \
		"movl 16(%1),%%eax\n"  /* load bytes 16..31 */ \
		"movl 20(%1),%%edx\n" \
		"movl 24(%1),%%ecx\n" \
		"movl 28(%1),%%ebx\n" \
		"movl %%eax,16(%0)\n"  /* store bytes 16..31 */ \
		"movl %%edx,20(%0)\n" \
		"movl %%ecx,24(%0)\n" \
		"movl %%ebx,28(%0)\n" \
		"leal 32(%1),%1\n"     /* update src ptr (flags untouched) */ \
		"leal 32(%0),%0\n"     /* update dst ptr (flags untouched) */ \
		"jns 1b\n"             /* flags still come from the subl above */ \
		"2: addl $32,%2" :     /* undo the final over-subtraction */ \
		"=r" (dst_bp), "=r" (src_bp), "=r" (nbytes_left) : \
		"0" (dst_bp), "1" (src_bp), "2" (nbytes) : \
		"ax", "bx", "cx", "dx", "memory", "cc"); \
} while (0)
/*
 * Copy n bytes from src to dest using the x86 string-move instructions.
 *
 * The low two bits of n select an optional single byte (movsb) and an
 * optional 16-bit word (movsw); the remaining n / 4 double words are moved
 * with `rep movsl`.  The copy runs upwards in memory (cld), so it is only
 * suitable for regions where a forward copy is correct.
 *
 * EDI, ESI and ECX are all modified by the instructions, so they are bound
 * as read-write operands; declaring them as inputs only would let the
 * compiler assume they still hold dest, src and n afterwards.
 */
void my_memcpy(void *dest, const void *src, size_t n)
{
	__asm__ __volatile__(
		"    cld                \n"	/* string ops count upwards */
		"    shrl $1, %%ecx     \n"	/* CF = bit 0 of n */
		"    jnc 1f             \n"
		"    movsb              \n"	/* odd byte */
		"1:  shrl $1, %%ecx     \n"	/* CF = bit 1 of n */
		"    jnc 2f             \n"
		"    movsw              \n"	/* odd 16-bit word */
		"2:  rep                \n"
		"    movsl              \n"	/* ECX now holds n / 4 */
		: "+D" (dest), "+S" (src), "+c" (n)
		: /* no input-only operands */
		: "memory", "cc");
}
/* Size in bytes of each benchmark buffer and of every timed copy (250 * 1024 * 1024). */
#define BYTES (250 * 1024 * 1024)
/* Number of times each copy routine is repeated inside its timed loop. */
#define N 10
/*
 * Return the CPU time used so far by the calling process, in seconds.
 *
 * The value is the sum of the user and system times reported by
 * getrusage(RUSAGE_SELF), with the microsecond parts converted to
 * fractional seconds.
 *
 * If getrusage() fails, the error is reported with perror() and 0.0 is
 * returned; the rusage structure is not filled in on failure and must not
 * be read.
 */
double get_cpu_usage(void)
{
	struct rusage usage;

	if (getrusage(RUSAGE_SELF, &usage) == -1) {
		perror("getrusage");
		return 0.0;
	}

	return (double)usage.ru_utime.tv_sec + (double)usage.ru_utime.tv_usec / 1000000.0
	     + (double)usage.ru_stime.tv_sec + (double)usage.ru_stime.tv_usec / 1000000.0;
}
/*
 * Benchmark driver.
 *
 * Allocates a source and a destination buffer of BYTES bytes, then times
 * N full-buffer copies with each of org_WORD_COPY_FWD, WORD_COPY_FWD and
 * my_memcpy, in that order.  For each routine the consumed CPU time in
 * seconds is printed on its own line.
 *
 * Returns 0 on success, 1 if the buffers cannot be allocated.
 */
int main(int argc, char *argv[])
{
	char *pp1 = malloc(BYTES);	/* source buffer */
	char *pp2 = malloc(BYTES);	/* destination buffer */
	char *p1, *p2;
	unsigned int nbytes;
	unsigned int loop;
	double start;

	(void)argc;
	(void)argv;

	if (pp1 == NULL || pp2 == NULL) {
		fprintf(stderr, "cannot allocate 2 x %d bytes\n", BYTES);
		free(pp1);
		free(pp2);
		return 1;
	}

	/*
	 * Give the source defined contents and touch every page of both
	 * buffers before timing starts, so that the first timed loop does
	 * not pay the page-fault cost on behalf of the others.
	 */
	memset(pp1, 0x5a, BYTES);
	memset(pp2, 0, BYTES);

	/* 1: baseline macro, two scratch registers. */
	start = get_cpu_usage();
	for (loop = 0; loop < N; loop++) {
		/* The macros advance the pointers and overwrite the count. */
		p2 = pp2;
		p1 = pp1;
		nbytes = BYTES;
		org_WORD_COPY_FWD(p2, p1, nbytes, nbytes);
	}
	printf("%f\n", get_cpu_usage() - start);

	/* 2: modified macro, four scratch registers. */
	start = get_cpu_usage();
	for (loop = 0; loop < N; loop++) {
		p2 = pp2;
		p1 = pp1;
		nbytes = BYTES;
		WORD_COPY_FWD(p2, p1, nbytes, nbytes);
	}
	printf("%f\n", get_cpu_usage() - start);

	/* 3: string-instruction copy. */
	start = get_cpu_usage();
	for (loop = 0; loop < N; loop++) {
		p2 = pp2;
		p1 = pp1;
		nbytes = BYTES;
		my_memcpy(p2, p1, nbytes);
	}
	printf("%f\n", get_cpu_usage() - start);

	free(pp2);
	free(pp1);
	return 0;
}
Reply to: