Re: Help on memchr() EGLIBC assembly code
On 07/30/2009 09:29 AM, Aurelien Jarno wrote:
Thanks for this patch I have tried it, and it does not have the original
problem I have reported. Unfortunately it does not pass the glibc
testsuite. I'll try to debug the problem later (I don't own an alpha
machine, and need to have internet access to debug it remotely).
This one passes the test suite (on x86, with replacements for the alpha
builtins).
r~
typedef unsigned long word;
#ifdef __alpha__
#define cmpbeq0(X) __builtin_alpha_cmpbge(0, (X))
#define extql __builtin_alpha_extql
#define extqh __builtin_alpha_extqh
#define insbl __builtin_alpha_insbl
#define zap __builtin_alpha_zap
#else
static word
cmpbeq0(word y)
{
word i, r = 0;
for (i = 0; i < 8; ++i)
{
unsigned char yc = (y >> i*8);
if (yc == 0)
r |= 1 << i;
}
return r;
}
static word
extql(word x, word y)
{
return x >> ((y & 7) * 8);
}
static word
extqh(word x, word y)
{
return x << (64 - ((y & 7) * 8));
}
static word
zap(word x, word y)
{
word i, mask = 0;
for (i = 0; i < 8; ++i)
if (y & (1 << i))
mask |= (word)0xff << (i * 8);
return x & ~mask;
}
static word
insbl(word x, word y)
{
word byte_mask = 1ul << (y & 7);
word byte_loc = (y & 7) * 8;
return zap (x << byte_loc, ~byte_mask);
}
#endif
static inline word
ldq_u(word s)
{
return *(const word *)(s & -8);
}
#define unlikely(X) __builtin_expect ((X), 0)
#define prefetch(X) __builtin_prefetch ((void *)(X), 0)
#define find(X, Y) cmpbeq0 ((X) ^ (Y))
void *
memchr (const void *xs, int xc, word n)
{
const word *s_align;
const word s = (word) xs;
word t, current, found;
if (unlikely (n == 0))
return 0;
current = ldq_u (s);
/* Replicate low byte of XC into all bytes of C. */
t = insbl (xc, 1); /* 000000c0 */
t = (xc & 0xff) | t; /* 000000cc */
t = (t << 16) | t; /* 0000cccc */
const word c = (t << 32) | t; /* cccccccc */
/* When N <= 8, we can perform the entire comparison in one go.
Load the N bytes into the low part of CURRENT, via an unaligned
quadword load sequence, and treat it as the last aligned word read. */
if (unlikely (n <= 8))
{
/* Tweak the standard unaligned quadword load sequence by issuing
the second ldq_u at (s + n - 1) instead of (s + 8 - 1). This
avoids crossing a page boundary when S+N doesn't. */
word last = extqh (ldq_u (s + n - 1), s);
word first = extql (current, s);
current = first | last;
s_align = (const word *) s;
goto last_quad;
}
/* Align the source, and decrement the count by the number
of bytes searched in the first word. */
s_align = (const word *)(s & -8);
n += (s & 7);
/* Deal with misalignment in the first word for the comparison. */
{
word mask = -1ul << (s & 7);
found = find (current, c) & mask;
if (unlikely (found))
goto found_it;
}
s_align++;
n -= 8;
/* If the block is sufficiently large, align to cacheline and prefetch. */
if (unlikely (n >= 256))
{
/* Prefetch 3 cache lines beyond the one we're working on. */
prefetch (s_align + 8);
prefetch (s_align + 16);
prefetch (s_align + 24);
while ((word)s_align & 63)
{
current = *s_align;
found = find (current, c);
if (found)
goto found_it;
s_align++;
n -= 8;
}
/* Within each cacheline, advance the load for the next word
before the test for the previous word is complete. This
allows us to hide the 3 cycle L1 cache load latency. We
only perform this advance load within a cacheline to prevent
reading across page boundary. */
#define CACHELINE_LOOP \
do { \
word i, next = s_align[0]; \
for (i = 0; i < 7; ++i) \
{ \
current = next; \
next = s_align[1]; \
found = find (current, c); \
if (unlikely (found)) \
goto found_it; \
s_align++; \
} \
current = next; \
found = find (current, c); \
if (unlikely (found)) \
goto found_it; \
s_align++; \
n -= 64; \
} while (0)
/* While there's still lots more data to potentially be read,
continue issuing prefetches for the 4th cacheline out. */
while (n >= 256)
{
prefetch (s_align + 24);
CACHELINE_LOOP;
}
/* Up to 3 cache lines remaining. Continue issuing advanced
loads, but stop prefetching. */
while (n >= 64)
CACHELINE_LOOP;
}
/* Quadword aligned loop. */
current = *s_align;
while (n > 8)
{
found = find (current, c);
if (unlikely (found))
goto found_it;
current = *++s_align;
n -= 8;
}
last_quad:
{
word mask = -1ul << n;
found = find (current, c) & ~mask;
if (found == 0)
return 0;
}
found_it:
{
word offset;
#ifdef __alpha_cix__
offset = __builtin_alpha_cttz (found);
#else
/* Extract LSB. */
found &= -found;
/* Binary search for the LSB. */
offset = (found & 0x0f ? 0 : 4);
offset += (found & 0x33 ? 0 : 2);
offset += (found & 0x55 ? 0 : 1);
#endif
return (void *)((word)s_align + offset);
}
}
Reply to: