[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Re: Help on memchr() EGLIBC assembly code



On 07/30/2009 09:29 AM, Aurelien Jarno wrote:
Thanks for this patch I have tried it, and it does not have the original
problem I have reported. Unfortunately it does not pass the glibc
testsuite. I'll try to debug the problem later (I don't own an alpha
machine, and need to have internet access to debug it remotely).


This one passes the test suite (on x86, with replacements for the alpha builtins).


r~
typedef unsigned long word;

#ifdef __alpha__
#define cmpbeq0(X)	__builtin_alpha_cmpbge(0, (X))
#define extql		__builtin_alpha_extql
#define extqh		__builtin_alpha_extqh
#define insbl		__builtin_alpha_insbl
#define zap		__builtin_alpha_zap
#else
static word
cmpbeq0(word y)
{
  word i, r = 0;
  for (i = 0; i < 8; ++i)
    {
      unsigned char yc = (y >> i*8);
      if (yc == 0)
	r |= 1 << i;
    }
  return r;
}

static word
extql(word x, word y)
{
  return x >> ((y & 7) * 8);
}

static word
extqh(word x, word y)
{
  return x << (64 - ((y & 7) * 8));
}

static word
zap(word x, word y)
{
  word i, mask = 0;
  for (i = 0; i < 8; ++i)
    if (y & (1 << i))
      mask |= (word)0xff << (i * 8);
  return x & ~mask;
}

static word
insbl(word x, word y)
{
  word byte_mask = 1ul << (y & 7);
  word byte_loc = (y & 7) * 8;
  return zap (x << byte_loc, ~byte_mask);
}
#endif

static inline word
ldq_u(word s)
{
  return *(const word *)(s & -8);
}

#define unlikely(X)	__builtin_expect ((X), 0)
#define prefetch(X)	__builtin_prefetch ((void *)(X), 0)

#define find(X, Y)	cmpbeq0 ((X) ^ (Y))

void *
memchr (const void *xs, int xc, word n)
{
  const word *s_align;
  const word s = (word) xs;
  word t, current, found;

  if (unlikely (n == 0))
    return 0;

  current = ldq_u (s);

  /* Replicate low byte of XC into all bytes of C.  */
  t = insbl (xc, 1);			/* 000000c0 */
  t = (xc & 0xff) | t;			/* 000000cc */
  t = (t << 16) | t;			/* 0000cccc */
  const word c = (t << 32) | t;		/* cccccccc */

  /* When N <= 8, we can perform the entire comparison in one go.
     Load the N bytes into the low part of CURRENT, via an unaligned
     quadword load sequence, and treat it as the last aligned word read.  */
  if (unlikely (n <= 8))
    {
      /* Tweak the standard unaligned quadword load sequence by issuing
	 the second ldq_u at (s + n - 1) instead of (s + 8 - 1).  This
	 avoids crossing a page boundary when S+N doesn't.  */
      word last = extqh (ldq_u (s + n - 1), s);
      word first = extql (current, s);
      current = first | last;
      s_align = (const word *) s;
      goto last_quad;
    }

  /* Align the source, and decrement the count by the number
     of bytes searched in the first word.  */
  s_align = (const word *)(s & -8);
  n += (s & 7);

  /* Deal with misalignment in the first word for the comparison.  */
  {
    word mask = -1ul << (s & 7);
    found = find (current, c) & mask;
    if (unlikely (found))
      goto found_it;
  }

  s_align++;
  n -= 8;

  /* If the block is sufficiently large, align to cacheline and prefetch.  */
  if (unlikely (n >= 256))
    {
      /* Prefetch 3 cache lines beyond the one we're working on.  */
      prefetch (s_align + 8);
      prefetch (s_align + 16);
      prefetch (s_align + 24);

      while ((word)s_align & 63)
	{
	  current = *s_align;
	  found = find (current, c);
	  if (found)
	    goto found_it;
	  s_align++;
	  n -= 8;
	}

	/* Within each cacheline, advance the load for the next word
	   before the test for the previous word is complete.  This
	   allows us to hide the 3 cycle L1 cache load latency.  We
	   only perform this advance load within a cacheline to prevent
	   reading across page boundary.  */
#define CACHELINE_LOOP				\
	do {					\
	  word i, next = s_align[0];		\
	  for (i = 0; i < 7; ++i)		\
	    {					\
	      current = next;			\
	      next = s_align[1];		\
	      found = find (current, c);	\
	      if (unlikely (found))		\
		goto found_it;			\
	      s_align++;			\
	    }					\
	  current = next;			\
	  found = find (current, c);		\
	  if (unlikely (found))			\
	    goto found_it;			\
	  s_align++;				\
	  n -= 64;				\
	} while (0)
      
      /* While there's still lots more data to potentially be read,
	 continue issuing prefetches for the 4th cacheline out.  */
      while (n >= 256)
	{
	  prefetch (s_align + 24);
	  CACHELINE_LOOP;
	}

      /* Up to 3 cache lines remaining.  Continue issuing advanced
	 loads, but stop prefetching.  */
      while (n >= 64)
	CACHELINE_LOOP;
    }

  /* Quadword aligned loop.  */
  current = *s_align;
  while (n > 8)
    {
      found = find (current, c);
      if (unlikely (found))
	goto found_it;
      current = *++s_align;
      n -= 8;
    }

 last_quad:
  {
    word mask = -1ul << n;
    found = find (current, c) & ~mask;
    if (found == 0)
      return 0;
  }

 found_it:
  {
    word offset;

#ifdef __alpha_cix__
    offset = __builtin_alpha_cttz (found);
#else
    /* Extract LSB.  */
    found &= -found;

    /* Binary search for the LSB.  */
    offset  = (found & 0x0f ? 0 : 4);
    offset += (found & 0x33 ? 0 : 2);
    offset += (found & 0x55 ? 0 : 1);
#endif

    return (void *)((word)s_align + offset);
  }
}

Reply to: