[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Re: Help on memchr() EGLIBC assembly code



On 07/26/2009 04:45 PM, Aurelien Jarno wrote:
Knowing that $31 could be used for prefetch, I have modified the
assembly code from memchr.S to use it. It passes all the testsuite.


This isn't intended to be a prefetch instruction, it's
meant to be fetching the data for the next word.  I.e.
we've unrolled the loop and there's at least 8 bytes
left in the search.

Note the

        # At least two quads remain to be accessed.

comment.  At that point we're supposed to be more
than 16 bytes away from the end of the input buffer.

Actually, the confusion I see is farther upthread:

> >>>>> The problem is that the memchr() function on alpha uses prefetch, which > >>>>> can cause a page boundary to be crossed, while the standards (POSIX and
> >>>>> C99) says it should stop when a match is found.

I didn't realize this when I wrote the function.

The entire function should be rewritten, since there's little
point in using a prefetch instruction that close to the load.
Prefetch instructions should be used to move data into the L1
cache, not hide the 3 cycle load delay.  Thus a prefetch, if
used, should be several cache lines ahead, not just a single word.

Perhaps a better solution would be to read words until we get
cacheline aligned, then read the entire line into 8 registers,
prefetch the next line, then process each register one by one.

Try this.


r~
typedef unsigned long word;

#define ldq(X)		(*(const word *)(X))
#define ldq_u(X)	(*(const word *)((X) & -8))

#define cmpbge		__builtin_alpha_cmpbge
#define extql		__builtin_alpha_extql
#define extqh		__builtin_alpha_extqh
#define insbl		__builtin_alpha_insbl
#define insqh		__builtin_alpha_insqh
#define zap		__builtin_alpha_zap

#define unlikely(X)	__builtin_expect ((X), 0)
#define prefetch(X)	__builtin_prefetch ((void *)(X), 0)

#define find(X, Y)	cmpbge (0, (X) ^ (Y))

void *memchr (const void *xs, int xc, word n)
{
  word s = (word)xs;
  word c;
  word current, found;
  word s_align;

  if (unlikely (n == 0))
    return 0;

  current = ldq_u (s);

  /* Replicate low byte of C into all bytes.  */
  {
    word t = insbl (xc, 1);		/* 000000c0 */
    c = (xc & 0xff) | t;		/* 000000cc */
    c = (c << 16) | c;			/* 0000cccc */
    c = (c << 32) | c;			/* cccccccc */
  }

  if (unlikely (n < 9))
    goto only_quad;

  /* Align the source, and decrement the count by the number
     of bytes searched in the first word.  */
  s_align = s & -8;
  n += (s & 7);
  n -= 8;

  /* Deal with misalignment in the first word for the comparison.  */
  {
    word mask = insqh (-1, s);
    found = cmpbge (0, (current ^ c) | mask);
    if (found)
      goto found_it;
  }

  s_align += 8;

  /* If the block is sufficiently large, align to cacheline (minimum 64-bytes),
     prefetch the next line, and read entire cachelines at a time.  */
  if (unlikely (n >= 256))
    {
      prefetch (s_align + 64);
      while (s_align & 63)
	{
	  current = ldq (s_align);
	  found = find (current, c);
	  if (found)
	    goto found_it;
	  s_align += 8;
	  n -= 8;
	}

      while (n > 64)
	{
	  word c0, c1, c2, c3, c4, c5, c6, c7;

	  prefetch (s_align + 64);
	  c0 = ldq (s_align + 0*8);
	  c1 = ldq (s_align + 1*8);
	  c2 = ldq (s_align + 2*8);
	  c3 = ldq (s_align + 3*8);
	  c4 = ldq (s_align + 4*8);
	  c5 = ldq (s_align + 5*8);
	  c6 = ldq (s_align + 6*8);
	  c7 = ldq (s_align + 7*8);

	  found = find (c0, c);
	  if (unlikely (found))
	    goto found_it;
	  s_align += 8;

	  found = find (c1, c);
	  if (unlikely (found))
	    goto found_it;
	  s_align += 8;

	  found = find (c2, c);
	  if (unlikely (found))
	    goto found_it;
	  s_align += 8;

	  found = find (c3, c);
	  if (unlikely (found))
	    goto found_it;
	  s_align += 8;

	  found = find (c4, c);
	  if (unlikely (found))
	    goto found_it;
	  s_align += 8;

	  found = find (c5, c);
	  if (unlikely (found))
	    goto found_it;
	  s_align += 8;

	  found = find (c6, c);
	  if (unlikely (found))
	    goto found_it;
	  s_align += 8;

	  found = find (c7, c);
	  if (unlikely (found))
	    goto found_it;
	  s_align += 8;
	  n -= 64;
	}
    }

  /* Quadword aligned loop.  */
  while (1)
    {
      current = ldq (s_align);
      if (n < 8)
	goto last_quad;
      found = find (current, c);
      if (found)
	goto found_it;

      s_align += 8;
      n -= 8;
    }

 only_quad:
  {
    word end = zap (n, 0x80) - 1;
    word last = extqh (ldq_u (s + end), s);
    word first = extql (current, s);
    current = first | last;

    /* We've read one word and aligned it in the register.  Thus the 
       bit offset in current is relative to S.  */
    s_align = s;
  }

 last_quad:
  {
    word mask = (-1ul >> -n); 
    found = find (current, c) & mask;
    if (found == 0)
      return 0;
  }

 found_it:
  {
    word offset;

#ifdef __alpha_cix__
    offset = __builtin_alpha_cttz (found);
#else
    /* Extract LSB.  */
    found &= -found;

    /* Binary search for the LSB.  */
    offset  = (found & 0x0f ? 0 : 4);
    offset += (found & 0x33 ? 0 : 2);
    offset += (found & 0x55 ? 0 : 1);
#endif

    return (void *)(s_align + offset);
  }
}

Reply to: