
[glibc] 01/01: debian/patches/arm/git-arm64-memcmp.diff: Backport optimized memcmp for AArch64, improving performance from 25% to 500% (LP: #1720832)



This is an automated email from the git hooks/post-receive script.

adconrad pushed a commit to branch glibc-2.26
in repository glibc.

commit d4f6d3805e95af3f6aaf47d16ed6eac7783391f4
Author: Adam Conrad <adconrad@0c3.net>
Date:   Wed Oct 11 14:08:40 2017 -0600

    debian/patches/arm/git-arm64-memcmp.diff: Backport optimized memcmp for AArch64, improving performance from 25% to 500% (LP: #1720832)
---
 debian/changelog                         |   2 +
 debian/patches/arm/git-arm64-memcmp.diff | 232 +++++++++++++++++++++++++++++++
 debian/patches/series                    |   1 +
 3 files changed, 235 insertions(+)

diff --git a/debian/changelog b/debian/changelog
index 1356817..8e797cb 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -30,6 +30,8 @@ glibc (2.26-0experimental0) UNRELEASED; urgency=medium
       agree with the sorting we see in Debian, may need another look.
     - debian/patches/any/local-cudacc-float128.diff: Local patch to prevent
       defining __HAVE_FLOAT128 on NVIDIA's CUDA compilers (LP: #1717257)
+    - debian/patches/arm/git-arm64-memcmp.diff: Backport optimized memcmp
+      for AArch64, improving performance from 25% to 500% (LP: #1720832)
     - debian/control.in/libc: Drop ancient Breaks satisfied in oldoldstable.
     - debian/{debhelper.in/libc.preinst,sysdeps/amd64.mk,sysdeps/i386.mk}:
       Bump MIN_KERNEL_SUPPORTED to 3.2 on x86, following upstream's change.
diff --git a/debian/patches/arm/git-arm64-memcmp.diff b/debian/patches/arm/git-arm64-memcmp.diff
new file mode 100644
index 0000000..4b31caf
--- /dev/null
+++ b/debian/patches/arm/git-arm64-memcmp.diff
@@ -0,0 +1,232 @@
+commit 922369032c604b4dcfd535e1bcddd4687e7126a5
+Author: Wilco Dijkstra <wdijkstr@arm.com>
+Date:   Thu Aug 10 17:00:38 2017 +0100
+
+    [AArch64] Optimized memcmp.
+    
+    This is an optimized memcmp for AArch64.  This is a complete rewrite
+    using a different algorithm.  The previous version split into cases
+    where both inputs were aligned, the inputs were mutually aligned and
+    unaligned using a byte loop.  The new version combines all these cases,
+    while small inputs of less than 8 bytes are handled separately.
+    
+    This allows the main code to be sped up using unaligned loads since
+    there are now at least 8 bytes to be compared.  After the first 8 bytes,
+    align the first input.  This ensures each iteration does at most one
+    unaligned access and mutually aligned inputs behave as aligned.
+    After the main loop, process the last 8 bytes using unaligned accesses.
+    
+    This improves performance of (mutually) aligned cases by 25% and
+    unaligned by >500% (yes >6 times faster) on large inputs.
+    
+            * sysdeps/aarch64/memcmp.S (memcmp):
+            Rewrite of optimized memcmp.
+
+diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
+index 4cfcb89297..b99c081bba 100644
+--- a/sysdeps/aarch64/memcmp.S
++++ b/sysdeps/aarch64/memcmp.S
+@@ -22,132 +22,98 @@
+ 
+ /* Assumptions:
+  *
+- * ARMv8-a, AArch64
++ * ARMv8-a, AArch64, unaligned accesses.
+  */
+ 
+ /* Parameters and result.  */
+ #define src1		x0
+ #define src2		x1
+ #define limit		x2
+-#define result		x0
++#define result		w0
+ 
+ /* Internal variables.  */
+ #define data1		x3
+ #define data1w		w3
+ #define data2		x4
+ #define data2w		w4
+-#define has_nul		x5
+-#define diff		x6
+-#define endloop		x7
+-#define tmp1		x8
+-#define tmp2		x9
+-#define tmp3		x10
+-#define pos		x11
+-#define limit_wd	x12
+-#define mask		x13
++#define tmp1		x5
+ 
+ ENTRY_ALIGN (memcmp, 6)
+ 	DELOUSE (0)
+ 	DELOUSE (1)
+ 	DELOUSE (2)
+-	cbz	limit, L(ret0)
+-	eor	tmp1, src1, src2
+-	tst	tmp1, #7
+-	b.ne	L(misaligned8)
+-	ands	tmp1, src1, #7
+-	b.ne	L(mutual_align)
+-	add	limit_wd, limit, #7
+-	lsr	limit_wd, limit_wd, #3
+-	/* Start of performance-critical section  -- one 64B cache line.  */
+-L(loop_aligned):
+-	ldr	data1, [src1], #8
+-	ldr	data2, [src2], #8
+-L(start_realigned):
+-	subs	limit_wd, limit_wd, #1
+-	eor	diff, data1, data2	/* Non-zero if differences found.  */
+-	csinv	endloop, diff, xzr, ne	/* Last Dword or differences.  */
+-	cbz	endloop, L(loop_aligned)
+-	/* End of performance-critical section  -- one 64B cache line.  */
+-
+-	/* Not reached the limit, must have found a diff.  */
+-	cbnz	limit_wd, L(not_limit)
+-
+-	/* Limit % 8 == 0 => all bytes significant.  */
+-	ands	limit, limit, #7
+-	b.eq	L(not_limit)
+-
+-	lsl	limit, limit, #3	/* Bits -> bytes.  */
+-	mov	mask, #~0
+-#ifdef __AARCH64EB__
+-	lsr	mask, mask, limit
+-#else
+-	lsl	mask, mask, limit
+-#endif
+-	bic	data1, data1, mask
+-	bic	data2, data2, mask
+-
+-	orr	diff, diff, mask
+-L(not_limit):
+ 
+-#ifndef	__AARCH64EB__
+-	rev	diff, diff
++	subs	limit, limit, 8
++	b.lo	.Lless8
++
++	/* Limit >= 8, so check first 8 bytes using unaligned loads.  */
++	ldr	data1, [src1], 8
++	ldr	data2, [src2], 8
++	and	tmp1, src1, 7
++	add	limit, limit, tmp1
++	cmp	data1, data2
++	bne	.Lreturn
++
++	/* Align src1 and adjust src2 with bytes not yet done.  */
++	sub	src1, src1, tmp1
++	sub	src2, src2, tmp1
++
++	subs	limit, limit, 8
++	b.ls	.Llast_bytes
++
++	/* Loop performing 8 bytes per iteration using aligned src1.
++	   Limit is pre-decremented by 8 and must be larger than zero.
++	   Exit if <= 8 bytes left to do or if the data is not equal.  */
++	.p2align 4
++.Lloop8:
++	ldr	data1, [src1], 8
++	ldr	data2, [src2], 8
++	subs	limit, limit, 8
++	ccmp	data1, data2, 0, hi  /* NZCV = 0b0000.  */
++	b.eq	.Lloop8
++
++	cmp	data1, data2
++	bne	.Lreturn
++
++	/* Compare last 1-8 bytes using unaligned access.  */
++.Llast_bytes:
++	ldr	data1, [src1, limit]
++	ldr	data2, [src2, limit]
++
++	/* Compare data bytes and set return value to 0, -1 or 1.  */
++.Lreturn:
++#ifndef __AARCH64EB__
+ 	rev	data1, data1
+ 	rev	data2, data2
+ #endif
+-	/* The MS-non-zero bit of DIFF marks either the first bit
+-	   that is different, or the end of the significant data.
+-	   Shifting left now will bring the critical information into the
+-	   top bits.  */
+-	clz	pos, diff
+-	lsl	data1, data1, pos
+-	lsl	data2, data2, pos
+-	/* But we need to zero-extend (char is unsigned) the value and then
+-	   perform a signed 32-bit subtraction.  */
+-	lsr	data1, data1, #56
+-	sub	result, data1, data2, lsr #56
+-	RET
+-
+-L(mutual_align):
+-	/* Sources are mutually aligned, but are not currently at an
+-	   alignment boundary.  Round down the addresses and then mask off
+-	   the bytes that precede the start point.  */
+-	bic	src1, src1, #7
+-	bic	src2, src2, #7
+-	add	limit, limit, tmp1	/* Adjust the limit for the extra.  */
+-	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
+-	ldr	data1, [src1], #8
+-	neg	tmp1, tmp1		/* Bits to alignment -64.  */
+-	ldr	data2, [src2], #8
+-	mov	tmp2, #~0
+-#ifdef __AARCH64EB__
+-	/* Big-endian.  Early bytes are at MSB.  */
+-	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+-#else
+-	/* Little-endian.  Early bytes are at LSB.  */
+-	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+-#endif
+-	add	limit_wd, limit, #7
+-	orr	data1, data1, tmp2
+-	orr	data2, data2, tmp2
+-	lsr	limit_wd, limit_wd, #3
+-	b	L(start_realigned)
+-
+-L(ret0):
+-	mov	result, #0
+-	RET
+-
+-	.p2align 6
+-L(misaligned8):
+-	sub	limit, limit, #1
+-1:
+-	/* Perhaps we can do better than this.  */
+-	ldrb	data1w, [src1], #1
+-	ldrb	data2w, [src2], #1
+-	subs	limit, limit, #1
+-	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+-	b.eq	1b
+-	sub	result, data1, data2
+-	RET
++	cmp     data1, data2
++.Lret_eq:
++	cset	result, ne
++	cneg	result, result, lo
++	ret
++
++	.p2align 4
++	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
++.Lless8:
++	adds	limit, limit, 4
++	b.lo	.Lless4
++	ldr	data1w, [src1], 4
++	ldr	data2w, [src2], 4
++	cmp	data1w, data2w
++	b.ne	.Lreturn
++	sub	limit, limit, 4
++.Lless4:
++	adds	limit, limit, 4
++	beq	.Lret_eq
++.Lbyte_loop:
++	ldrb	data1w, [src1], 1
++	ldrb	data2w, [src2], 1
++	subs	limit, limit, 1
++	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
++	b.eq	.Lbyte_loop
++	sub	result, data1w, data2w
++	ret
++
+ END (memcmp)
+ #undef bcmp
+ weak_alias (memcmp, bcmp)
diff --git a/debian/patches/series b/debian/patches/series
index 1548798..060efe9 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -43,6 +43,7 @@ arm/local-soname-hack.diff
 arm/local-vfp-sysdeps.diff
 arm/unsubmitted-ldso-multilib.diff
 arm/local-arm-futex.diff
+arm/git-arm64-memcmp.diff
 
 hppa/local-inlining.diff
 hppa/local-elf-make-cflags.diff
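
For readers who do not follow AArch64 assembly, the control flow that the patch's commit message describes can be sketched in C roughly as follows. This is an illustrative sketch only, not the shipped implementation (which is the assembly in sysdeps/aarch64/memcmp.S above); it assumes a little-endian host and uses hypothetical helpers load32/load64/return_cmp.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Hypothetical helpers: unaligned 32-bit and 64-bit loads.  */
static uint32_t load32 (const unsigned char *p)
{
  uint32_t v;
  memcpy (&v, p, sizeof v);
  return v;
}

static uint64_t load64 (const unsigned char *p)
{
  uint64_t v;
  memcpy (&v, p, sizeof v);
  return v;
}

/* Mirror of the .Lreturn path: byte-reverse (the rev instructions,
   little-endian case) so the first differing byte becomes the most
   significant one, then one unsigned comparison gives the sign.  */
static int return_cmp (uint64_t d1, uint64_t d2)
{
  d1 = __builtin_bswap64 (d1);
  d2 = __builtin_bswap64 (d2);
  return (d1 > d2) - (d1 < d2);
}

int memcmp_sketch (const void *s1, const void *s2, size_t n)
{
  const unsigned char *p1 = s1, *p2 = s2;

  if (n < 8)
    {
      /* Small inputs (.Lless8/.Lless4): one 4-byte chunk if possible,
         then a byte loop.  */
      if (n >= 4)
        {
          uint32_t a4 = load32 (p1), b4 = load32 (p2);
          if (a4 != b4)
            return return_cmp (a4, b4);
          p1 += 4; p2 += 4; n -= 4;
        }
      for (; n != 0; n--, p1++, p2++)
        if (*p1 != *p2)
          return *p1 - *p2;
      return 0;
    }

  /* Check the first 8 bytes using unaligned loads.  */
  uint64_t a = load64 (p1), b = load64 (p2);
  if (a != b)
    return return_cmp (a, b);

  /* Align the first input; the bytes stepped back over were already
     compared, so they are simply compared again.  */
  size_t skew = (uintptr_t) p1 & 7;
  n += skew;
  p1 += 8 - skew;
  p2 += 8 - skew;
  n -= 8;

  /* Main loop (.Lloop8): 8 bytes per iteration with p1 aligned, so at
     most one unaligned access (p2); exit when <= 8 bytes remain or the
     data differs.  */
  while (n > 8)
    {
      a = load64 (p1); b = load64 (p2);
      if (a != b)
        return return_cmp (a, b);
      p1 += 8; p2 += 8; n -= 8;
    }

  /* Last 1-8 bytes (.Llast_bytes): one unaligned load ending exactly
     at the end of each buffer, overlapping bytes already compared.  */
  a = load64 (p1 + n - 8);
  b = load64 (p2 + n - 8);
  return return_cmp (a, b);
}

The byte swap in return_cmp corresponds to the rev instructions in the patch: it moves the first differing byte into the most significant position so a single unsigned comparison yields the sign of the result. On big-endian (__AARCH64EB__) the assembly skips that step, since the loads already deliver the bytes most-significant-first.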

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-glibc/glibc.git

