[glibc] 01/01: debian/patches/arm/git-arm64-memcmp.diff: Backport optimized memcmp for AArch64, improving performance by 25% to over 500% (LP: #1720832)
This is an automated email from the git hooks/post-receive script.
adconrad pushed a commit to branch glibc-2.26
in repository glibc.
commit d4f6d3805e95af3f6aaf47d16ed6eac7783391f4
Author: Adam Conrad <adconrad@0c3.net>
Date: Wed Oct 11 14:08:40 2017 -0600
debian/patches/arm/git-arm64-memcmp.diff: Backport optimized memcmp for AArch64, improving performance by 25% to over 500% (LP: #1720832)
---
debian/changelog | 2 +
debian/patches/arm/git-arm64-memcmp.diff | 232 +++++++++++++++++++++++++++++++
debian/patches/series | 1 +
3 files changed, 235 insertions(+)
diff --git a/debian/changelog b/debian/changelog
index 1356817..8e797cb 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -30,6 +30,8 @@ glibc (2.26-0experimental0) UNRELEASED; urgency=medium
agree with the sorting we see in Debian, may need another look.
- debian/patches/any/local-cudacc-float128.diff: Local patch to prevent
defining __HAVE_FLOAT128 on NVIDIA's CUDA compilers (LP: #1717257)
+ - debian/patches/arm/git-arm64-memcmp.diff: Backport optimized memcmp
+ for AArch64, improving performance by 25% to over 500% (LP: #1720832)
- debian/control.in/libc: Drop ancient Breaks satisfied in oldoldstable.
- debian/{debhelper.in/libc.preinst,sysdeps/amd64.mk,sysdeps/i386.mk}:
Bump MIN_KERNEL_SUPPORTED to 3.2 on x86, following upstream's change.
diff --git a/debian/patches/arm/git-arm64-memcmp.diff b/debian/patches/arm/git-arm64-memcmp.diff
new file mode 100644
index 0000000..4b31caf
--- /dev/null
+++ b/debian/patches/arm/git-arm64-memcmp.diff
@@ -0,0 +1,232 @@
+commit 922369032c604b4dcfd535e1bcddd4687e7126a5
+Author: Wilco Dijkstra <wdijkstr@arm.com>
+Date: Thu Aug 10 17:00:38 2017 +0100
+
+ [AArch64] Optimized memcmp.
+
+ This is an optimized memcmp for AArch64. It is a complete rewrite
+ using a different algorithm. The previous version was split into three
+ cases: both inputs aligned, inputs mutually aligned, and unaligned inputs
+ (handled by a byte loop). The new version combines all these cases,
+ while small inputs of less than 8 bytes are handled separately.
+
+ This allows the main code to be sped up using unaligned loads since
+ there are now at least 8 bytes to be compared. After the first 8 bytes,
+ align the first input. This ensures each iteration does at most one
+ unaligned access and mutually aligned inputs behave as aligned.
+ After the main loop, process the last 8 bytes using unaligned accesses.
+
+ This improves performance of (mutually) aligned cases by 25% and
+ unaligned by >500% (yes >6 times faster) on large inputs.
+
+ * sysdeps/aarch64/memcmp.S (memcmp):
+ Rewrite of optimized memcmp.
+
+diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
+index 4cfcb89297..b99c081bba 100644
+--- a/sysdeps/aarch64/memcmp.S
++++ b/sysdeps/aarch64/memcmp.S
+@@ -22,132 +22,98 @@
+
+ /* Assumptions:
+ *
+- * ARMv8-a, AArch64
++ * ARMv8-a, AArch64, unaligned accesses.
+ */
+
+ /* Parameters and result. */
+ #define src1 x0
+ #define src2 x1
+ #define limit x2
+-#define result x0
++#define result w0
+
+ /* Internal variables. */
+ #define data1 x3
+ #define data1w w3
+ #define data2 x4
+ #define data2w w4
+-#define has_nul x5
+-#define diff x6
+-#define endloop x7
+-#define tmp1 x8
+-#define tmp2 x9
+-#define tmp3 x10
+-#define pos x11
+-#define limit_wd x12
+-#define mask x13
++#define tmp1 x5
+
+ ENTRY_ALIGN (memcmp, 6)
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
+- cbz limit, L(ret0)
+- eor tmp1, src1, src2
+- tst tmp1, #7
+- b.ne L(misaligned8)
+- ands tmp1, src1, #7
+- b.ne L(mutual_align)
+- add limit_wd, limit, #7
+- lsr limit_wd, limit_wd, #3
+- /* Start of performance-critical section -- one 64B cache line. */
+-L(loop_aligned):
+- ldr data1, [src1], #8
+- ldr data2, [src2], #8
+-L(start_realigned):
+- subs limit_wd, limit_wd, #1
+- eor diff, data1, data2 /* Non-zero if differences found. */
+- csinv endloop, diff, xzr, ne /* Last Dword or differences. */
+- cbz endloop, L(loop_aligned)
+- /* End of performance-critical section -- one 64B cache line. */
+-
+- /* Not reached the limit, must have found a diff. */
+- cbnz limit_wd, L(not_limit)
+-
+- /* Limit % 8 == 0 => all bytes significant. */
+- ands limit, limit, #7
+- b.eq L(not_limit)
+-
+- lsl limit, limit, #3 /* Bits -> bytes. */
+- mov mask, #~0
+-#ifdef __AARCH64EB__
+- lsr mask, mask, limit
+-#else
+- lsl mask, mask, limit
+-#endif
+- bic data1, data1, mask
+- bic data2, data2, mask
+-
+- orr diff, diff, mask
+-L(not_limit):
+
+-#ifndef __AARCH64EB__
+- rev diff, diff
++ subs limit, limit, 8
++ b.lo .Lless8
++
++ /* Limit >= 8, so check first 8 bytes using unaligned loads. */
++ ldr data1, [src1], 8
++ ldr data2, [src2], 8
++ and tmp1, src1, 7
++ add limit, limit, tmp1
++ cmp data1, data2
++ bne .Lreturn
++
++ /* Align src1 and adjust src2 with bytes not yet done. */
++ sub src1, src1, tmp1
++ sub src2, src2, tmp1
++
++ subs limit, limit, 8
++ b.ls .Llast_bytes
++
++ /* Loop performing 8 bytes per iteration using aligned src1.
++ Limit is pre-decremented by 8 and must be larger than zero.
++ Exit if <= 8 bytes left to do or if the data is not equal. */
++ .p2align 4
++.Lloop8:
++ ldr data1, [src1], 8
++ ldr data2, [src2], 8
++ subs limit, limit, 8
++ ccmp data1, data2, 0, hi /* NZCV = 0b0000. */
++ b.eq .Lloop8
++
++ cmp data1, data2
++ bne .Lreturn
++
++ /* Compare last 1-8 bytes using unaligned access. */
++.Llast_bytes:
++ ldr data1, [src1, limit]
++ ldr data2, [src2, limit]
++
++ /* Compare data bytes and set return value to 0, -1 or 1. */
++.Lreturn:
++#ifndef __AARCH64EB__
+ rev data1, data1
+ rev data2, data2
+ #endif
+- /* The MS-non-zero bit of DIFF marks either the first bit
+- that is different, or the end of the significant data.
+- Shifting left now will bring the critical information into the
+- top bits. */
+- clz pos, diff
+- lsl data1, data1, pos
+- lsl data2, data2, pos
+- /* But we need to zero-extend (char is unsigned) the value and then
+- perform a signed 32-bit subtraction. */
+- lsr data1, data1, #56
+- sub result, data1, data2, lsr #56
+- RET
+-
+-L(mutual_align):
+- /* Sources are mutually aligned, but are not currently at an
+- alignment boundary. Round down the addresses and then mask off
+- the bytes that precede the start point. */
+- bic src1, src1, #7
+- bic src2, src2, #7
+- add limit, limit, tmp1 /* Adjust the limit for the extra. */
+- lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
+- ldr data1, [src1], #8
+- neg tmp1, tmp1 /* Bits to alignment -64. */
+- ldr data2, [src2], #8
+- mov tmp2, #~0
+-#ifdef __AARCH64EB__
+- /* Big-endian. Early bytes are at MSB. */
+- lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+-#else
+- /* Little-endian. Early bytes are at LSB. */
+- lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+-#endif
+- add limit_wd, limit, #7
+- orr data1, data1, tmp2
+- orr data2, data2, tmp2
+- lsr limit_wd, limit_wd, #3
+- b L(start_realigned)
+-
+-L(ret0):
+- mov result, #0
+- RET
+-
+- .p2align 6
+-L(misaligned8):
+- sub limit, limit, #1
+-1:
+- /* Perhaps we can do better than this. */
+- ldrb data1w, [src1], #1
+- ldrb data2w, [src2], #1
+- subs limit, limit, #1
+- ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+- b.eq 1b
+- sub result, data1, data2
+- RET
++ cmp data1, data2
++.Lret_eq:
++ cset result, ne
++ cneg result, result, lo
++ ret
++
++ .p2align 4
+ /* Compare fewer than 8 bytes (0..7 remain). Limit is [-8..-1]. */
++.Lless8:
++ adds limit, limit, 4
++ b.lo .Lless4
++ ldr data1w, [src1], 4
++ ldr data2w, [src2], 4
++ cmp data1w, data2w
++ b.ne .Lreturn
++ sub limit, limit, 4
++.Lless4:
++ adds limit, limit, 4
++ beq .Lret_eq
++.Lbyte_loop:
++ ldrb data1w, [src1], 1
++ ldrb data2w, [src2], 1
++ subs limit, limit, 1
++ ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
++ b.eq .Lbyte_loop
++ sub result, data1w, data2w
++ ret
++
+ END (memcmp)
+ #undef bcmp
+ weak_alias (memcmp, bcmp)
diff --git a/debian/patches/series b/debian/patches/series
index 1548798..060efe9 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -43,6 +43,7 @@ arm/local-soname-hack.diff
arm/local-vfp-sysdeps.diff
arm/unsubmitted-ldso-multilib.diff
arm/local-arm-futex.diff
+arm/git-arm64-memcmp.diff
hppa/local-inlining.diff
hppa/local-elf-make-cflags.diff
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-glibc/glibc.git
Reply to: