r6028 - in glibc-package/trunk/debian: . patches patches/arm
Author: adconrad
Date: 2014-04-28 02:42:40 +0000 (Mon, 28 Apr 2014)
New Revision: 6028
Added:
glibc-package/trunk/debian/patches/arm/cvs-arm-always-blx.diff
glibc-package/trunk/debian/patches/arm/cvs-memcpy-align.diff
Modified:
glibc-package/trunk/debian/changelog
glibc-package/trunk/debian/patches/series
Log:
debian/patches/arm/cvs-{memcpy-align.diff,arm-always-blx.diff}: Backport
ifunc memcpy routines from glibc 2.19 to fix alignment issues and computed-jump
calculations for ARM_ALWAYS_BX; this should fix memcpy on our ArmadaXP buildds.
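For context: "ifunc memcpy routines" refers to GNU indirect functions,
glibc's mechanism for letting the dynamic linker pick a memcpy variant
suited to the running CPU at symbol-binding time. A minimal C sketch of
the mechanism follows; it is illustrative only, not glibc code, and the
names (my_memcpy, resolve_memcpy, cpu_has_neon) are hypothetical.

#include <stddef.h>

/* Two candidate implementations; glibc's real ones are tuned assembly
   variants built from the memcpy_impl.S patched below.  */
static void *
memcpy_bytes (void *dst, const void *src, size_t n)
{
  char *d = dst;
  const char *s = src;
  while (n--)
    *d++ = *s++;
  return dst;
}

static void *
memcpy_fast (void *dst, const void *src, size_t n)
{
  /* Stand-in for a NEON/VFP-optimised version.  */
  return memcpy_bytes (dst, src, n);
}

/* Hypothetical feature probe; real code would check AT_HWCAP.  */
static int
cpu_has_neon (void)
{
  return 0;
}

/* The resolver runs once, when the dynamic linker binds the symbol;
   every later call jumps straight to the implementation it chose.  */
static void *
resolve_memcpy (void)
{
  return cpu_has_neon () ? (void *) memcpy_fast : (void *) memcpy_bytes;
}

void *my_memcpy (void *dst, const void *src, size_t n)
  __attribute__ ((ifunc ("resolve_memcpy")));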
Modified: glibc-package/trunk/debian/changelog
===================================================================
--- glibc-package/trunk/debian/changelog 2014-04-27 20:47:05 UTC (rev 6027)
+++ glibc-package/trunk/debian/changelog 2014-04-28 02:42:40 UTC (rev 6028)
@@ -25,6 +25,11 @@
* kfreebsd/local-sysdeps.diff: update to revision 5460 (from glibc-bsd).
* kfreebsd/local-fbtl.diff: likewise
+ [ Adam Conrad ]
+ * debian/patches/arm/cvs-{memcpy-align.diff,arm-always-blx.diff}: Backport
+ ifunc memcpy routines from glibc 2.19 to fix alignment issues and computed-jump
+ calculations for ARM_ALWAYS_BX; this should fix memcpy on our ArmadaXP buildds.
+
-- Aurelien Jarno <aurel32@debian.org> Sun, 02 Mar 2014 16:19:49 +0100
eglibc (2.18-4) unstable; urgency=high
Added: glibc-package/trunk/debian/patches/arm/cvs-arm-always-blx.diff
===================================================================
--- glibc-package/trunk/debian/patches/arm/cvs-arm-always-blx.diff (rev 0)
+++ glibc-package/trunk/debian/patches/arm/cvs-arm-always-blx.diff 2014-04-28 02:42:40 UTC (rev 6028)
@@ -0,0 +1,74 @@
+commit 068dcfd6758b2f50445d40cfe9d10e4284bd0635
+Author: Roland McGrath <roland@hack.frob.com>
+Date: Fri Nov 22 11:39:20 2013 -0800
+
+ ARM: Fix memcpy computed-jump calculations for ARM_ALWAYS_BX case.
+
+2013-11-22 Roland McGrath <roland@hack.frob.com>
+
+ * sysdeps/arm/armv7/multiarch/memcpy_impl.S
+ [ARM_ALWAYS_BX] (dispatch_helper): Fix PC computation to properly
+ account for instructions after the reference to PC given that 'bx'
+ might actually be expanded to multiple instructions.
+ * sysdeps/arm/arm-features.h (ARM_BX_NINSNS): Macro removed.
+
+diff --git a/ports/sysdeps/arm/arm-features.h b/ports/sysdeps/arm/arm-features.h
+index 1d4b0f1..336b690 100644
+--- a/ports/sysdeps/arm/arm-features.h
++++ b/ports/sysdeps/arm/arm-features.h
+@@ -53,14 +53,6 @@
+ # define ARM_BX_ALIGN_LOG2 2
+ #endif
+
+-/* The number of instructions that 'bx' expands to. A more-specific
+- arm-features.h that defines 'bx' as a macro should define this to the
+- number instructions it expands to. This is used only in a context
+- where the 'bx' expansion won't cross an ARM_BX_ALIGN_LOG2 boundary. */
+-#ifndef ARM_BX_NINSNS
+-# define ARM_BX_NINSNS 1
+-#endif
+-
+ /* An OS-specific arm-features.h file may define ARM_NO_INDEX_REGISTER to
+ indicate that the two-register addressing modes must never be used. */
+
+diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S b/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
+index ad43a3d..44cecb0 100644
+--- a/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
++++ b/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
+@@ -127,25 +127,26 @@
+ .purgem dispatch_step
+ .endm
+ #else
+-# if ARM_BX_ALIGN_LOG2 < 4
++# if ARM_BX_ALIGN_LOG2 < 3
+ # error case not handled
+ # endif
+ .macro dispatch_helper steps, log2_bytes_per_step
+- .p2align ARM_BX_ALIGN_LOG2
+ /* TMP1 gets (max_bytes - bytes_to_copy), where max_bytes is
+ (STEPS << LOG2_BYTES_PER_STEP).
+- So this is (steps_to_skip << LOG2_BYTES_PER_STEP). */
+- rsb tmp1, tmp1, #(\steps << \log2_bytes_per_step)
+- /* Pad so that the add;bx pair immediately precedes an alignment
+- boundary. Hence, TMP1=0 will run all the steps. */
+- .rept (1 << (ARM_BX_ALIGN_LOG2 - 2)) - (2 + ARM_BX_NINSNS)
+- nop
+- .endr
++ So this is (steps_to_skip << LOG2_BYTES_PER_STEP).
++ Then it needs further adjustment to compensate for the
++ distance between the PC value taken below (0f + PC_OFS)
++ and the first step's instructions (1f). */
++ rsb tmp1, tmp1, #((\steps << \log2_bytes_per_step) \
++ + ((1f - PC_OFS - 0f) \
++ >> (ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)))
+ /* Shifting down LOG2_BYTES_PER_STEP gives us the number of
+ steps to skip, then shifting up ARM_BX_ALIGN_LOG2 gives us
+ the (byte) distance to add to the PC. */
+- add tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)
++0: add tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)
+ bx tmp1
++ .p2align ARM_BX_ALIGN_LOG2
++1:
+ .endm
+
+ .macro dispatch_7_dword
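The subtlety behind this patch: in ARM state, reading the pc register
yields the address of the current instruction plus 8 (glibc's PC_OFS;
4 in Thumb state). The old code padded with nops so that the add/bx
pair sat immediately before the jump table, but when ARM_ALWAYS_BX
expands 'bx' to more than one instruction, that fixed padding count is
wrong. The fix instead measures the real gap (1f - PC_OFS - 0f) at
assembly time and folds it, pre-scaled, into the rsb. A worked example
of the arithmetic with made-up addresses, written as a C sketch rather
than glibc code:

#include <stdio.h>

int
main (void)
{
  /* Assumed example values: 7 copy steps of 8 bytes each, 16-byte
     jump-table slots (ARM_BX_ALIGN_LOG2 == 4), ARM-state PC_OFS.  */
  enum { STEPS = 7, LOG2_BYTES_PER_STEP = 3, ALIGN_LOG2 = 4, PC_OFS = 8 };
  unsigned addr_0 = 0x1000;    /* the 'add tmp1, pc, ...' (label 0)  */
  unsigned addr_1 = 0x1010;    /* first table slot after .p2align (label 1)  */
  unsigned bytes_to_copy = 24; /* we want to run only the last 3 steps  */

  /* What the patched 'rsb' computes: bytes to skip, plus the gap from
     the PC value read at label 0 to the table at label 1, pre-shifted
     down so the left shift below restores it.  (The gap must be a
     multiple of 1 << (ALIGN_LOG2 - LOG2_BYTES_PER_STEP); here it is.)  */
  unsigned tmp1 = (STEPS << LOG2_BYTES_PER_STEP) - bytes_to_copy
                  + ((addr_1 - PC_OFS - addr_0)
                     >> (ALIGN_LOG2 - LOG2_BYTES_PER_STEP));

  /* What 'add tmp1, pc, tmp1, lsl #...' then produces.  */
  unsigned target = (addr_0 + PC_OFS)
                    + (tmp1 << (ALIGN_LOG2 - LOG2_BYTES_PER_STEP));

  /* Expect slot (56 - 24) / 8 = 4, i.e. skip 4 steps and run 3.  */
  printf ("target = 0x%x (table slot %u)\n", target,
          (target - addr_1) >> ALIGN_LOG2);
  return 0;
}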
Added: glibc-package/trunk/debian/patches/arm/cvs-memcpy-align.diff
===================================================================
--- glibc-package/trunk/debian/patches/arm/cvs-memcpy-align.diff (rev 0)
+++ glibc-package/trunk/debian/patches/arm/cvs-memcpy-align.diff 2014-04-28 02:42:40 UTC (rev 6028)
@@ -0,0 +1,67 @@
+commit cd90698b541046c22544c2c057a4676368fd1d7f
+Author: Will Newton <will.newton@linaro.org>
+Date: Wed Aug 7 14:15:52 2013 +0100
+
+ ARM: Improve armv7 memcpy performance.
+
+ Only enter the aligned copy loop with buffers that can be 8-byte
+ aligned. This improves performance slightly on Cortex-A9 and
+ Cortex-A15 cores for large copies with buffers that are 4-byte
+ aligned but not 8-byte aligned.
+
+ ports/ChangeLog.arm:
+
+ 2013-09-16 Will Newton <will.newton@linaro.org>
+
+ * sysdeps/arm/armv7/multiarch/memcpy_impl.S: Tighten check
+ on entry to aligned copy loop to improve performance.
+
+2013-09-16 Will Newton <will.newton@linaro.org>
+
+ * sysdeps/arm/armv7/multiarch/memcpy_impl.S: Tighten check
+ on entry to aligned copy loop to improve performance.
+
+diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S b/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
+index 3decad6..ad43a3d 100644
+--- a/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
++++ b/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
+@@ -24,7 +24,6 @@
+ ARMv6 (ARMv7-a if using Neon)
+ ARM state
+ Unaligned accesses
+- LDRD/STRD support unaligned word accesses
+
+ */
+
+@@ -369,8 +368,8 @@ ENTRY(memcpy)
+ cfi_adjust_cfa_offset (FRAME_SIZE)
+ cfi_rel_offset (tmp2, 0)
+ cfi_remember_state
+- and tmp2, src, #3
+- and tmp1, dst, #3
++ and tmp2, src, #7
++ and tmp1, dst, #7
+ cmp tmp1, tmp2
+ bne .Lcpy_notaligned
+
+@@ -381,9 +380,9 @@ ENTRY(memcpy)
+ vmov.f32 s0, s0
+ #endif
+
+- /* SRC and DST have the same mutual 32-bit alignment, but we may
++ /* SRC and DST have the same mutual 64-bit alignment, but we may
+ still need to pre-copy some bytes to get to natural alignment.
+- We bring DST into full 64-bit alignment. */
++ We bring SRC and DST into full 64-bit alignment. */
+ lsls tmp2, dst, #29
+ beq 1f
+ rsbs tmp2, tmp2, #0
+@@ -515,7 +514,7 @@ ENTRY(memcpy)
+
+ .Ltail63aligned: /* Count in tmp2. */
+ /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
+- we know that the src and dest are 32-bit aligned so we can use
++ we know that the src and dest are 64-bit aligned so we can use
+ LDRD/STRD to improve efficiency. */
+ /* TMP2 is now negative, but we don't care about that. The bottom
+ six bits still tell us how many bytes are left to copy. */
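In short, this patch tightens the entry test from mutual 4-byte
alignment (mask #3) to mutual 8-byte alignment (mask #7): only when
src and dst leave the same remainder mod 8 can a single pre-copy bring
both pointers to full 64-bit alignment, which is what the LDRD/STRD
loop wants. A C sketch of the check, illustrative only and not glibc
code:

#include <stdint.h>

/* Nonzero when one pre-copy can bring both pointers to full 8-byte
   alignment, so LDRD/STRD then run on naturally aligned doublewords.
   With the old mask of 3, buffers that were 4-byte but not 8-byte
   aligned also took the aligned path; that still worked on ARMv7,
   just more slowly on Cortex-A9 and Cortex-A15.  */
static int
same_dword_alignment (const void *dst, const void *src)
{
  return ((uintptr_t) dst & 7) == ((uintptr_t) src & 7);
}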
Modified: glibc-package/trunk/debian/patches/series
===================================================================
--- glibc-package/trunk/debian/patches/series 2014-04-27 20:47:05 UTC (rev 6027)
+++ glibc-package/trunk/debian/patches/series 2014-04-28 02:42:40 UTC (rev 6028)
@@ -60,6 +60,8 @@
arm/cvs-arm__longjmp-thumb.diff
arm/cvs-arm__sigsetjmp-thumb.diff
arm/cvs-arm-pointer-mangle-frame.diff
+arm/cvs-arm-always-blx.diff
+arm/cvs-memcpy-align.diff
arm64/cvs-arm64-sigcontext.diff
arm64/cvs-arm64-relocs.diff