
r6028 - in glibc-package/trunk/debian: . patches patches/arm



Author: adconrad
Date: 2014-04-28 02:42:40 +0000 (Mon, 28 Apr 2014)
New Revision: 6028

Added:
   glibc-package/trunk/debian/patches/arm/cvs-arm-always-blx.diff
   glibc-package/trunk/debian/patches/arm/cvs-memcpy-align.diff
Modified:
   glibc-package/trunk/debian/changelog
   glibc-package/trunk/debian/patches/series
Log:
debian/patches/arm/cvs-{memcpy-align,arm-always-blx}.diff: Backport the
ifunc memcpy routines from 2.19 to fix alignment issues and the computed-jump
calculations for ARM_ALWAYS_BX; this should fix memcpy on our ArmadaXP buildds.
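
For context, "ifunc" refers to GNU indirect functions: the exported memcpy
symbol is bound once, at dynamic-link time, to whichever implementation a
resolver picks for the running CPU.  What follows is only a minimal sketch of
that mechanism in C; my_memcpy, my_memcpy_neon, my_memcpy_generic and the
cpu_has_neon() check are hypothetical placeholders, not the glibc routines
being backported here.

#include <stddef.h>

typedef void *(*memcpy_fn) (void *, const void *, size_t);

/* Two stand-in implementations the resolver can choose between.  */
static void *my_memcpy_generic (void *dst, const void *src, size_t n)
{
  char *d = dst;
  const char *s = src;
  while (n--)
    *d++ = *s++;
  return dst;
}

static void *my_memcpy_neon (void *dst, const void *src, size_t n)
{
  /* Stand-in for a NEON-optimized copy.  */
  return my_memcpy_generic (dst, src, n);
}

/* Placeholder for a real hardware-capability (HWCAP) check.  */
static int cpu_has_neon (void) { return 0; }

/* The resolver runs once, when the symbol is first bound.  */
static memcpy_fn resolve_my_memcpy (void)
{
  return cpu_has_neon () ? my_memcpy_neon : my_memcpy_generic;
}

/* Callers simply call my_memcpy(); the dynamic linker patches the
   entry to whatever the resolver returned.  */
void *my_memcpy (void *dst, const void *src, size_t n)
  __attribute__ ((ifunc ("resolve_my_memcpy")));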

Modified: glibc-package/trunk/debian/changelog
===================================================================
--- glibc-package/trunk/debian/changelog	2014-04-27 20:47:05 UTC (rev 6027)
+++ glibc-package/trunk/debian/changelog	2014-04-28 02:42:40 UTC (rev 6028)
@@ -25,6 +25,11 @@
   * kfreebsd/local-sysdeps.diff: update to revision 5460 (from glibc-bsd).
   * kfreebsd/local-fbtl.diff: likewise
 
+  [ Adam Conrad ]
+  * debian/patches/arm/cvs-{memcpy-align,arm-always-blx}.diff: Backport the
+    ifunc memcpy routines from 2.19 to fix alignment issues and computed-jump
+    calculations for ARM_ALWAYS_BX; this should fix memcpy on our ArmadaXP buildds.
+
  -- Aurelien Jarno <aurel32@debian.org>  Sun, 02 Mar 2014 16:19:49 +0100
 
 eglibc (2.18-4) unstable; urgency=high

Added: glibc-package/trunk/debian/patches/arm/cvs-arm-always-blx.diff
===================================================================
--- glibc-package/trunk/debian/patches/arm/cvs-arm-always-blx.diff	                        (rev 0)
+++ glibc-package/trunk/debian/patches/arm/cvs-arm-always-blx.diff	2014-04-28 02:42:40 UTC (rev 6028)
@@ -0,0 +1,74 @@
+commit 068dcfd6758b2f50445d40cfe9d10e4284bd0635
+Author: Roland McGrath <roland@hack.frob.com>
+Date:   Fri Nov 22 11:39:20 2013 -0800
+
+    ARM: Fix memcpy computed-jump calculations for ARM_ALWAYS_BX case.
+
+2013-11-22  Roland McGrath  <roland@hack.frob.com>
+
+	* sysdeps/arm/armv7/multiarch/memcpy_impl.S
+	[ARM_ALWAYS_BX] (dispatch_helper): Fix PC computation to properly
+	account for instructions after the reference to PC given that 'bx'
+	might actually be expanded to multiple instructions.
+	* sysdeps/arm/arm-features.h (ARM_BX_NINSNS): Macro removed.
+
+diff --git a/ports/sysdeps/arm/arm-features.h b/ports/sysdeps/arm/arm-features.h
+index 1d4b0f1..336b690 100644
+--- a/ports/sysdeps/arm/arm-features.h
++++ b/ports/sysdeps/arm/arm-features.h
+@@ -53,14 +53,6 @@
+ # define ARM_BX_ALIGN_LOG2	2
+ #endif
+ 
+-/* The number of instructions that 'bx' expands to.  A more-specific
+-   arm-features.h that defines 'bx' as a macro should define this to the
+-   number instructions it expands to.  This is used only in a context
+-   where the 'bx' expansion won't cross an ARM_BX_ALIGN_LOG2 boundary.  */
+-#ifndef ARM_BX_NINSNS
+-# define ARM_BX_NINSNS		1
+-#endif
+-
+ /* An OS-specific arm-features.h file may define ARM_NO_INDEX_REGISTER to
+    indicate that the two-register addressing modes must never be used.  */
+ 
+diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S b/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
+index ad43a3d..44cecb0 100644
+--- a/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
++++ b/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
+@@ -127,25 +127,26 @@
+ 	.purgem dispatch_step
+ 	.endm
+ #else
+-# if ARM_BX_ALIGN_LOG2 < 4
++# if ARM_BX_ALIGN_LOG2 < 3
+ #  error case not handled
+ # endif
+ 	.macro dispatch_helper steps, log2_bytes_per_step
+-	.p2align ARM_BX_ALIGN_LOG2
+ 	/* TMP1 gets (max_bytes - bytes_to_copy), where max_bytes is
+ 	   (STEPS << LOG2_BYTES_PER_STEP).
+-	   So this is (steps_to_skip << LOG2_BYTES_PER_STEP).  */
+-	rsb	tmp1, tmp1, #(\steps << \log2_bytes_per_step)
+-	/* Pad so that the add;bx pair immediately precedes an alignment
+-	   boundary.  Hence, TMP1=0 will run all the steps.  */
+-	.rept (1 << (ARM_BX_ALIGN_LOG2 - 2)) - (2 + ARM_BX_NINSNS)
+-	nop
+-	.endr
++	   So this is (steps_to_skip << LOG2_BYTES_PER_STEP).
++	   Then it needs further adjustment to compensate for the
++	   distance between the PC value taken below (0f + PC_OFS)
++	   and the first step's instructions (1f).  */
++	rsb	tmp1, tmp1, #((\steps << \log2_bytes_per_step) \
++			      + ((1f - PC_OFS - 0f) \
++				 >> (ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)))
+ 	/* Shifting down LOG2_BYTES_PER_STEP gives us the number of
+ 	   steps to skip, then shifting up ARM_BX_ALIGN_LOG2 gives us
+ 	   the (byte) distance to add to the PC.  */
+-	add	tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)
++0:	add	tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)
+ 	bx	tmp1
++	.p2align ARM_BX_ALIGN_LOG2
++1:
+ 	.endm
+ 
+ 	.macro dispatch_7_dword
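
As an aside on what dispatch_helper does: tmp1 ends up holding the number of
unrolled copy steps to skip, which the lsl in the add scales to the fixed size
each step occupies (1 << ARM_BX_ALIGN_LOG2 bytes) before it is added to the PC,
so execution lands on exactly the steps still needed.  The hunk above folds a
correction into the rsb constant so that PC-relative distance stays right when
'bx' expands to more than one instruction (the ARM_ALWAYS_BX case).  A rough C
analogue of the computed-jump idea (a switch falling through an unrolled tail
copy) is sketched below; it is only an illustration, not the glibc code, and
copy_tail_dwords is a made-up name.

#include <stdint.h>

/* Copy the last n (0..7) remaining 8-byte words of a tail.  Falling
   through the cases plays the same role as the assembly's computed
   jump into an unrolled sequence of fixed-size steps.  */
static void copy_tail_dwords (uint64_t *dst, const uint64_t *src, unsigned n)
{
  switch (n)
    {
    case 7: dst[6] = src[6]; /* fall through */
    case 6: dst[5] = src[5]; /* fall through */
    case 5: dst[4] = src[4]; /* fall through */
    case 4: dst[3] = src[3]; /* fall through */
    case 3: dst[2] = src[2]; /* fall through */
    case 2: dst[1] = src[1]; /* fall through */
    case 1: dst[0] = src[0]; /* fall through */
    case 0: break;
    }
}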

Added: glibc-package/trunk/debian/patches/arm/cvs-memcpy-align.diff
===================================================================
--- glibc-package/trunk/debian/patches/arm/cvs-memcpy-align.diff	                        (rev 0)
+++ glibc-package/trunk/debian/patches/arm/cvs-memcpy-align.diff	2014-04-28 02:42:40 UTC (rev 6028)
@@ -0,0 +1,67 @@
+commit cd90698b541046c22544c2c057a4676368fd1d7f
+Author: Will Newton <will.newton@linaro.org>
+Date:   Wed Aug 7 14:15:52 2013 +0100
+
+    ARM: Improve armv7 memcpy performance.
+    
+    Only enter the aligned copy loop with buffers that can be 8-byte
+    aligned. This improves performance slightly on Cortex-A9 and
+    Cortex-A15 cores for large copies with buffers that are 4-byte
+    aligned but not 8-byte aligned.
+    
+    ports/ChangeLog.arm:
+    
+    2013-09-16  Will Newton  <will.newton@linaro.org>
+    
+    	* sysdeps/arm/armv7/multiarch/memcpy_impl.S: Tighten check
+    	on entry to aligned copy loop to improve performance.
+
+2013-09-16  Will Newton  <will.newton@linaro.org>
+
+	* sysdeps/arm/armv7/multiarch/memcpy_impl.S: Tighten check
+	on entry to aligned copy loop to improve performance.
+
+diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S b/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
+index 3decad6..ad43a3d 100644
+--- a/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
++++ b/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
+@@ -24,7 +24,6 @@
+     ARMv6 (ARMv7-a if using Neon)
+     ARM state
+     Unaligned accesses
+-    LDRD/STRD support unaligned word accesses
+ 
+  */
+ 
+@@ -369,8 +368,8 @@ ENTRY(memcpy)
+ 	cfi_adjust_cfa_offset (FRAME_SIZE)
+ 	cfi_rel_offset (tmp2, 0)
+ 	cfi_remember_state
+-	and	tmp2, src, #3
+-	and	tmp1, dst, #3
++	and	tmp2, src, #7
++	and	tmp1, dst, #7
+ 	cmp	tmp1, tmp2
+ 	bne	.Lcpy_notaligned
+ 
+@@ -381,9 +380,9 @@ ENTRY(memcpy)
+ 	vmov.f32	s0, s0
+ #endif
+ 
+-	/* SRC and DST have the same mutual 32-bit alignment, but we may
++	/* SRC and DST have the same mutual 64-bit alignment, but we may
+ 	   still need to pre-copy some bytes to get to natural alignment.
+-	   We bring DST into full 64-bit alignment.  */
++	   We bring SRC and DST into full 64-bit alignment.  */
+ 	lsls	tmp2, dst, #29
+ 	beq	1f
+ 	rsbs	tmp2, tmp2, #0
+@@ -515,7 +514,7 @@ ENTRY(memcpy)
+ 
+ .Ltail63aligned:			/* Count in tmp2.  */
+ 	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
+-	   we know that the src and dest are 32-bit aligned so we can use
++	   we know that the src and dest are 64-bit aligned so we can use
+ 	   LDRD/STRD to improve efficiency.  */
+ 	/* TMP2 is now negative, but we don't care about that.  The bottom
+ 	   six bits still tell us how many bytes are left to copy.  */
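
In other words, the patch above tightens the entry test so the LDRD/STRD
"mutually aligned" path is only taken when source and destination share the
same offset within an 8-byte word, since only then can a short prefix copy
bring both pointers to full 64-bit alignment.  A C restatement of that check
follows (an illustration only, not the glibc assembly; same_8byte_phase is a
made-up name):

#include <stdint.h>

/* Old behaviour: compare the low 2 bits (same 32-bit phase).
   New behaviour: compare the low 3 bits (same 64-bit phase).  */
static int same_8byte_phase (const void *dst, const void *src)
{
  uintptr_t d = (uintptr_t) dst;
  uintptr_t s = (uintptr_t) src;
  return (d & 7) == (s & 7);
}

/* When the phases match, copying ((8 - (d & 7)) & 7) leading bytes
   leaves both pointers 8-byte aligned, after which the bulk of the
   copy can use LDRD/STRD pairs.  */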

Modified: glibc-package/trunk/debian/patches/series
===================================================================
--- glibc-package/trunk/debian/patches/series	2014-04-27 20:47:05 UTC (rev 6027)
+++ glibc-package/trunk/debian/patches/series	2014-04-28 02:42:40 UTC (rev 6028)
@@ -60,6 +60,8 @@
 arm/cvs-arm__longjmp-thumb.diff
 arm/cvs-arm__sigsetjmp-thumb.diff
 arm/cvs-arm-pointer-mangle-frame.diff
+arm/cvs-arm-always-blx.diff
+arm/cvs-memcpy-align.diff
 
 arm64/cvs-arm64-sigcontext.diff
 arm64/cvs-arm64-relocs.diff

