[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Bug#699462: unblock (pre-approval): openblas/0.1.1-7



Package: release.debian.org
Severity: normal
User: release.debian.org@packages.debian.org
Usertags: unblock
X-Debbugs-CC: sylvestre@debian.org

Dear Release Team,

I would like to make the attached changes to the openblas package. The debdiff
fixes several important bugs, related to crashes or wrong results in widely
used functions (dot product and matrix-vector product). It also fixes support
for 32-bit Athlon CPUs.

I consider that this patch complies with the freeze policy, but since it is
fairly large and difficult to review (x86 assembly), I would prefer to ask for
pre-approval before uploading.

Regards,

-- 
 .''`.    Sébastien Villemot
: :' :    Debian Developer
`. `'     http://www.dynare.org/sebastien
  `-      GPG Key: 4096R/381A7594
diff -Nru openblas-0.1.1/debian/changelog openblas-0.1.1/debian/changelog
--- openblas-0.1.1/debian/changelog	2012-08-11 17:50:30.000000000 +0200
+++ openblas-0.1.1/debian/changelog	2013-01-31 15:21:20.000000000 +0100
@@ -1,3 +1,16 @@
+openblas (0.1.1-7) UNRELEASED; urgency=low
+
+  * sgemv_uninitialized_buffer.diff: new patch taken from upstream, ensures that
+    vectorized sgemv does not use uninitialized data (Closes: #696000)
+  * dot_uninitialized_buffer.diff: new patch taken from upstream, ensures that
+    vectorized dot does not use uninitialized data
+  * gemv_crash_big_data.diff: new patch taken from upstream, fixes crashes of
+    gemv on big input data (Closes: #697231)
+  * 32bit_athlon.diff: new patch taken from upstream, fixes crashes on 32-bit
+    Athlon CPUs (Closes: #697233)
+
+ -- Sébastien Villemot <sebastien@debian.org>  Sat, 05 Jan 2013 14:13:23 +0100
+
 openblas (0.1.1-6) unstable; urgency=low
 
   * kill_threads_at_unload.diff: new patch, taken upstream (Closes: #673061)
diff -Nru openblas-0.1.1/debian/patches/32bit_athlon.diff openblas-0.1.1/debian/patches/32bit_athlon.diff
--- openblas-0.1.1/debian/patches/32bit_athlon.diff	1970-01-01 01:00:00.000000000 +0100
+++ openblas-0.1.1/debian/patches/32bit_athlon.diff	2013-01-05 15:07:32.000000000 +0100
@@ -0,0 +1,21 @@
+Description: Fix crash on 32-bit Athlon CPU
+Origin: upstream, https://github.com/xianyi/OpenBLAS/commit/9fb341a9f8d94e4d532d51b1216d92e74a67a569
+Bug-Debian: http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=697233
+Last-Update: 2013-01-04
+---
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
+--- a/kernel/setparam-ref.c
++++ b/kernel/setparam-ref.c
+@@ -634,10 +634,10 @@ static void init_parameter(void) {
+   TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q;
+ #endif
+ 
+-#if defined(CORE_KATMAI)  || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH)
++#if defined(CORE_KATMAI)  || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) || defined(CORE_ATHLON)
+ 
+ #ifdef DEBUG
+-  fprintf(stderr, "Katmai, Coppermine, Banias\n");
++  fprintf(stderr, "Katmai, Coppermine, Banias, Athlon\n");
+ #endif
+ 
+   TABLE_NAME.sgemm_p =  64 * (l2 >> 7);
diff -Nru openblas-0.1.1/debian/patches/dot_uninitialized_buffer.diff openblas-0.1.1/debian/patches/dot_uninitialized_buffer.diff
--- openblas-0.1.1/debian/patches/dot_uninitialized_buffer.diff	1970-01-01 01:00:00.000000000 +0100
+++ openblas-0.1.1/debian/patches/dot_uninitialized_buffer.diff	2013-01-31 15:19:23.000000000 +0100
@@ -0,0 +1,75 @@
+Description: Ensure that vectorized dot product does not use uninitialized data
+Origin: upstream,
+        https://github.com/xianyi/OpenBLAS/commit/d311236dfdefa41f31a2e7fefa548abf47f0461c
+Bug: https://github.com/xianyi/OpenBLAS/issues/189
+Last-Update: 2013-01-31
+---
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
+--- a/kernel/x86_64/dot_sse.S
++++ b/kernel/x86_64/dot_sse.S
+@@ -530,7 +530,7 @@
+ #endif
+ 	movsd	-32 * SIZE(Y), %xmm8
+ 
+-	pshufd	$0x39, %xmm4,  %xmm5
++	pshufd	$0x29, %xmm4,  %xmm5
+ 
+ 	mulps	%xmm8,  %xmm5
+ 	addps	%xmm5,  %xmm3
+@@ -750,7 +750,8 @@
+ 	xorps	%xmm5, %xmm5
+ 	movhlps	%xmm4, %xmm5
+ 
+-	mulps	-32 * SIZE(Y), %xmm5
++	movlps  -32 * SIZE(Y), %xmm4
++	mulps	%xmm4, %xmm5
+ 	addps	%xmm5, %xmm0
+ 
+ 	addq	$2 * SIZE, X
+@@ -992,7 +993,7 @@
+ 	movsd	-32 * SIZE(Y), %xmm8
+ 
+ 	movss	%xmm5, %xmm4
+-	shufps	$0x93, %xmm5,  %xmm4
++	shufps	$0x93, %xmm4,  %xmm4
+ 
+ 	mulps	%xmm8,  %xmm4
+ 	addps	%xmm4,  %xmm3
+--- a/kernel/x86_64/zdot_sse.S
++++ b/kernel/x86_64/zdot_sse.S
+@@ -699,7 +699,7 @@
+ 	movsd	-32 * SIZE(X), %xmm4
+ 
+ 	pshufd	$0xb1,  %xmm4, %xmm12 
+-	shufps	$0x39,  %xmm8, %xmm8
++	shufps	$0x59,  %xmm8, %xmm8
+ 	mulps	%xmm8,  %xmm4
+ 	addps	%xmm4,  %xmm0
+ 	mulps	%xmm8,  %xmm12
+@@ -1336,7 +1336,7 @@
+ 
+ 	movss	%xmm9,  %xmm8
+ 	pshufd	$0xb1,  %xmm4, %xmm12 
+-	shufps	$0x93,  %xmm8, %xmm8
++	shufps	$0x03,  %xmm8, %xmm8
+ 	mulps	%xmm8,  %xmm4
+ 	addps	%xmm4,  %xmm0
+ 	mulps	%xmm8,  %xmm12
+@@ -1697,7 +1697,7 @@
+ 	movsd	-32 * SIZE(Y), %xmm4
+ 
+ 	pshufd	$0xb1,  %xmm4, %xmm12 
+-	shufps	$0x39,  %xmm8, %xmm8
++	shufps	$0xa9,  %xmm8, %xmm8
+ 	mulps	%xmm8,  %xmm4
+ 	addps	%xmm4,  %xmm0
+ 	mulps	%xmm8,  %xmm12
+@@ -2024,7 +2024,7 @@
+ 
+ 	movss	%xmm9,  %xmm8
+ 	pshufd	$0xb1,  %xmm4, %xmm12 
+-	shufps	$0x93,  %xmm8, %xmm8
++	shufps	$0x03,  %xmm8, %xmm8
+ 	mulps	%xmm8,  %xmm4
+ 	addps	%xmm4,  %xmm0
+ 	mulps	%xmm8,  %xmm12
diff -Nru openblas-0.1.1/debian/patches/gemv_crash_big_data.diff openblas-0.1.1/debian/patches/gemv_crash_big_data.diff
--- openblas-0.1.1/debian/patches/gemv_crash_big_data.diff	1970-01-01 01:00:00.000000000 +0100
+++ openblas-0.1.1/debian/patches/gemv_crash_big_data.diff	2013-01-31 15:14:36.000000000 +0100
@@ -0,0 +1,685 @@
+Description: Fix crashes of gemv on big input data
+Origin: upstream,
+        https://github.com/xianyi/OpenBLAS/commit/fd3046b32a1f7049fcb2bfb255d72e4204e5522e
+        https://github.com/xianyi/OpenBLAS/commit/0d1518add98bc3c0e83887be74cda3b23c8937ee
+        https://github.com/xianyi/OpenBLAS/commit/69200884e13e98b79487cfd1c78faf054278ec2f
+        https://github.com/xianyi/OpenBLAS/commit/5f0117385e1d4f986ad75fa66b873b014a7792c2
+        https://github.com/xianyi/OpenBLAS/commit/cea1a885b5cd38bea67feb6437ef0c3622a96c58
+        https://github.com/xianyi/OpenBLAS/commit/0b08f7479e26ce0ef8e076185bb89f16479335e9
+Bug: https://github.com/xianyi/OpenBLAS/issues/154
+     https://github.com/xianyi/OpenBLAS/issues/173
+Bug-Debian: http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=697231
+Last-Update: 2013-01-31
+---
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
+--- a/kernel/x86/gemv_t_sse.S
++++ b/kernel/x86/gemv_t_sse.S
+@@ -89,17 +89,24 @@
+ #endif
+ 
+ #define STACKSIZE	16
++#define ARGS	20
+ 
+-#define M		 4 + STACKSIZE(%esp)
+-#define N		 8 + STACKSIZE(%esp)
+-#define ALPHA		16 + STACKSIZE(%esp)
+-#define A		20 + STACKSIZE(%esp)
+-#define STACK_LDA	24 + STACKSIZE(%esp)
+-#define STACK_X		28 + STACKSIZE(%esp)
+-#define STACK_INCX	32 + STACKSIZE(%esp)
+-#define Y		36 + STACKSIZE(%esp)
+-#define STACK_INCY	40 + STACKSIZE(%esp)
+-#define BUFFER		44 + STACKSIZE(%esp)
++#define M		 4 + STACKSIZE+ARGS(%esp)
++#define N		 8 + STACKSIZE+ARGS(%esp)
++#define ALPHA		16 + STACKSIZE+ARGS(%esp)
++#define A		20 + STACKSIZE+ARGS(%esp)
++#define STACK_LDA	24 + STACKSIZE+ARGS(%esp)
++#define STACK_X		28 + STACKSIZE+ARGS(%esp)
++#define STACK_INCX	32 + STACKSIZE+ARGS(%esp)
++#define Y		36 + STACKSIZE+ARGS(%esp)
++#define STACK_INCY	40 + STACKSIZE+ARGS(%esp)
++#define BUFFER		44 + STACKSIZE+ARGS(%esp)
++
++#define MMM	0+STACKSIZE(%esp)
++#define NN	4+STACKSIZE(%esp)
++#define AA	8+STACKSIZE(%esp)
++#define LDAX	12+STACKSIZE(%esp)
++#define XX	16+STACKSIZE(%esp)
+ 	
+ #define I	%eax
+ #define J	%ebx
+@@ -114,6 +121,7 @@
+ 
+ 	PROLOGUE
+ 
++	subl	$ARGS,%esp
+ 	pushl	%ebp
+ 	pushl	%edi
+ 	pushl	%esi
+@@ -122,7 +130,42 @@
+ 	PROFCODE
+ 
+ 	movl	STACK_LDA,  LDA
++	movl	LDA,LDAX			# backup LDA
+ 	movl	STACK_X,    X
++	movl	X,XX
++	movl	N,J
++	movl	J,NN				# backup N
++	movl	A,J
++	movl	J,AA				# backup A
++    movl	M,J
++	movl	J,MMM				# mov M to MMM
++.L0t:
++	xorl	J,J
++	addl	$1,J
+	sall    $22,J                           # J=2^22 floats; 2^22*sizeof(float)=buffer size(16MB)
+	subl    $8, J                           # Don't use the last 8 floats in the buffer.
++	                                        # Now, split M by block J
++	subl	J,MMM				# MMM=MMM-J
++	movl	J,M		
++	jge		.L00t
++	ALIGN_4
++	
++	movl	MMM,%eax
++	addl	J,%eax
++	jle		.L999x
++	movl	%eax,M
++
++.L00t:
++	movl	AA,%eax
++	movl	%eax,A			 	# mov AA to A
++
++	movl	NN,%eax
++	movl	%eax,N				# reset N
++
++
++	movl	LDAX,  LDA			# reset LDA
++	movl	XX,X
++
+ 	movl	STACK_INCX, INCX
+ 	movl	STACK_INCY, INCY
+ 
+@@ -642,10 +685,22 @@
+ 	ALIGN_4
+  	
+ .L999:
++	movl	M,J
++	leal	(,J,SIZE),%eax
++	addl	%eax,AA
++	movl	XX,J
++	addl	%eax,J
++	movl	J,XX
++	jmp		.L0t
++	ALIGN_4
++
++.L999x:
+ 	popl	%ebx
+ 	popl	%esi
+ 	popl	%edi	
+ 	popl	%ebp
++
++	addl	$ARGS,%esp
+ 	ret
+ 
+ 	EPILOGUE
+--- a/kernel/x86/gemv_t_sse2.S
++++ b/kernel/x86/gemv_t_sse2.S
+@@ -76,18 +76,24 @@
+ #endif
+ 
+ #define STACKSIZE	16
++#define ARGS	16
++
++#define M		 4 + STACKSIZE+ARGS(%esp)
++#define N		 8 + STACKSIZE+ARGS(%esp)
++#define ALPHA		16 + STACKSIZE+ARGS(%esp)
++#define A		24 + STACKSIZE+ARGS(%esp)
++#define STACK_LDA	28 + STACKSIZE+ARGS(%esp)
++#define STACK_X		32 + STACKSIZE+ARGS(%esp)
++#define STACK_INCX	36 + STACKSIZE+ARGS(%esp)
++#define Y		40 + STACKSIZE+ARGS(%esp)
++#define STACK_INCY	44 + STACKSIZE+ARGS(%esp)
++#define BUFFER		48 + STACKSIZE+ARGS(%esp)
++
++#define MMM	0+STACKSIZE(%esp)
++#define AA	4+STACKSIZE(%esp)
++#define LDAX 8+STACKSIZE(%esp)
++#define NN	12+STACKSIZE(%esp)
+ 
+-#define M		 4 + STACKSIZE(%esp)
+-#define N		 8 + STACKSIZE(%esp)
+-#define ALPHA		16 + STACKSIZE(%esp)
+-#define A		24 + STACKSIZE(%esp)
+-#define STACK_LDA	28 + STACKSIZE(%esp)
+-#define STACK_X		32 + STACKSIZE(%esp)
+-#define STACK_INCX	36 + STACKSIZE(%esp)
+-#define Y		40 + STACKSIZE(%esp)
+-#define STACK_INCY	44 + STACKSIZE(%esp)
+-#define BUFFER		48 + STACKSIZE(%esp)
+-	
+ #define I	%eax
+ #define J	%ebx
+ 
+@@ -101,6 +107,8 @@
+ 
+ 	PROLOGUE
+ 
++	subl	$ARGS,%esp
++
+ 	pushl	%ebp
+ 	pushl	%edi
+ 	pushl	%esi
+@@ -108,7 +116,40 @@
+ 
+ 	PROFCODE
+ 
++
+ 	movl	STACK_LDA,  LDA
++	movl	LDA,LDAX			# backup LDA
++	movl	N,J
++	movl	J,NN				# backup N
++	movl	A,J
++	movl	J,AA				# backup A
++    movl	M,J
++	movl	J,MMM				# mov M to MMM
++.L0t:
++	xorl	J,J
++	addl	$1,J
+	sall    $21,J                           # J=2^21 doubles; 2^21*sizeof(double)=buffer size(16MB)
+	subl    $4, J                           # Don't use the last 4 doubles in the buffer.
++	                                        # Now, split M by block J
++	subl	J,MMM				# MMM=MMM-J
++	movl	J,M		
++	jge		.L00t
++	ALIGN_4
++	
++	movl	MMM,%eax
++	addl	J,%eax
++	jle		.L999x
++	movl	%eax,M
++
++.L00t:
++	movl	AA,%eax
++	movl	%eax,A			 	# mov AA to A
++
++	movl	NN,%eax
++	movl	%eax,N				# reset N
++
++
++	movl	LDAX,  LDA			# reset LDA
+ 	movl	STACK_X,    X
+ 	movl	STACK_INCX, INCX
+ 	movl	STACK_INCY, INCY
+@@ -117,6 +158,7 @@
+ 	leal	(,INCY, SIZE), INCY
+ 	leal	(,LDA,  SIZE), LDA
+ 
++
+ 	subl	$-16 * SIZE, A
+ 
+ 	cmpl	$0, N
+@@ -560,10 +602,19 @@
+ 	ALIGN_4
+ 	
+ .L999:
++	movl 	M,J
++	leal 	(,J,SIZE),%eax
++	addl	%eax,AA
++	jmp		.L0t
++	ALIGN_4
++
++.L999x:
+ 	popl	%ebx
+ 	popl	%esi
+ 	popl	%edi	
+ 	popl	%ebp
++
++	addl	$ARGS,%esp
+ 	ret
+ 
+ 	EPILOGUE
+--- a/kernel/x86_64/sgemv_t.S
++++ b/kernel/x86_64/sgemv_t.S
+@@ -47,7 +47,7 @@
+ 	
+ #ifndef WINDOWS_ABI
+ 
+-#define STACKSIZE	64
++#define STACKSIZE	128
+ 	
+ #define OLD_M	  %rdi
+ #define OLD_N	  %rsi
+@@ -57,6 +57,10 @@
+ #define STACK_Y		16 + STACKSIZE(%rsp)
+ #define STACK_INCY	24 + STACKSIZE(%rsp)
+ #define STACK_BUFFER	32 + STACKSIZE(%rsp)
++#define MMM		56(%rsp)
++#define NN		64(%rsp)
++#define AA		72(%rsp)
++#define LDAX	80(%rsp)
+ 
+ #else
+ 
+@@ -71,6 +75,10 @@
+ #define STACK_Y		 72 + STACKSIZE(%rsp)
+ #define STACK_INCY	 80 + STACKSIZE(%rsp)
+ #define STACK_BUFFER	 88 + STACKSIZE(%rsp)
+#define MMM	216(%rsp)
+#define NN	224(%rsp)
++#define AA	232(%rsp)
++#define LDAX 240(%rsp)
+ 
+ #endif
+ 
+@@ -127,29 +135,46 @@
+ 	movups	%xmm14, 192(%rsp)
+ 	movups	%xmm15, 208(%rsp)
+ 
+-	movq	OLD_M,	      M
+-	movq	OLD_N,        N
+-	movq	OLD_A,        A
+-	movq	OLD_LDA,      LDA
++	movq	OLD_M,	      MMM
++	movq	OLD_N,        NN
++	movq	OLD_A,        AA
++	movq	OLD_LDA,      LDAX
+ 	movq	OLD_X,        X
+ #else
+-	movq	OLD_M,	      M
+-	movq	OLD_N,        N
+-	movq	OLD_A,        A
+-	movq	OLD_LDA,      LDA
++	movq	OLD_M,	      MMM
++	movq	OLD_N,        NN
++	movq	OLD_A,        AA
++	movq	OLD_LDA,      LDAX
+ #endif
+-
+-	movq	STACK_INCX,   INCX
+-	movq	STACK_Y,      Y
+-	movq	STACK_INCY,   INCY
+-	movq	STACK_BUFFER, BUFFER
+-
+ #ifndef WINDOWS_ABI
+ 	pshufd	$0, %xmm0, ALPHA
+ #else
+ 	pshufd	$0, %xmm3, ALPHA
+ #endif
+ 
++
++.L0t:
++	xorq	M,M
++	addq	$1,M
++	salq	$22,M
++	subq	M,MMM
++	jge		.L00t
++	ALIGN_4
++	
++	movq	MMM,%rax
++	addq	M,%rax
++	jle		.L999x
++	movq	%rax,M
++
++.L00t:
++	movq	LDAX,LDA
++	movq	NN,N
++	movq	AA,A
++	movq	STACK_INCX,   INCX
++	movq	STACK_Y,      Y
++	movq	STACK_INCY,   INCY
++	movq	STACK_BUFFER, BUFFER
++
+ 	leaq	(,INCX, SIZE), INCX
+ 	leaq	(,INCY, SIZE), INCY
+ 	leaq	(,LDA,  SIZE), LDA
+@@ -6341,6 +6366,12 @@
+ 	ALIGN_4
+ 
+ .L999:
++	leaq	(,M,SIZE),%rax
++	addq	%rax,AA
++	jmp		.L0t
++	ALIGN_4
++
++.L999x:
+ 	movq	  0(%rsp), %rbx
+ 	movq	  8(%rsp), %rbp
+ 	movq	 16(%rsp), %r12
+--- a/kernel/x86/gemv_n_sse.S
++++ b/kernel/x86/gemv_n_sse.S
+@@ -89,17 +89,22 @@
+ #endif
+ 
+ #define STACKSIZE	16
++#define ARGS	16
+ 
+-#define M		 4 + STACKSIZE(%esp)
+-#define N		 8 + STACKSIZE(%esp)
+-#define ALPHA		16 + STACKSIZE(%esp)
+-#define A		20 + STACKSIZE(%esp)
+-#define STACK_LDA	24 + STACKSIZE(%esp)
+-#define STACK_X		28 + STACKSIZE(%esp)
+-#define STACK_INCX	32 + STACKSIZE(%esp)
+-#define Y		36 + STACKSIZE(%esp)
+-#define STACK_INCY	40 + STACKSIZE(%esp)
+-#define BUFFER		44 + STACKSIZE(%esp)
++#define M		 4 + STACKSIZE+ARGS(%esp)
++#define N		 8 + STACKSIZE+ARGS(%esp)
++#define ALPHA		16 + STACKSIZE+ARGS(%esp)
++#define A		20 + STACKSIZE+ARGS(%esp)
++#define STACK_LDA	24 + STACKSIZE+ARGS(%esp)
++#define STACK_X		28 + STACKSIZE+ARGS(%esp)
++#define STACK_INCX	32 + STACKSIZE+ARGS(%esp)
++#define Y		36 + STACKSIZE+ARGS(%esp)
++#define STACK_INCY	40 + STACKSIZE+ARGS(%esp)
++#define BUFFER		44 + STACKSIZE+ARGS(%esp)
++#define MMM	0+ARGS(%esp)
++#define YY	4+ARGS(%esp)
++#define AA	8+ARGS(%esp)
++#define LDAX	12+ARGS(%esp)
+ 	
+ #define I	%eax
+ #define J	%ebx
+@@ -114,6 +119,7 @@
+ 
+ 	PROLOGUE
+ 
++	subl	$ARGS,%esp
+ 	pushl	%ebp
+ 	pushl	%edi
+ 	pushl	%esi
+@@ -121,7 +127,34 @@
+ 
+ 	PROFCODE
+ 
++	movl	Y,J
++	movl	J,YY				# backup Y
++	movl	A,J
++	movl	J,AA				# backup A
++	movl	M,J
+	movl	J,MMM				# backup M
++.L0t:
++	xorl	J,J
++	addl	$1,J
++	sall	$21,J
++	subl	J,MMM
++	movl	J,M
++	jge		.L00t
++	ALIGN_4
++
++	movl	MMM,%eax
++	addl	J,%eax
++	jle		.L999x
++	movl	%eax,M
++
++.L00t:
++	movl	AA,%eax
++	movl	%eax,A
++
++	movl	YY,J
++	movl	J,Y
+ 	movl	STACK_LDA,  LDA
++
+ 	movl	STACK_X,    X
+ 	movl	STACK_INCX, INCX
+ 
+@@ -651,12 +684,22 @@
+ 	addss	0 * SIZE(X), %xmm0
+ 	movss	%xmm0, (Y1)
+ 	ALIGN_3
+-
+ .L999:
++	movl	M,J
++	leal	(,J,SIZE),%eax
++	addl	%eax,AA
++	movl	YY,J
++	addl	%eax,J
++	movl	J,YY
++	jmp		.L0t
++	ALIGN_4
++
++.L999x:
+ 	popl	%ebx
+ 	popl	%esi
+ 	popl	%edi	
+ 	popl	%ebp
++	addl	$ARGS,%esp
+ 	ret
+ 
+ 	EPILOGUE
+--- a/kernel/x86/gemv_n_sse2.S
++++ b/kernel/x86/gemv_n_sse2.S
+@@ -76,17 +76,22 @@
+ #endif
+ 
+ #define STACKSIZE	16
++#define ARGS	16
+ 
+-#define M		 4 + STACKSIZE(%esp)
+-#define N		 8 + STACKSIZE(%esp)
+-#define ALPHA		16 + STACKSIZE(%esp)
+-#define A		24 + STACKSIZE(%esp)
+-#define STACK_LDA	28 + STACKSIZE(%esp)
+-#define STACK_X		32 + STACKSIZE(%esp)
+-#define STACK_INCX	36 + STACKSIZE(%esp)
+-#define Y		40 + STACKSIZE(%esp)
+-#define STACK_INCY	44 + STACKSIZE(%esp)
+-#define BUFFER		48 + STACKSIZE(%esp)
++#define M		 4 + STACKSIZE+ARGS(%esp)
++#define N		 8 + STACKSIZE+ARGS(%esp)
++#define ALPHA		16 + STACKSIZE+ARGS(%esp)
++#define A		24 + STACKSIZE+ARGS(%esp)
++#define STACK_LDA	28 + STACKSIZE+ARGS(%esp)
++#define STACK_X		32 + STACKSIZE+ARGS(%esp)
++#define STACK_INCX	36 + STACKSIZE+ARGS(%esp)
++#define Y		40 + STACKSIZE+ARGS(%esp)
++#define STACK_INCY	44 + STACKSIZE+ARGS(%esp)
++#define BUFFER		48 + STACKSIZE+ARGS(%esp)
++
++#define MMM	0+ARGS(%esp)
++#define YY	4+ARGS(%esp)
++#define AA	8+ARGS(%esp)
+ 	
+ #define I	%eax
+ #define J	%ebx
+@@ -101,6 +106,8 @@
+ 
+ 	PROLOGUE
+ 
++
++	subl	$ARGS,%esp
+ 	pushl	%ebp
+ 	pushl	%edi
+ 	pushl	%esi
+@@ -108,6 +115,33 @@
+ 
+ 	PROFCODE
+ 
++	movl	Y,J
++	movl	J,YY				# backup Y
++	movl	A,J
++	movl	J,AA				# backup A
++	movl	M,J
+	movl	J,MMM				# backup M
++.L0t:
++	xorl	J,J
++	addl	$1,J
++	sall	$20,J
++	subl	J,MMM
++	movl	J,M
++	jge		.L00t
++	ALIGN_4
++
++	movl	MMM,%eax
++	addl	J,%eax
++	jle		.L999x
++	movl	%eax,M
++
++.L00t:
++	movl	AA,%eax
++	movl	%eax,A
++
++	movl	YY,J
++	movl	J,Y
++
+ 	movl	STACK_LDA,  LDA
+ 	movl	STACK_X,    X
+ 	movl	STACK_INCX, INCX
+@@ -677,10 +711,22 @@
+ 	ALIGN_3
+ 
+ .L999:
++	movl	M,J
++	leal	(,J,SIZE),%eax
++	addl	%eax,AA
++	movl	YY,J
++	addl	%eax,J
++	movl	J,YY
++	jmp		.L0t
++	ALIGN_4
++
++.L999x:
++
+ 	popl	%ebx
+ 	popl	%esi
+ 	popl	%edi	
+ 	popl	%ebp
++	addl	$ARGS,%esp
+ 	ret
+ 
+ 	EPILOGUE
+--- a/kernel/x86_64/dgemv_t.S
++++ b/kernel/x86_64/dgemv_t.S
+@@ -47,7 +47,7 @@
+ 
+ #ifndef WINDOWS_ABI
+ 
+-#define STACKSIZE	64
++#define STACKSIZE	128
+ 	
+ #define OLD_M	  %rdi
+ #define OLD_N	  %rsi
+@@ -57,7 +57,10 @@
+ #define STACK_Y		16 + STACKSIZE(%rsp)
+ #define STACK_INCY	24 + STACKSIZE(%rsp)
+ #define STACK_BUFFER	32 + STACKSIZE(%rsp)
+-
++#define MMM	56(%rsp)
++#define NN	64(%rsp)
++#define AA	72(%rsp)
++#define LDAX	80(%rsp)
+ #else
+ 
+ #define STACKSIZE	256
+@@ -71,6 +74,11 @@
+ #define STACK_Y		 72 + STACKSIZE(%rsp)
+ #define STACK_INCY	 80 + STACKSIZE(%rsp)
+ #define STACK_BUFFER	 88 + STACKSIZE(%rsp)
++//Temp variables for M,N,A,LDA
++#define MMM	224(%rsp)
++#define NN	232(%rsp)
++#define AA	240(%rsp)
++#define LDAX	248(%rsp)
+ 
+ #endif
+ 
+@@ -131,13 +139,51 @@
+ 	movq	OLD_A,        A
+ 	movq	OLD_LDA,      LDA
+ 	movq	OLD_X,        X
++
++	movq	M,	      MMM
++	movq	N,            NN
++	movq	A,            AA
++	movq	LDA,	      LDAX
++
+ #else
+-	movq	OLD_M,	      M
+-	movq	OLD_N,        N
+-	movq	OLD_A,        A
+-	movq	OLD_LDA,      LDA
++	movq	OLD_M,	      MMM
++	movq	OLD_N,        NN
++	movq	OLD_A,        AA
++	movq	OLD_LDA,      LDAX
++#endif
++#ifdef HAVE_SSE3
++#ifndef WINDOWS_ABI
++	movddup	%xmm0, ALPHA
++#else
++	movddup	%xmm3, ALPHA
++#endif
++#else
++#ifndef WINDOWS_ABI
++	movapd	%xmm0, ALPHA
++#else
++	movapd	%xmm3, ALPHA
++#endif
++	unpcklpd ALPHA, ALPHA
+ #endif
+ 
++
++
++.L0x:
++	xorq	M,M
++	addq	$1,M
++	salq	$22,M
++	subq	M,MMM
++	jge .L00
++
++	movq	MMM,%rax
++	addq	M,%rax
++	jle	.L999x
++	movq	%rax,M
++
++.L00:	
++	movq	LDAX,LDA
++	movq	NN,N
++	movq	AA,A
+ 	movq	STACK_INCX,   INCX
+ 	movq	STACK_Y,      Y
+ 	movq	STACK_INCY,   INCY
+@@ -153,21 +199,6 @@
+ 
+ 	subq	$-16 * SIZE, A
+ 
+-#ifdef HAVE_SSE3
+-#ifndef WINDOWS_ABI
+-	movddup	%xmm0, ALPHA
+-#else
+-	movddup	%xmm3, ALPHA
+-#endif
+-#else
+-#ifndef WINDOWS_ABI
+-	movapd	%xmm0, ALPHA
+-#else
+-	movapd	%xmm3, ALPHA
+-#endif
+-	unpcklpd ALPHA, ALPHA
+-#endif
+-
+ 	testq	M, M
+ 	jle	.L999
+ 	testq	N, N
+@@ -854,7 +885,6 @@
+ 
+ .L21:
+ #endif
+-
+ 	subq	$4, N
+ 
+ 	leaq	16 * SIZE(BUFFER), X1
+@@ -2461,6 +2491,12 @@
+ 	ALIGN_4
+ 
+ .L999:
++	leaq	(, M, SIZE), %rax
++	addq %rax,AA
++	jmp .L0x;
++	ALIGN_4
++
++.L999x:
+ 	movq	  0(%rsp), %rbx
+ 	movq	  8(%rsp), %rbp
+ 	movq	 16(%rsp), %r12
diff -Nru openblas-0.1.1/debian/patches/series openblas-0.1.1/debian/patches/series
--- openblas-0.1.1/debian/patches/series	2012-08-11 17:50:07.000000000 +0200
+++ openblas-0.1.1/debian/patches/series	2013-01-31 15:21:20.000000000 +0100
@@ -3,3 +3,7 @@
 hurd.diff
 generic_profile.diff
 kill_threads_at_unload.diff
+32bit_athlon.diff
+sgemv_uninitialized_buffer.diff
+gemv_crash_big_data.diff
+dot_uninitialized_buffer.diff
diff -Nru openblas-0.1.1/debian/patches/sgemv_uninitialized_buffer.diff openblas-0.1.1/debian/patches/sgemv_uninitialized_buffer.diff
--- openblas-0.1.1/debian/patches/sgemv_uninitialized_buffer.diff	1970-01-01 01:00:00.000000000 +0100
+++ openblas-0.1.1/debian/patches/sgemv_uninitialized_buffer.diff	2013-01-31 15:21:20.000000000 +0100
@@ -0,0 +1,30 @@
+Description: Ensure that vectorized sgemv does not use uninitialized data
+Origin: upstream, https://github.com/xianyi/OpenBLAS/commit/91ed4e4450ceabd71493e0bf80e7455df414bebf
+Bug: https://github.com/xianyi/OpenBLAS/issues/171
+Bug-Debian: http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=696000
+Last-Update: 2013-01-04
+---
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
+--- a/kernel/x86/gemv_t_sse.S
++++ b/kernel/x86/gemv_t_sse.S
+@@ -198,6 +198,20 @@
+ 	jg	.L06
+ 	ALIGN_4
+ 
+//Pad with zeros to prevent loading stale (uninitialized) data from the buffer.
++	movl	M,  I
++	movl	$8, J
++	andl	$7, I
++	xorps	%xmm0, %xmm0
++	subl	I, J
++	ALIGN_2
++.L07:
++	movss	%xmm0, 0 * SIZE(Y1)
++	addl	$SIZE, Y1
++	decl	J
++	jg	.L07
++	ALIGN_4
++
+ .L10:
+ 	movl	Y, Y1
+ 

Attachment: signature.asc
Description: Digital signature


Reply to: