[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Bug#512050: gcc-4.3: pessimizes function without SSE intrinsics



On Fri, Jan 16, 2009 at 08:10:15PM +0100, Martin Michlmayr wrote:
> * brian m. carlson <sandals@crustytoothpaste.ath.cx> [2009-01-16 18:38]:
> > Obviously, since the two functions do the exact same thing, they should
> > be optimized to be identical.  Instead, mul is pessimized.
>
> Can you check if this happens with gcc-4.3 and trunk from SVN, and if
> so, file the bug upstream?

It does happen with gcc-4.3 (hence, I filed the bug there) as well as
gcc-snapshot 20090107-1.  I really would prefer not to build from SVN if
I don't have to.

I believe that it's the maintainer's job to file the bug upstream, and
upstream has not won any points with me for neglecting a bug that I
reported with a trivial patch.  Thus, I am hesitant to forward the bug
myself.

I have been informed that Apple's GCC does better on this[0]; I will see
if I can get my friend to provide a .i and .s file from that version.  I
also just noted that gcc-4.1 and gcc-4.2 produce much less bad code.
I've attached intrinsics.s from both of those versions; in each, mul uses
8 movss and 4 mulss.  Nevertheless, they still do not convert the code
into three SSE instructions.

[0] I believe Apple's GCC is based on an older version of FSF gcc, which
is probably why it does better; the results are likely identical between
the two.

--
brian m. carlson / brian with sandals: Houston, Texas, US
+1 713 440 7475 | http://crustytoothpaste.ath.cx/~bmc | My opinion only
troff on top of XML: http://crustytoothpaste.ath.cx/~bmc/code/thwack
OpenPGP: RSA v4 4096b 88AC E9B2 9196 305B A994 7552 F1BA 225C 0223 B187
	.file	"intrinsics.c"
	.text
	.p2align 4,,15
.globl mul
	.type	mul, @function
mul:
.LFB491:
	movss	(%rdi), %xmm0
	mulss	(%rsi), %xmm0
	movss	%xmm0, (%rdx)
	movss	4(%rdi), %xmm0
	mulss	4(%rsi), %xmm0
	movss	%xmm0, 4(%rdx)
	movss	8(%rdi), %xmm0
	mulss	8(%rsi), %xmm0
	movss	%xmm0, 8(%rdx)
	movss	12(%rdi), %xmm0
	mulss	12(%rsi), %xmm0
	movss	%xmm0, 12(%rdx)
	ret
.LFE491:
	.size	mul, .-mul
	.p2align 4,,15
.globl mul2
	.type	mul2, @function
mul2:
.LFB492:
	movaps	(%rdi), %xmm0
	mulps	(%rsi), %xmm0
	movaps	%xmm0, (%rdx)
	ret
.LFE492:
	.size	mul2, .-mul2
	.section	.rodata.str1.1,"aMS",@progbits,1
.LC8:
	.string	"%f %f %f %f\n"
	.section	.rodata.cst8,"aM",@progbits,8
	.align 8
.LC13:
	.long	1610612736
	.long	-1071225242
	.align 8
.LC14:
	.long	3758096384
	.long	1075212451
	.align 8
.LC15:
	.long	536870912
	.long	1075983155
	.align 8
.LC16:
	.long	3221225472
	.long	-1075125945
	.text
	.p2align 4,,15
.globl main
	.type	main, @function
main:
.LFB493:
	subq	$56, %rsp
.LCFI0:
	movl	$.LC8, %edi
	movl	$4, %eax
	movsd	.LC13(%rip), %xmm3
	movl	$0x3f99999a, 32(%rsp)
	movsd	.LC14(%rip), %xmm2
	movl	$0x40600000, 36(%rsp)
	movsd	.LC15(%rip), %xmm1
	movl	$0x3fd9999a, 40(%rsp)
	movsd	.LC16(%rip), %xmm0
	movl	$0x40333333, 44(%rsp)
	movl	$0xbf333333, 16(%rsp)
	movl	$0x40266666, 20(%rsp)
	movl	$0x40533333, 24(%rsp)
	movl	$0xc0800000, 28(%rsp)
	movl	$0xbf570a3e, (%rsp)
	movl	$0x41119999, 4(%rsp)
	movl	$0x40b3851f, 8(%rsp)
	movl	$0xc1333333, 12(%rsp)
	call	printf
	xorl	%eax, %eax
	addq	$56, %rsp
	ret
.LFE493:
	.size	main, .-main
	.section	.eh_frame,"a",@progbits
.Lframe1:
	.long	.LECIE1-.LSCIE1
.LSCIE1:
	.long	0x0
	.byte	0x1
	.string	"zR"
	.uleb128 0x1
	.sleb128 -8
	.byte	0x10
	.uleb128 0x1
	.byte	0x3
	.byte	0xc
	.uleb128 0x7
	.uleb128 0x8
	.byte	0x90
	.uleb128 0x1
	.align 8
.LECIE1:
.LSFDE1:
	.long	.LEFDE1-.LASFDE1
.LASFDE1:
	.long	.LASFDE1-.Lframe1
	.long	.LFB491
	.long	.LFE491-.LFB491
	.uleb128 0x0
	.align 8
.LEFDE1:
.LSFDE3:
	.long	.LEFDE3-.LASFDE3
.LASFDE3:
	.long	.LASFDE3-.Lframe1
	.long	.LFB492
	.long	.LFE492-.LFB492
	.uleb128 0x0
	.align 8
.LEFDE3:
.LSFDE5:
	.long	.LEFDE5-.LASFDE5
.LASFDE5:
	.long	.LASFDE5-.Lframe1
	.long	.LFB493
	.long	.LFE493-.LFB493
	.uleb128 0x0
	.byte	0x4
	.long	.LCFI0-.LFB493
	.byte	0xe
	.uleb128 0x40
	.align 8
.LEFDE5:
	.ident	"GCC: (GNU) 4.1.3 20080704 (prerelease) (Debian 4.1.2-24)"
	.section	.note.GNU-stack,"",@progbits
	.file	"intrinsics.c"
	.text
	.p2align 4,,15
.globl mul
	.type	mul, @function
mul:
.LFB513:
	movss	(%rdi), %xmm0
	mulss	(%rsi), %xmm0
	movss	%xmm0, (%rdx)
	movss	4(%rdi), %xmm0
	mulss	4(%rsi), %xmm0
	movss	%xmm0, 4(%rdx)
	movss	8(%rdi), %xmm0
	mulss	8(%rsi), %xmm0
	movss	%xmm0, 8(%rdx)
	movss	12(%rdi), %xmm0
	mulss	12(%rsi), %xmm0
	movss	%xmm0, 12(%rdx)
	ret
.LFE513:
	.size	mul, .-mul
	.p2align 4,,15
.globl mul2
	.type	mul2, @function
mul2:
.LFB514:
	movaps	(%rdi), %xmm0
	mulps	(%rsi), %xmm0
	movaps	%xmm0, (%rdx)
	ret
.LFE514:
	.size	mul2, .-mul2
	.section	.rodata.str1.1,"aMS",@progbits,1
.LC8:
	.string	"%f %f %f %f\n"
	.text
	.p2align 4,,15
.globl main
	.type	main, @function
main:
.LFB515:
	subq	$56, %rsp
.LCFI0:
	movl	$.LC8, %edi
	movl	$4, %eax
	movsd	.LC14(%rip), %xmm3
	movl	$0x3f99999a, 32(%rsp)
	movsd	.LC15(%rip), %xmm2
	movl	$0x40600000, 36(%rsp)
	movsd	.LC16(%rip), %xmm1
	movl	$0x3fd9999a, 40(%rsp)
	movsd	.LC10(%rip), %xmm0
	movl	$0x40333333, 44(%rsp)
	movl	$0xbf333333, 16(%rsp)
	movl	$0x40266666, 20(%rsp)
	movl	$0x40533333, 24(%rsp)
	movl	$0xc0800000, 28(%rsp)
	movl	$0xbf570a3e, (%rsp)
	movl	$0x41119999, 4(%rsp)
	movl	$0x40b3851f, 8(%rsp)
	movl	$0xc1333333, 12(%rsp)
	call	printf
	xorl	%eax, %eax
	addq	$56, %rsp
	ret
.LFE515:
	.size	main, .-main
	.section	.rodata.cst8,"aM",@progbits,8
	.align 8
.LC10:
	.long	3221225472
	.long	-1075125945
	.align 8
.LC14:
	.long	1610612736
	.long	-1071225242
	.align 8
.LC15:
	.long	3758096384
	.long	1075212451
	.align 8
.LC16:
	.long	536870912
	.long	1075983155
	.section	.eh_frame,"a",@progbits
.Lframe1:
	.long	.LECIE1-.LSCIE1
.LSCIE1:
	.long	0x0
	.byte	0x1
	.string	"zR"
	.uleb128 0x1
	.sleb128 -8
	.byte	0x10
	.uleb128 0x1
	.byte	0x3
	.byte	0xc
	.uleb128 0x7
	.uleb128 0x8
	.byte	0x90
	.uleb128 0x1
	.align 8
.LECIE1:
.LSFDE1:
	.long	.LEFDE1-.LASFDE1
.LASFDE1:
	.long	.LASFDE1-.Lframe1
	.long	.LFB513
	.long	.LFE513-.LFB513
	.uleb128 0x0
	.align 8
.LEFDE1:
.LSFDE3:
	.long	.LEFDE3-.LASFDE3
.LASFDE3:
	.long	.LASFDE3-.Lframe1
	.long	.LFB514
	.long	.LFE514-.LFB514
	.uleb128 0x0
	.align 8
.LEFDE3:
.LSFDE5:
	.long	.LEFDE5-.LASFDE5
.LASFDE5:
	.long	.LASFDE5-.Lframe1
	.long	.LFB515
	.long	.LFE515-.LFB515
	.uleb128 0x0
	.byte	0x4
	.long	.LCFI0-.LFB515
	.byte	0xe
	.uleb128 0x40
	.align 8
.LEFDE5:
	.ident	"GCC: (GNU) 4.2.4 (Debian 4.2.4-5)"
	.section	.note.GNU-stack,"",@progbits

Attachment: signature.asc
Description: Digital signature


Reply to: