[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Re: CFP (Call for Participation) Debian PPC64



On Mon, Nov 17, 2003 at 02:23:50PM +1030, Alan Modra wrote:
> On Sun, Nov 16, 2003 at 09:41:58PM -0500, Albert Cahalan wrote:
> > Using 16-byte stack alignment? More?
> 
> We've had 16 byte stack alignment on ppc64 forever.
> 
> > Keeping .text in the low 2 GB? Other stuff?
> 
> -maddr32 does the text and static data in -2G..2G thing.  Not in
>  mainline gcc because the powerpc maintainers didn't like it.  Anton is
>  messing around with it in the kernel at the moment, but results so far
>  are discouraging.  We're probably hitting some cache problems due to
>  mapping the kernel at -2G as well as 0xc << 60.  Then again, it might
>  be that David Edelsohn was right all along, and -maddr32 doesn't give
>  us any performance increase.
> 
> > Keeping anything in a 48-bit range?
> > (One less opcode than full 64-bit)
> 
> No.  ppc64 loads 64 bit addresses from the TOC.  We don't build up 64
> bit addresses from multiple insns each setting 16 bits at a time.

What does it do about floating point constants?

Recent versions of GCC have an annoying bug which makes many
(but not all) floating-point constant loads take 3 instructions
in 32-bit mode (I have never tried 64-bit mode; no hardware):

	lis rx,.LCn@ha
	la ry,.LCn@l(rx)
	lfd frz,0(ry)

I have an older version (2.95.4) which at least managed to
combine the last two instructions as often as possible. Changing 
optimization options between -O2, -O3, and -Os does not fundamentally 
change the result.

IMHO, the compiler should decide to allocate one register to 
point to the constant pool in these cases. But I would not
even know where to start in GCC's code.

Now try with the -fpic or -fPIC option (or -mrelocatable) and
try not to vomit when looking at the disgustingly bloated code.

I have appended an example extracted from a rather small function
(which is not a worst case because I use an inlined subroutine
to evaluate polynomials).

> In some ways the TOC is a good thing.  Think of it as a compiler
> generated GOT.  Then realize that the compiler can do better than the
> linker in placing entries for good cache performance.  On other Linux
> architectures, ld generates the GOT via a hash table traversal, which
> means entries are fairly well randomized.
> 
> > Still using "funtion pointers" that aren't?
> 
> I'm not sure what you mean here.

Function descriptors instead of a single pointer to the first
instruction, I believe. I'm not sure that it's that bad either.

> 
> > Anything left to improve?
> 
> Plenty.  For starters, I'd like to get rid of those pesky dot symbols.
> ie. A ".foo" symbol defined as well as a "foo" symbol for each function


	.file	"ndtri.c"
# Read-only data for ndtri.c as emitted by the compiler.
# The "[snipped]" line below is the poster's marker for omitted data
# (presumably the P0/Q0/P1/Q1/P2/Q2 tables that ndtri references); it is
# not assembler input and must be removed before assembling.
	.section	.rodata
	.align 3
[snipped]	
# Constant pool: every .LCn entry is two 32-bit words, loaded as a single
# double by the lfd instructions in ndtri.  The values noted below were
# decoded by hand assuming the first word is the high half (big-endian
# 32-bit PowerPC) -- TODO confirm against the C source.
	.section	.rodata.cst8,"aM",@progbits,8
	.align 3
# .LC0: high word 0x3FEBAB55, approx. 0.8647
.LC0:
	.long	1072409429
	.long	1460674445
	.align 3
# .LC1: high word 0x3FC152AA, approx. 0.1353
.LC1:
	.long	1069634218
	.long	-1547730484
	.align 3
# .LC2: high word 0x40040D93, approx. 2.5066
.LC2:
	.long	1074007443
	.long	536225542
	.align 3
# .LC3: both words zero, i.e. 0.0
.LC3:
	.long	0
	.long	0
	.align 3
# .LC4: high word 0x3FF00000, i.e. 1.0
.LC4:
	.long	1072693248
	.long	0
	.align 3
# .LC5: high word 0x3FE00000, i.e. 0.5
.LC5:
	.long	1071644672
	.long	0
	.align 3
# .LC6: high word 0xC0000000, i.e. -2.0
.LC6:
	.long	-1073741824
	.long	0
	.align 3
# .LC7: high word 0x40200000, i.e. 8.0
.LC7:
	.long	1075838976
	.long	0
	.align 3
# .LC8: high word 0xFFF00000, i.e. -infinity
.LC8:
	.long	-1048576
	.long	0
	.align 3
# .LC9: high word 0x7FF00000, i.e. +infinity
.LC9:
	.long	2146435072
	.long	0
	.align 3
# .LC10: high word 0x7FF80000, i.e. a quiet NaN
.LC10:
	.long	2146959360
	.long	0
	.section	".text"
	.align 2
	.globl ndtri
	.type	ndtri, @function
# ndtri -- compiler output for the function of that name in ndtri.c.
#
# Register roles as used in this listing:
#   f1   value compared on entry and value left in place before each
#        branch to .L1 (presumably the double argument / return value
#        under the 32-bit PowerPC SysV ABI -- confirm)
#   f31  copy of the entry value of f1; later reused for the sqrt result
#        and for the final value
#   f30  holds .LC4 once the second range check has been reached
#   r31  flag: set to 1, cleared to 0 when f1 is replaced by f30 - f1;
#        tested at .L56 to decide whether to negate the result
#   r9   scratch pointer for constant-pool and coefficient-table addresses
#   f11/f12/f13  polynomial evaluation temporaries
#
# Values quoted for the .LCn constants (0.0, 1.0, ...) assume the two
# .long words form a big-endian double -- TODO confirm.
#
# The "1->", "2->", "3->" prefixes are the poster's annotations marking
# the three-instruction lis/la/lfd constant loads under discussion; they
# are not assembler syntax and must be stripped before assembling.
ndtri:
# Prologue, interleaved with the load of .LC3 (0.0) into f0: allocate a
# 48-byte frame, save f31, f30 and r31 in it, and store the link register
# at offset 52 from the new stack pointer.  f31 = f1.
# Branch to .L2 unless f1 > .LC3 (an unordered compare also branches).
1->	lis 9,.LC3@ha
	stwu 1,-48(1)
2->	la 9,.LC3@l(9)
	stfd 31,40(1)
3->	lfd 0,0(9)
	mflr 0
	stfd 30,32(1)
	fmr 31,1
	fcmpu 0,1,0
	stw 31,28(1)
	stw 0,52(1)
	bng- 0,.L2
# f30 = .LC4 (1.0); branch to .L2 unless f1 < f30.
1->	lis 9,.LC4@ha
2->	la 9,.LC4@l(9)
3->	lfd 30,0(9)
	fcmpu 0,1,30
	bnl- 0,.L2
# r31 = 1.  If f1 > .LC0, replace f1 with f30 - f1 and clear r31.
	lis 9,.LC0@ha	# Not that bad here
	lfd 0,.LC0@l(9)
	li 31,1
	fcmpu 0,1,0
	bng- 0,.L5
	fsub 1,30,1
	li 31,0
.L5:
# If f1 is not greater than .LC1, take the log/sqrt path at .L7.
	lis 9,.LC1@ha	# Again acceptable
	lfd 0,.LC1@l(9)
	fcmpu 0,1,0
	bng- 0,.L7
# Otherwise: f1 -= .LC5 (0.5), f12 = f1*f1.  CTR = 4; f13 starts as the
# first P0 entry and r9 walks the following entries in 8-byte steps.
1->	lis 9,.LC5@ha
2->	la 9,.LC5@l(9)
3->	lfd 0,0(9)
	li 0,4
	mtctr 0
	lis 9,P0@ha
	fsub 1,1,0
	lfd 13,P0@l(9)
	la 9,P0@l(9)
	addi 9,9,8
	fmul 12,1,1
.L55:
# Four steps of f13 = f13*f12 + next P0 entry.
	lfd 0,0(9)
	addi 9,9,8
	fmadd 13,13,12,0
	bdnz .L55
# f11 = f12*f13.  f13 = f12 + first Q0 entry; CTR = 7 for the Q0 loop.
	lis 9,Q0@ha
	fmul 11,12,13
	lfd 0,Q0@l(9)
	li 0,7
	mtctr 0
	la 9,Q0@l(9)
	fadd 13,12,0
	addi 9,9,8
.L54:
# Seven steps of f13 = f13*f12 + next Q0 entry.
	lfd 0,0(9)
	addi 9,9,8
	fmadd 13,13,12,0
	bdnz .L54
# f0 = f11/f13; f31 = f1*f0 + f1; result f1 = f31 * .LC2; return via .L1.
	fdiv 0,11,13
	lis 9,.LC2@ha	# Ok
	fmadd 31,1,0,1
	lfd 0,.LC2@l(9)	# here
	fmul 1,31,0
	b .L1
.L7:
# Log/sqrt path: f1 = log(f1) * .LC6 (-2.0); f31 = sqrt(f1); f1 = log(f31).
# log and sqrt are external; f30, f31 and r31 are used after the calls
# without being reloaded.
	bl log
1->	lis 9,.LC6@ha
2->	la 9,.LC6@l(9)
3->	lfd 0,0(9)
	fmul 1,1,0
	bl sqrt
	fmr 31,1
	bl log
# f1 = f31 - log(f31)/f31 and f12 = f30/f31.  If f31 is not less than
# .LC7 (8.0), use the P2/Q2 tables at .L19; otherwise fall into P1/Q1.
1->	lis 9,.LC7@ha
2->	la 9,.LC7@l(9)
	fdiv 1,1,31
3->	lfd 0,0(9)
	fcmpu 0,31,0
	fdiv 12,30,31
	fsub 1,31,1
	bnl- 0,.L19
# CTR = 8; f13 starts as the first P1 entry.
	li 0,8
	mtctr 0
	lis 9,P1@ha
	lfd 13,P1@l(9)
	la 9,P1@l(9)
	addi 9,9,8
.L53:
# Eight steps of f13 = f13*f12 + next P1 entry.
	lfd 0,0(9)
	addi 9,9,8
	fmadd 13,13,12,0
	bdnz .L53
# f11 = f12*f13.  f13 = f12 + first Q1 entry; CTR = 7 for the Q1 loop.
	lis 9,Q1@ha
	fmul 11,12,13
	lfd 0,Q1@l(9)
	li 0,7
	mtctr 0
	la 9,Q1@l(9)
	fadd 13,12,0
	addi 9,9,8
.L52:
# Seven steps of f13 = f13*f12 + next Q1 entry, then join at .L56.
	lfd 0,0(9)
	addi 9,9,8
	fmadd 13,13,12,0
	bdnz .L52
	b .L56
.L19:
# Same shape as the P1/Q1 code above, using the P2 and Q2 tables.
	li 0,8
	mtctr 0
	lis 9,P2@ha
	lfd 13,P2@l(9)
	la 9,P2@l(9)
	addi 9,9,8
.L51:
# Eight steps of f13 = f13*f12 + next P2 entry.
	lfd 0,0(9)
	addi 9,9,8
	fmadd 13,13,12,0
	bdnz .L51
# f11 = f12*f13.  f13 = f12 + first Q2 entry; CTR = 7 for the Q2 loop.
	lis 9,Q2@ha
	fmul 11,12,13
	lfd 0,Q2@l(9)
	li 0,7
	mtctr 0
	la 9,Q2@l(9)
	fadd 13,12,0
	addi 9,9,8
.L50:
# Seven steps of f13 = f13*f12 + next Q2 entry.
	lfd 0,0(9)
	addi 9,9,8
	fmadd 13,13,12,0
	bdnz .L50
.L56:
# f31 = f1 - f11/f13; negate it when r31 is non-zero; result f1 = f31.
	fdiv 0,11,13
	cmpwi 0,31,0
	fsub 31,1,0
	beq- 0,.L42
	fneg 31,31
.L42:
	fmr 1,31
	b .L1
.L2:
# Reached when the entry value failed a range check.  f31 is compared with
# itself: equal goes to .L44; otherwise (unordered, i.e. NaN) the result
# is f31 + f31.
	fcmpu 0,31,31
	beq- 0,.L44
	fadd 1,31,31
	b .L1
.L44:
# The r3 value left by __errno_location is used as the store address for
# the error codes below.  First case: f31 == .LC3 (0.0).
	bl __errno_location
1->	lis 9,.LC3@ha
2->	la 9,.LC3@l(9)
3->	lfd 0,0(9)
	fcmpu 0,31,0
	bne+ 0,.L46
# Store 34 through r3 (presumably ERANGE -- confirm), set FPSCR bit 3 with
# the inline-asm mtfsb1, and return the value at .LC8 (-infinity).
	li 0,34
	stw 0,0(3)
#APP
	mtfsb1 3
#NO_APP
	lis 9,.LC8@ha
	la 9,.LC8@l(9)
	b .L57
.L46:
# Second case: f31 == .LC4 (1.0).  Same error store and FPSCR update, but
# the returned value is the one at .LC9 (+infinity).
1->	lis 9,.LC4@ha
2->	la 9,.LC4@l(9)
3->	lfd 0,0(9)
	fcmpu 0,31,0
	bne+ 0,.L48
	li 0,34
	stw 0,0(3)
#APP
	mtfsb1 3
#NO_APP
	lis 9,.LC9@ha	# See below
	la 9,.LC9@l(9)
	b .L57
.L48:
# Remaining case: store 33 through r3 (presumably EDOM -- confirm), call
# feraiseexcept with r3 = 0x20000000 (presumably FE_INVALID -- confirm),
# and return the value at .LC10 (quiet NaN).
	li 0,33
	stw 0,0(3)
	lis 3,0x2000
	bl feraiseexcept
	lis 9,.LC10@ha	# See below
	la 9,.LC10@l(9)
.L57:
	lfd 1,0(9)	# Ok, returning the value pointed to by r9
# (It may not be the best implementation but it's an error path
# marked unlikely so I don't care very much)
.L1:
# Epilogue: reload the saved link register, r31, f30 and f31, release the
# 48-byte frame and return.
	lwz 0,52(1)
	lwz 31,28(1)
	lfd 30,32(1)
	mtlr 0
	lfd 31,40(1)
	addi 1,1,48
	blr
	.size	ndtri, .-ndtri
	.ident	"GCC: (GNU) 3.3.2 20030908 (Debian prerelease)"



Reply to: