Re: CFP (Call for Participation) Debian PPC64
On Mon, Nov 17, 2003 at 02:23:50PM +1030, Alan Modra wrote:
> On Sun, Nov 16, 2003 at 09:41:58PM -0500, Albert Cahalan wrote:
> > Using 16-byte stack alignment? More?
>
> We've had 16 byte stack alignment on ppc64 forever.
>
> > Keeping .text in the low 2 GB? Other stuff?
>
> -maddr32 does the text and static data in -2G..2G thing. Not in
> mainline gcc because the powerpc maintainers didn't like it. Anton is
> messing around with it in the kernel at the moment, but results so far
> are discouraging. We're probably hitting some cache problems due to
> mapping the kernel at -2G as well as 0xc << 60. Then again, it might
> be that David Edelsohn was right all along, and -maddr32 doesn't give
> us any performance increase.
>
> > Keeping anything in a 48-bit range?
> > (One less opcode than full 64-bit)
>
> No. ppc64 loads 64 bit addresses from the TOC. We don't build up 64
> bit addresses from multiple insns each setting 16 bits at a time.
What does it do about floating point constants?
Recent versions of GCC have an annoying bug which makes many
(but not all) floating-point constant loads take 3 instructions
in 32 bit mode (never tried 64 bit, no hardware):
lis rx,.LCn@ha
la ry,.LCn@l(rx)
lfd frz,0(ry)
I have an older version (2.95.4) which at least managed to
combine the last two instructions as often as possible. Changing
optimization options between -O2, -O3, and -Os does not fundamentally
change the result.
IMHO, the compiler should decide to allocate one register to
point to the constant pool in these cases. But I would not
even know where to start in GCC's code.
Now try with the -fpic or -fPIC (also -mrelocatable) option and
try not to vomit when looking at the disgustingly bloated code.
I have appended an example extracted from a rather small function
(which is not a worst case because I use an inlined subroutine
to evaluate polynomials).
> In some ways the TOC is a good thing. Think of it as a compiler
> generated GOT. Then realize that the compiler can do better than the
> linker in placing entries for good cache performance. On other Linux
> architectures, ld generates the GOT via a hash table traversal, which
> means entries are fairly well randomized.
>
> > Still using "function pointers" that aren't?
>
> I'm not sure what you mean here.
Function descriptors instead of a single pointer to the first
instruction, I believe. I'm not sure that it's that bad either.
>
> > Anything left to improve?
>
> Plenty. For starters, I'd like to get rid of those pesky dot symbols.
> ie. A ".foo" symbol defined as well as a "foo" symbol for each function
.file "ndtri.c"
.section .rodata
.align 3
# The contents of this section were removed by the poster. The code below
# references the symbols P0, Q0, P1, Q1, P2 and Q2 as tables of doubles,
# so they presumably live here -- not visible in this listing.
[snipped]
# Mergeable pool of 8-byte constants. Each entry is two 32-bit words that the
# code loads as one double with lfd. The decoded values in the comments below
# assume the first .long is the high word (big-endian IEEE-754 double).
.section .rodata.cst8,"aM",@progbits,8
.align 3
# .LC0 = 0x3FEBAB55_57101F8D, approximately 0.864665 (looks like 1 - exp(-2))
.LC0:
.long 1072409429
.long 1460674445
.align 3
# .LC1 = high word 0x3FC152AA, approximately 0.135335 (looks like exp(-2))
.LC1:
.long 1069634218
.long -1547730484
.align 3
# .LC2 = high word 0x40040D93, approximately 2.506628 (looks like sqrt(2*pi))
.LC2:
.long 1074007443
.long 536225542
.align 3
# .LC3 = 0x00000000_00000000 = 0.0
.LC3:
.long 0
.long 0
.align 3
# .LC4 = 0x3FF00000_00000000 = 1.0
.LC4:
.long 1072693248
.long 0
.align 3
# .LC5 = 0x3FE00000_00000000 = 0.5
.LC5:
.long 1071644672
.long 0
.align 3
# .LC6 = 0xC0000000_00000000 = -2.0
.LC6:
.long -1073741824
.long 0
.align 3
# .LC7 = 0x40200000_00000000 = 8.0
.LC7:
.long 1075838976
.long 0
.align 3
# .LC8 = 0xFFF00000_00000000 = -infinity
.LC8:
.long -1048576
.long 0
.align 3
# .LC9 = 0x7FF00000_00000000 = +infinity
.LC9:
.long 2146435072
.long 0
.align 3
# .LC10 = 0x7FF80000_00000000 = quiet NaN
.LC10:
.long 2146959360
.long 0
.section ".text"
.align 2
.globl ndtri
.type ndtri, @function
# ndtri -- compiler output (see .ident at the end) for 32-bit PowerPC.
# In:    f1 = double argument (copied to f31 on entry).
# Out:   f1 = double result.
# Calls: log, sqrt, __errno_location, feraiseexcept.
# Frame: 48 bytes. f31 saved at 40(r1), f30 at 32(r1), r31 at 28(r1);
#        LR is stored at 52(r1), i.e. 4 bytes above the new frame.
# Register roles as used below:
#   f31 = copy of the argument, later the working result
#   f30 = 1.0 (loaded from .LC4 after the first range check)
#   r31 = flag: non-zero means the result is negated at .L56
#   r9  = scratch address for constants and coefficient tables
#   f12 = polynomial variable, f13 = Horner accumulator, f11 = numerator
# Lines prefixed "1->", "2->", "3->" are the poster's annotations marking the
# three-instruction lis/la/lfd constant load being criticised; they are not
# assembler syntax. The .LCn values quoted in comments are decoded from the
# .long pairs in .rodata.cst8, taking the first word as the high word.
ndtri:
# Prologue, interleaved with the load of .LC3 (0.0) and the first compare.
1-> lis 9,.LC3@ha
stwu 1,-48(1)
2-> la 9,.LC3@l(9)
stfd 31,40(1)
3-> lfd 0,0(9)
mflr 0
stfd 30,32(1)
fmr 31,1
fcmpu 0,1,0
stw 31,28(1)
stw 0,52(1)
# Not (arg > 0.0), which includes an unordered compare: go to the error path.
bng- 0,.L2
1-> lis 9,.LC4@ha
2-> la 9,.LC4@l(9)
3-> lfd 30,0(9)
fcmpu 0,1,30
# Not (arg < 1.0): also the error path.
bnl- 0,.L2
# Here 0.0 < arg < 1.0. Compare against .LC0 (approximately 0.8647).
lis 9,.LC0@ha # Not that bad here
lfd 0,.LC0@l(9)
li 31,1
fcmpu 0,1,0
bng- 0,.L5
# arg > .LC0: work on 1.0 - arg instead and clear the negate flag.
fsub 1,30,1
li 31,0
.L5:
# Compare the working value against .LC1 (approximately 0.1353).
lis 9,.LC1@ha # Again acceptable
lfd 0,.LC1@l(9)
fcmpu 0,1,0
bng- 0,.L7
# Central branch: f1 = f1 - 0.5 (.LC5), f12 = f1 * f1.
1-> lis 9,.LC5@ha
2-> la 9,.LC5@l(9)
3-> lfd 0,0(9)
li 0,4
mtctr 0
lis 9,P0@ha
fsub 1,1,0
# f13 starts at the first entry of P0; r9 then points at the second entry.
lfd 13,P0@l(9)
la 9,P0@l(9)
addi 9,9,8
fmul 12,1,1
# Horner loop, 4 iterations (CTR = 4): f13 = f13 * f12 + next P0 entry.
.L55:
lfd 0,0(9)
addi 9,9,8
fmadd 13,13,12,0
bdnz .L55
# f11 = f12 * P0(f12). The denominator starts as f12 + first Q0 entry,
# then takes 7 more Horner steps (CTR = 7) over the following Q0 entries.
lis 9,Q0@ha
fmul 11,12,13
lfd 0,Q0@l(9)
li 0,7
mtctr 0
la 9,Q0@l(9)
fadd 13,12,0
addi 9,9,8
.L54:
lfd 0,0(9)
addi 9,9,8
fmadd 13,13,12,0
bdnz .L54
# f0 = f11 / f13; f31 = f1 * f0 + f1; result = f31 * .LC2 (approx. 2.5066).
fdiv 0,11,13
lis 9,.LC2@ha # Ok
fmadd 31,1,0,1
lfd 0,.LC2@l(9) # here
fmul 1,31,0
b .L1
# Tail branch: f31 = sqrt(-2.0 * log(f1)), with -2.0 taken from .LC6.
.L7:
bl log
1-> lis 9,.LC6@ha
2-> la 9,.LC6@l(9)
3-> lfd 0,0(9)
fmul 1,1,0
bl sqrt
fmr 31,1
# f1 still holds the sqrt result, so this computes log(f31).
bl log
# f1 = f31 - log(f31) / f31; f12 = 1.0 / f31 (f30 holds 1.0).
# The compare of f31 against .LC7 (8.0) selects the coefficient tables.
1-> lis 9,.LC7@ha
2-> la 9,.LC7@l(9)
fdiv 1,1,31
3-> lfd 0,0(9)
fcmpu 0,31,0
fdiv 12,30,31
fsub 1,31,1
# Not (f31 < 8.0): use P2/Q2 at .L19; otherwise fall through to P1/Q1.
bnl- 0,.L19
# P1 numerator: first entry, then 8 Horner steps (CTR = 8) in f12.
li 0,8
mtctr 0
lis 9,P1@ha
lfd 13,P1@l(9)
la 9,P1@l(9)
addi 9,9,8
.L53:
lfd 0,0(9)
addi 9,9,8
fmadd 13,13,12,0
bdnz .L53
# f11 = f12 * P1(f12). Q1 denominator: f12 + first entry, then 7 steps.
lis 9,Q1@ha
fmul 11,12,13
lfd 0,Q1@l(9)
li 0,7
mtctr 0
la 9,Q1@l(9)
fadd 13,12,0
addi 9,9,8
.L52:
lfd 0,0(9)
addi 9,9,8
fmadd 13,13,12,0
bdnz .L52
b .L56
# Same evaluation as above, using the P2/Q2 tables.
.L19:
li 0,8
mtctr 0
lis 9,P2@ha
lfd 13,P2@l(9)
la 9,P2@l(9)
addi 9,9,8
.L51:
lfd 0,0(9)
addi 9,9,8
fmadd 13,13,12,0
bdnz .L51
lis 9,Q2@ha
fmul 11,12,13
lfd 0,Q2@l(9)
li 0,7
mtctr 0
la 9,Q2@l(9)
fadd 13,12,0
addi 9,9,8
.L50:
lfd 0,0(9)
addi 9,9,8
fmadd 13,13,12,0
bdnz .L50
# Common tail: f31 = f1 - f11 / f13, negated when r31 is non-zero.
.L56:
fdiv 0,11,13
cmpwi 0,31,0
fsub 31,1,0
beq- 0,.L42
fneg 31,31
.L42:
fmr 1,31
b .L1
# Error path: the argument (still in f31) is outside the open range (0, 1).
.L2:
# A value that does not compare equal to itself is a NaN: return arg + arg.
fcmpu 0,31,31
beq- 0,.L44
fadd 1,31,31
b .L1
.L44:
# r3 receives the return value of __errno_location; the stores to 0(r3)
# below write through it (presumably the address of errno).
bl __errno_location
1-> lis 9,.LC3@ha
2-> la 9,.LC3@l(9)
3-> lfd 0,0(9)
fcmpu 0,31,0
bne+ 0,.L46
# arg == 0.0: store 34 through r3 (ERANGE on Linux -- confirm), set FPSCR
# bit 3 via inline asm, and return .LC8 (-infinity).
li 0,34
stw 0,0(3)
#APP
mtfsb1 3
#NO_APP
lis 9,.LC8@ha
la 9,.LC8@l(9)
b .L57
.L46:
1-> lis 9,.LC4@ha
2-> la 9,.LC4@l(9)
3-> lfd 0,0(9)
fcmpu 0,31,0
bne+ 0,.L48
# arg == 1.0: same errno value and FPSCR bit, return .LC9 (+infinity).
li 0,34
stw 0,0(3)
#APP
mtfsb1 3
#NO_APP
lis 9,.LC9@ha # See below
la 9,.LC9@l(9)
b .L57
.L48:
# Any other out-of-range argument: store 33 through r3 (EDOM on Linux --
# confirm), call feraiseexcept(0x20000000) and return .LC10 (quiet NaN).
li 0,33
stw 0,0(3)
lis 3,0x2000
bl feraiseexcept
lis 9,.LC10@ha # See below
la 9,.LC10@l(9)
.L57:
lfd 1,0(9) # Ok, returning the value pointed to by r9
# (It may not be the best implementation but it's an error path
# marked unlikely so I don't care very much)
# Epilogue: restore LR, r31, f30 and f31, release the 48-byte frame, return.
.L1:
lwz 0,52(1)
lwz 31,28(1)
lfd 30,32(1)
mtlr 0
lfd 31,40(1)
addi 1,1,48
blr
.size ndtri, .-ndtri
.ident "GCC: (GNU) 3.3.2 20030908 (Debian prerelease)"
Reply to: