Re: Bug#106716: gcc-3.0.1 refuses to compile Linux kernel
On 26 Jul 01 at 18:26, Christopher C. Chimelis wrote:
> > ncpfs code is affected by this (fortunately it still links as I
> > gave up teaching gcc to inline code long ago). And as there is no
> > documentation since when this option is supported, it is just guessing
> > (or someone has to add -flimit-inline check to autoconf or kernel makefile).
>
> Ok, to clarify what I think you're trying to say:
> gcc 2.95.x used -finline-limit-N
> gcc 3.0.x uses -finline-limit=N
> (no, that's not a typo on my part...the hyphen was replaced by an equal
> sign according to the docs)
>
> Since neither 2.95.x or 3.0.x understand the other's options, it's
> not inlining the code that needs to be? I understand what the various gcc
> passes do WRT inlining, so no need to explain that portion...I'm just
> trying to understand why changing the kernel Makefile is such a big deal,
> especially since the kernel folks haven't began recommending that the
Fortunately 3.0.x understands -finline-limit-N , so it can be simple
check for -fno-strict-aliasing is. Unless 3.0.2 will stop supporting
this older format.
> kernel be compiled with gcc 3.0.x yet anyway. There are MANY more
> pitfalls involved in compiling kernels with gcc 3.0.x on some
> platforms. I've had to patch Alpha's kernel sources just to compile the
> kernel at all with gcc 3.0.
> For the record, according to 2.4.6's Documentation/Changes file:
I was always under impression that in future you want to get old gcc2.91.xx
replaced with newer one. If newer one refuses to compile kernel...
>
> > extern inline void copy_siginfo(siginfo_t* to, siginfo_t* from) {
> > if (from->si_code < 0)
> > memcpy(to, from, sizeof(siginfo_t));
> > else
> > memcpy(to, from, 3*sizeof(int) + sizeof(from->_sifields._sigchld));
> > }
> >
> > then I think that there is something wrong...
>
> Hmm...I'm definitely going to have to play with this. Can you come up
> with a small testcase so that I can test other architectures as well?
After digging through - problem is not with copy_siginfo, but with
kernel's memcpy.
I understand that internal code for __constant_memcpy can be very large
due to switch(), but as only one branch is then really used (and
correctly retrieved in copy_siginfo), compiler should not give up
with function too large. It does what documentation says it does (modulo
default limit), but - it could even optimize whole function to nothing,
but still complain about non-inlinable code.
And btw, for this particular example __constant_memcpy inlines with
'-finline-limit=143' or larger. So I think that 100 is just unreasonable
low inline limit, as real examples show that code generated from such
function contains only 5 asm instructions... Maybe if optimizer could
check longest code path in __constant_memcpy instead of size of whole
function, it could work.
Or even better - I believe that there is bug in how size of
copy_siginfo was computed. As compiler knew size parameter passed to
__constant_memcpy even before whole copy_siginfo was compiled,
it should threw unused portions of inlined __constant_memcpy out,
should not? (and I do not believe that after this throwing out
copy_siginfo size is 143...)
Thanks,
Petr Vandrovec
vandrove@vc.cvut.cz
Generated code (+ 2 comments):
.file "xx.c"
.section .rodata
.LC0:
.string "%u\n"
.text
.align 16
.globl main
.type main,@function
main:
pushl %edi
xorl %eax, %eax
subl $288, %esp
cld
movl %esp, %edi
movl $33, %ecx
rep
stosl
movl %esp, %eax ## Why not push %esp?
pushl %eax ## It does same thing...
leal 148(%esp), %eax
pushl %eax
call copy_siginfo
movl 152(%esp), %eax ## pushl 152(%esp) ?
pushl %eax ## instead of using %eax
pushl $.LC0
call printf
addl $304, %esp
popl %edi
ret
.Lfe1:
.size main,.Lfe1-main
.align 16
.type copy_siginfo,@function
copy_siginfo:
pushl %edi
pushl %esi
movl 16(%esp), %esi
movl 12(%esp), %edi
movl (%esi), %edx
testl %edx, %edx
js .L64
movl $6, %ecx
.L65:
#APP
rep ; movsl
#NO_APP
popl %esi
popl %edi
ret
.p2align 4,,7
.L64:
movl $33, %ecx
jmp .L65
.Lfe2:
.size copy_siginfo,.Lfe2-copy_siginfo
.ident "GCC: (GNU) 3.0.1 20010721 (Debian prerelease)"
Source:
#include <linux/types.h>
static inline void * __memcpy(void * to, const void * from, size_t n)
{
int d0, d1, d2;
__asm__ __volatile__(
"rep ; movsl\n\t"
"testb $2,%b4\n\t"
"je 1f\n\t"
"movsw\n"
"1:\ttestb $1,%b4\n\t"
"je 2f\n\t"
"movsb\n"
"2:"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
:"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
: "memory");
return (to);
}
/*
* This looks horribly ugly, but the compiler can optimize it totally,
* as the count is constant.
*/
static inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
switch (n) {
case 0:
return to;
case 1:
*(unsigned char *)to = *(const unsigned char *)from;
return to;
case 2:
*(unsigned short *)to = *(const unsigned short *)from;
return to;
case 3:
*(unsigned short *)to = *(const unsigned short *)from;
*(2+(unsigned char *)to) = *(2+(const unsigned char *)from);
return to;
case 4:
*(unsigned long *)to = *(const unsigned long *)from;
return to;
case 6: /* for Ethernet addresses */
*(unsigned long *)to = *(const unsigned long *)from;
*(2+(unsigned short *)to) = *(2+(const unsigned short *)from);
return to;
case 8:
*(unsigned long *)to = *(const unsigned long *)from;
*(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
return to;
case 12:
*(unsigned long *)to = *(const unsigned long *)from;
*(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
*(2+(unsigned long *)to) = *(2+(const unsigned long *)from);
return to;
case 16:
*(unsigned long *)to = *(const unsigned long *)from;
*(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
*(2+(unsigned long *)to) = *(2+(const unsigned long *)from);
*(3+(unsigned long *)to) = *(3+(const unsigned long *)from);
return to;
case 20:
*(unsigned long *)to = *(const unsigned long *)from;
*(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
*(2+(unsigned long *)to) = *(2+(const unsigned long *)from);
*(3+(unsigned long *)to) = *(3+(const unsigned long *)from);
*(4+(unsigned long *)to) = *(4+(const unsigned long *)from);
return to;
}
#define COMMON(x) \
__asm__ __volatile__( \
"rep ; movsl" \
x \
: "=&c" (d0), "=&D" (d1), "=&S" (d2) \
: "0" (n/4),"1" ((long) to),"2" ((long) from) \
: "memory");
{
int d0, d1, d2;
switch (n % 4) {
case 0: COMMON(""); return to;
case 1: COMMON("\n\tmovsb"); return to;
case 2: COMMON("\n\tmovsw"); return to;
default: COMMON("\n\tmovsw\n\tmovsb"); return to;
}
}
#undef COMMON
}
#if 0
#define memcpy(t, f, n) \
(__builtin_constant_p(n) ? \
__constant_memcpy((t),(f),(n)) : \
__memcpy((t),(f),(n)))
#else
#define memcpy(t, f, n) \
__constant_memcpy((t),(f),(n))
/*
#define memcpy(t, f, n) \
__memcpy((t),(f),(n))
works fine, so switch is a culprit */
#endif
/* XXX: This structure was copied from the Alpha; is there an iBCS version? */
typedef struct siginfo {
int si_code;
int _pad[32];
} siginfo_t;
/* change this to extern... */
static inline void copy_siginfo(siginfo_t *to, siginfo_t *from)
{
if (from->si_code < 0)
memcpy(to, from, sizeof(siginfo_t));
else
memcpy(to, from, sizeof(int) * 6);
}
void main(void) {
siginfo_t a;
siginfo_t b;
memset(&b, 0, sizeof(b));
copy_siginfo(&a, &b);
printf("%u\n", a.si_code);
}
Reply to: