Bug#588509: linux-image-2.6.32-5-amd64: hangs under Xen after Write protecting the kernel read-only data: 4220k
Here's some debugging output. I installed
linux-image-2.6.32-5-amd64-dbg and linux-source-2.6.32 and then used
sudo gdbserver-xen 127.0.0.1:9999 --attach 37
where 37 is the domid printed by xm list and
sudo gdb /local/lindi/vmlinux-2.6.32-5-amd64
(gdb) target remote 127.0.0.1:9999
on the opensuse dom0 to attach gdb to linux. It seems that Linux (-17)
is stuck trying to print an OOPS:
(gdb) info threads
* 1 Remote target 0xffffffff812f8c78 in __ticket_spin_lock (
lock=<value optimized out>)
at /build/buildd-linux-2.6_2.6.32-17-amd64-FO3n8v/linux-2.6-2.6.32/debian/build/source_amd64_none/arch/x86/include/asm/spinlock.h:114
(gdb) bt full
#0 0xffffffff812f8c78 in __ticket_spin_lock (lock=<value optimized out>)
at /build/buildd-linux-2.6_2.6.32-17-amd64-FO3n8v/linux-2.6-2.6.32/debian/build/source_amd64_none/arch/x86/include/asm/spinlock.h:114
inc = 1
tmp = 0
#1 __raw_spin_lock (lock=<value optimized out>)
at /build/buildd-linux-2.6_2.6.32-17-amd64-FO3n8v/linux-2.6-2.6.32/debian/build/source_amd64_none/arch/x86/include/asm/spinlock.h:190
No locals.
#2 __spin_lock (lock=<value optimized out>)
at /build/buildd-linux-2.6_2.6.32-17-amd64-FO3n8v/linux-2.6-2.6.32/debian/build/source_amd64_none/include/linux/spinlock_api_smp.h:337
No locals.
#3 _spin_lock (lock=<value optimized out>)
at /build/buildd-linux-2.6_2.6.32-17-amd64-FO3n8v/linux-2.6-2.6.32/debian/build/source_amd64_none/kernel/spinlock.c:138
No locals.
#4 0xffffffff8104d344 in vprintk (
fmt=0xffffffff813a8798 "<1>BUG: unable to handle kernel ",
args=0xffff88003fdb93a8)
at /build/buildd-linux-2.6_2.6.32-17-amd64-FO3n8v/linux-2.6-2.6.32/debian/build/source_amd64_none/kernel/printk.c:705
printed_len = 0
current_log_level = 4
flags = 0
p = <value optimized out>
#5 0xffffffff812f6eec in printk (fmt=0xffffffff815d13c0 "")
at /build/buildd-linux-2.6_2.6.32-17-amd64-FO3n8v/linux-2.6-2.6.32/debian/build/source_amd64_none/kernel/printk.c:595
---Type <return> to continue, or q <return> to quit---
args = {{gp_offset = 8, fp_offset = 48,
overflow_arg_area = 0xffff88003fdb9488,
reg_save_area = 0xffff88003fdb93c8}}
r = 1
#6 0xffffffff810311b0 in show_fault_oops (address=<value optimized out>,
error_code=<value optimized out>, regs=<value optimized out>)
at /build/buildd-linux-2.6_2.6.32-17-amd64-FO3n8v/linux-2.6-2.6.32/debian/build/source_amd64_none/arch/x86/mm/fault.c:583
No locals.
#7 no_context (address=<value optimized out>,
error_code=<value optimized out>, regs=<value optimized out>)
at /build/buildd-linux-2.6_2.6.32-17-amd64-FO3n8v/linux-2.6-2.6.32/debian/build/source_amd64_none/arch/x86/mm/fault.c:658
tsk = 0xffff88003fdb4000
flags = 0
sig = <value optimized out>
#8 0xffffffff81031429 in __bad_area_nosemaphore (regs=0xffff88003fdb9598,
error_code=<value optimized out>, address=18446744071582043416,
si_code=196609)
at /build/buildd-linux-2.6_2.6.32-17-amd64-FO3n8v/linux-2.6-2.6.32/debian/build/source_amd64_none/arch/x86/mm/fault.c:741
tsk = 0xffff88003fdb4000
#9 0xffffffff812f9015 in ?? ()
at /build/buildd-linux-2.6_2.6.32-17-amd64-FO3n8v/linux-2.6-2.6.32/debian/build/source_amd64_none/arch/x86/kernel/entry_64.S:1383
No locals.
#10 0x0005398d646e415d in ?? ()
No symbol table info available.
#11 0x00000000d5de35ab in ?? ()
---Type <return> to continue, or q <return> to quit---
No symbol table info available.
#12 0x0000000000000002 in ?? ()
No symbol table info available.
#13 0x000c82198774ef00 in ?? ()
No symbol table info available.
#14 0x0000000000000001 in ?? ()
No symbol table info available.
#15 0xffff8800032e6040 in ?? ()
No symbol table info available.
#16 0x0000000000000202 in ?? ()
No symbol table info available.
#17 0xffff880000000000 in ?? ()
No symbol table info available.
#18 0xffffffff813a87b8 in ?? ()
__kcrctab_unregister_net_sysctl_table = 2623496682
__kstrtab_unregister_net_sysctl_table = "unregister_net_sysctl_table"
net_sysctl_ro_root = {root_list = {next = 0xffffffff814649b0,
prev = 0xffffffff814a6350}, default_set = {list = {
next = 0xffff88003fe34c08, prev = 0xffff88003d8dd808},
parent = 0xffffffff814649c0, is_seen = 0}, lookup = 0,
permissions = 0xffffffff812e4313 <net_ctl_ro_header_perms>}
sysctl_pernet_ops = {list = {next = 0xffffffff8149bcc0,
prev = 0xffffffff814a6140},
init = 0xffffffff812e4323 <sysctl_net_init>,
exit = 0xffffffff812e4375 <sysctl_net_exit>}
__ksymtab_register_net_sysctl_table = {value = 18446744071581877167,
name = 0xffffffff81422f46 "register_net_sysctl_table"}
__kstrtab_register_net_sysctl_table = "register_net_sysctl_table"
__initcall_sysctl_init4 = 0
---Type <return> to continue, or q <return> to quit---
__kcrctab_register_net_sysctl_table = 829491708
__ksymtab_register_net_sysctl_rotable = {value = 18446744071581877142,
name = 0xffffffff81422f2a "register_net_sysctl_rotable"}
__ksymtab_unregister_net_sysctl_table = {value = 18446744071581877137,
name = 0xffffffff81422f0e "unregister_net_sysctl_table"}
net_sysctl_root = {root_list = {next = 0xffffffff814a6390,
prev = 0xffffffff814649b0}, default_set = {list = {next = 0x0,
prev = 0x0}, parent = 0x0, is_seen = 0},
lookup = 0xffffffff812e42e8 <net_ctl_header_lookup>,
permissions = 0xffffffff812e4342 <net_ctl_permissions>}
__kcrctab_register_net_sysctl_rotable = 2528268687
__kstrtab_register_net_sysctl_rotable = "register_net_sysctl_rotable"
#19 0x0000000000000020 in ?? ()
No symbol table info available.
#20 0x0005398d90ee1210 in ?? ()
No symbol table info available.
#21 0x00000000f2572b01 in ?? ()
No symbol table info available.
#22 0x0005398d9113c2e5 in ?? ()
No symbol table info available.
#23 0xffffffff813a87b8 in ?? ()
__kcrctab_unregister_net_sysctl_table = 2623496682
__kstrtab_unregister_net_sysctl_table = "unregister_net_sysctl_table"
net_sysctl_ro_root = {root_list = {next = 0xffffffff814649b0,
prev = 0xffffffff814a6350}, default_set = {list = {
next = 0xffff88003fe34c08, prev = 0xffff88003d8dd808},
parent = 0xffffffff814649c0, is_seen = 0}, lookup = 0,
permissions = 0xffffffff812e4313 <net_ctl_ro_header_perms>}
sysctl_pernet_ops = {list = {next = 0xffffffff8149bcc0,
---Type <return> to continue, or q <return> to quit---
prev = 0xffffffff814a6140},
init = 0xffffffff812e4323 <sysctl_net_init>,
exit = 0xffffffff812e4375 <sysctl_net_exit>}
__ksymtab_register_net_sysctl_table = {value = 18446744071581877167,
name = 0xffffffff81422f46 "register_net_sysctl_table"}
__kstrtab_register_net_sysctl_table = "register_net_sysctl_table"
__initcall_sysctl_init4 = 0
__kcrctab_register_net_sysctl_table = 829491708
__ksymtab_register_net_sysctl_rotable = {value = 18446744071581877142,
name = 0xffffffff81422f2a "register_net_sysctl_rotable"}
__ksymtab_unregister_net_sysctl_table = {value = 18446744071581877137,
name = 0xffffffff81422f0e "unregister_net_sysctl_table"}
net_sysctl_root = {root_list = {next = 0xffffffff814a6390,
prev = 0xffffffff814649b0}, default_set = {list = {next = 0x0,
prev = 0x0}, parent = 0x0, is_seen = 0},
lookup = 0xffffffff812e42e8 <net_ctl_header_lookup>,
permissions = 0xffffffff812e4342 <net_ctl_permissions>}
__kcrctab_register_net_sysctl_rotable = 2528268687
__kstrtab_register_net_sysctl_rotable = "register_net_sysctl_rotable"
#24 0xffff8800032e6040 in ?? ()
No symbol table info available.
#25 0xffffffffffffffff in ?? ()
No symbol table info available.
#26 0xffffffff8102bddf in pvclock_clocksource_read (src=0x1)
at /build/buildd-linux-2.6_2.6.32-17-amd64-FO3n8v/linux-2.6-2.6.32/debian/build/source_amd64_none/arch/x86/kernel/pvclock.c:145
ret = 0
offset = <value optimized out>
last = 1
---Type <return> to continue, or q <return> to quit---
#27 0x0000000000000000 in ?? ()
No symbol table info available.
We probably should figure out what caused the OOPS so the following
information about printk being stuck is probably not very relevant:
(gdb) x/10i $rip
0xffffffff812f8c78 <__ticket_spin_lock+21>: movzwl (%rdi),%edx
0xffffffff812f8c7b <__ticket_spin_lock+24>:
jmp 0xffffffff812f8c72 <__ticket_spin_lock+15>
0xffffffff812f8c7d <_spin_lock+26>: retq
0xffffffff812f8c7e <__raw_write_lock>: subl $0x1000000,%ds:(%rdi)
0xffffffff812f8c85 <__raw_write_lock+7>:
je 0xffffffff812f8c8c <_write_lock+14>
0xffffffff812f8c87 <__raw_write_lock+9>: callq 0xffffffff81193b80
0xffffffff812f8c8c <_write_lock+14>: retq
0xffffffff812f8c8d <__ticket_spin_unlock>: incw (%rdi)
0xffffffff812f8c90 <raw_local_irq_restore>: mov %rsi,%rdi
0xffffffff812f8c93 <raw_local_irq_restore+3>: callq *0xffffffff814629b8
(gdb) si
0xffffffff812f8c7b 114 asm volatile(LOCK_PREFIX "xaddl %0, %1\n"
(gdb) si
0xffffffff812f8c72 114 asm volatile(LOCK_PREFIX "xaddl %0, %1\n"
(gdb) x/10i $rip
0xffffffff812f8c72 <__ticket_spin_lock+15>: cmp %eax,%edx
0xffffffff812f8c74 <__ticket_spin_lock+17>:
je 0xffffffff812f8c7d <_spin_lock+26>
0xffffffff812f8c76 <__ticket_spin_lock+19>: pause
0xffffffff812f8c78 <__ticket_spin_lock+21>: movzwl (%rdi),%edx
0xffffffff812f8c7b <__ticket_spin_lock+24>:
jmp 0xffffffff812f8c72 <__ticket_spin_lock+15>
0xffffffff812f8c7d <_spin_lock+26>: retq
0xffffffff812f8c7e <__raw_write_lock>: subl $0x1000000,%ds:(%rdi)
0xffffffff812f8c85 <__raw_write_lock+7>:
je 0xffffffff812f8c8c <_write_lock+14>
0xffffffff812f8c87 <__raw_write_lock+9>: callq 0xffffffff81193b80
0xffffffff812f8c8c <_write_lock+14>: retq
(gdb) info register
rax 0x1 1
rbx 0x0 0
rcx 0xfffedb34 4294892340
rdx 0x0 0
rsi 0xffff88003fdb93a8 -131940323978328
rdi 0xffffffff815d13c0 -2124606528
rbp 0xffff88003fdb93a8 0xffff88003fdb93a8
rsp 0xffff88003fdb92f0 0xffff88003fdb92f0
r8 0x2 2
r9 0xffff88003fdb4000 -131940323999744
r10 0xffff880000000000 -131941395333120
r11 0x202 514
r12 0xffffffff813a8798 -2126870632
r13 0x4 4
r14 0x3 3
r15 0x0 0
rip 0xffffffff812f8c72 0xffffffff812f8c72 <__ticket_spin_lock+15>
eflags 0x1397 [ CF PF AF SF TF IF #12 ]
cs 0xe033 57395
ss 0xe02b 57387
ds 0x0 0
es 0x0 0
fs 0x0 0
gs 0x0 0
fctrl 0x37f 895
fstat 0x0 0
ftag 0xffff 65535
fiseg 0x0 0
---Type <return> to continue, or q <return> to quit---
fioff 0x0 0
foseg 0x0 0
fooff 0x0 0
fop 0x0 0
mxcsr 0x1f80 [ IM DM ZM OM UM PM ]
(gdb)
However, it seems that the OOPS is caused by a write (error_code==3)
to variable called "last_value" that is in read-only memory (.rodata):
(gdb) frame 7
#7 no_context (address=<value optimized out>,
error_code=<value optimized out>, regs=<value optimized out>)
at /build/buildd-linux-2.6_2.6.32-17-amd64-FO3n8v/linux-2.6-2.6.32/debian/build/source_amd64_none/arch/x86/mm/fault.c:658
658 show_fault_oops(regs, error_code, address);
(gdb) p/x *regs
$10 = {r15 = 0x5398d646e415d, r14 = 0xd5de35ab, r13 = 0x2,
r12 = 0xc82198774ef00, bp = 0x1, bx = 0xffff8800032e6040, r11 = 0x202,
r10 = 0xffff880000000000, r9 = 0xffffffff813a87b8, r8 = 0x20,
ax = 0x5398d90ee1210, cx = 0xf2572b01, dx = 0x5398d9113c2e5,
si = 0xffffffff813a87b8, di = 0xffff8800032e6040,
orig_ax = 0xffffffffffffffff, ip = 0xffffffff8102bddf, cs = 0x10000e030,
flags = 0x10002, sp = 0xffff88003fdb9640, ss = 0xe02b}
(gdb) p error_code
$11 = 3
(gdb) p/x address
$22 = 0xffffffff8130cd18
(gdb) x/6i regs->ip
0xffffffff8102bddf <__cmpxchg>:
cmpxchg %rdx,%ds:0x2e0f30(%rip) # 0xffffffff8130cd18
0xffffffff8102bde8 <pvclock_clocksource_read+122>: cmp %rdx,%rax
0xffffffff8102bdeb <pvclock_clocksource_read+125>:
jne 0xffffffff8102bdda <pvclock_clocksource_read+108>
0xffffffff8102bded <pvclock_clocksource_read+127>: pop %rcx
0xffffffff8102bdee <pvclock_clocksource_read+128>: pop %rbx
0xffffffff8102bdef <pvclock_clocksource_read+129>: pop %rbp
(gdb) shell objdump -axdt /local/lindi/vmlinux-2.6.32-5-amd64 | grep ffffffff8130cd18
ffffffff8130cd18 l O .rodata 0000000000000008 last_value
ffffffff8102bdd3: 48 8b 05 3e 0f 2e 00 mov 0x2e0f3e(%rip),%rax # ffffffff8130cd18 <last_value>
ffffffff8102bddf: f0 48 0f b1 15 30 0f lock cmpxchg %rdx,0x2e0f30(%rip) # ffffffff8130cd18 <last_value>
What is interesting is that -15 does not contain such a variable:
$ objdump -axdt 15/usr/lib/debug/boot/vmlinux-2.6.32-5-amd64|grep last_value ffffffff81671780 l O .bss 0000000000000001 last_value.26980
ffffffff811f5947: 0f b6 05 32 be 47 00 movzbl 0x47be32(%rip),%eax # ffffffff81671780 <last_value.26980>
ffffffff811f5955: 88 15 25 be 47 00 mov %dl,0x47be25(%rip) # ffffffff81671780 <last_value.26980>
$ objdump -axdt 17/usr/lib/debug/boot/vmlinux-2.6.32-5-amd64|grep last_value
ffffffff8130cd18 l O .rodata 0000000000000008 last_value
ffffffff81675bc0 l O .bss 0000000000000001 last_value.26980
ffffffff8102bdd3: 48 8b 05 3e 0f 2e 00 mov 0x2e0f3e(%rip),%rax # ffffffff8130cd18 <last_value>
ffffffff8102bddf: f0 48 0f b1 15 30 0f lock cmpxchg %rdx,0x2e0f30(%rip) # ffffffff8130cd18 <last_value>
ffffffff811f49f7: 0f b6 05 c2 11 48 00 movzbl 0x4811c2(%rip),%eax # ffffffff81675bc0 <last_value.26980>
ffffffff811f4a05: 88 15 b5 11 48 00 mov %dl,0x4811b5(%rip) # ffffffff81675bc0 <last_value.26980>
interdiff -z linux-2.6_2.6.32-15.diff.gz linux-2.6_2.6.32-17.diff.gz
shows
++static atomic64_t last_value = ATOMIC64_INIT(0);
++
+ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+ {
+ struct pvclock_shadow_time shadow;
+ unsigned version;
+ cycle_t ret, offset;
++ u64 last;
+
+ do {
+ version = pvclock_get_time_values(&shadow, src);
+@@ -123,6 +126,27 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+ barrier();
+ } while (version != src->version);
+
++ /*
++ * Assumption here is that last_value, a global accumulator, always goes
++ * forward. If we are less than that, we should not be much smaller.
++ * We assume there is an error marging we're inside, and then the correction
++ * does not sacrifice accuracy.
++ *
++ * For reads: global may have changed between test and return,
++ * but this means someone else updated poked the clock at a later time.
++ * We just need to make sure we are not seeing a backwards event.
++ *
++ * For updates: last_value = ret is not enough, since two vcpus could be
++ * updating at the same time, and one of them could be slightly behind,
++ * making the assumption that last_value always go forward fail to hold.
++ */
++ last = atomic64_read(&last_value);
++ do {
++ if (ret < last)
++ return last;
++ last = atomic64_cmpxchg(&last_value, last, ret);
++ } while (unlikely(last != ret));
++
+ return ret;
+ }
+
-Timo
Reply to: