Bug#580889: linux-image-2.6.32-3-amd64: tasks get stuck at pvclock_clocksource_read under xen)
Hi,
here's some more info on the issue.
I used
xm dump-core orbit-lucid orbit-lucid.dump
to dump the domU state to disk and then
crash -e emacs /usr/lib/debug/boot/vmlinux-2.6.32-5-amd64 orbit-lucid.dump
to debug it further. The following applies to version 2.6.32-23.
"ps" shows that kjournald (pid 122) is in "D" state so I used "set
122" to switch to that task. I then also loaded the jbd module to crash using
"mod -s jbd /usr/lib/debug/lib/modules/2.6.32-5-amd64/kernel/fs/jbd/jbd.ko".
The stack trace shown by "bt -f"
PID: 122 TASK: ffff88007da99530 CPU: 0 COMMAND: "kjournald"
#0 [ffff88000288fb80] schedule at ffffffff812f97e6
ffff88000288fb88: 0000000000000246 0000000000000000
ffff88000288fb98: ffff88007e216300 0000000000000001
ffff88000288fba8: 0000000000002cbe 000000000000f9e0
ffff88000288fbb8: ffff88000288ffd8 0000000000015780
ffff88000288fbc8: 0000000000015780 ffff88007da99530
ffff88000288fbd8: ffff88007da99828 00000000540b0200
ffff88000288fbe8: ffffffff8102cdac ffff88007da99828
ffff88000288fbf8: ffffffff814d0b00 ffffffffb365c74e
ffff88000288fc08: ffff88007ffd0af8 00000000024832ea
ffff88000288fc18: 000000003a1a2946 ffffffff8110d79a
ffff88000288fc28: ffff88000328b780 ffff88000288fcd0
ffff88000288fc38: ffff88007da99530 ffff88000328bf98
ffff88000288fc48: 0000000000000002 ffffffff8110d79a
ffff88000288fc58: ffffffff812f9939
#1 [ffff88000288fc58] io_schedule at ffffffff812f9939
ffff88000288fc60: 0000000000000000 ffff88000288fce0
ffff88000288fc70: ffff88000326ca00 ffffffff8110d7d5
#2 [ffff88000288fc78] sync_buffer at ffffffff8110d7d5
ffff88000288fc80: ffffffff812fabfa ffffffff812f9e46
#3 [ffff88000288fc88] __wait_on_bit at ffffffff812f9e46
ffff88000288fc90: ffff880050254e70 ffff8800502365b0
ffff88000288fca0: 0000000000000072 0000000000000002
ffff88000288fcb0: 0000000000000002 ffffffff8110d79a
ffff88000288fcc0: ffff880050236610 ffffffff812f9ee0
#4 [ffff88000288fcc8] out_of_line_wait_on_bit at ffffffff812f9ee0
ffff88000288fcd0: ffff8800502365b0 0000000000000002
ffff88000288fce0: 0000000000000000 ffff88007da99530
ffff88000288fcf0: ffffffff81064a1c ffff88000326ca08
ffff88000288fd00: ffff88000326ca08 ffff8800502365b0
ffff88000288fd10: 0000000000000001 ffff8800502365b0
ffff88000288fd20: ffff8800502344d0 ffff88007e16a9c0
ffff88000288fd30: ffff880002884800 ffffffffa000f1d1
#5 [ffff88000288fd38] journal_commit_transaction at ffffffffa000f1d1
ffff88000288fd40: ffff88007da99530 ffff88000288ffd8
ffff88000288fd50: 0002055153745261 ffff880002884824
ffff88000288fd60: ffff88000288494c ffffffff8145d1f0
ffff88000288fd70: ffff88007db3e000 ffff88000328b780
ffff88000288fd80: ffffffff00000000 00000001812f985f
ffff88000288fd90: ffff880002884824 ffff880050236610
ffff88000288fda0: ffff88007db3e000 000000728100dbe1
ffff88000288fdb0: ffffffff8100dbe1 ffffffff8100e1e2
ffff88000288fdc0: 0000000000000001 ffff88007e360000
ffff88000288fdd0: ffffffff8100e1cf ffff88000288e000
ffff88000288fde0: 00000000000102c4 00000000000102c4
ffff88000288fdf0: 0000000000000001 ffffffff8100dbe1
ffff88000288fe00: ffff88007ff74000 ffffffff8100e1cf
ffff88000288fe10: ffffffff812fabfa 000000005b832b59
ffff88000288fe20: ffff8800028802d0 0000000000000000
ffff88000288fe30: ffff880002884800 ffff880002884824
ffff88000288fe40: ffff88000288fe80 ffff880002884898
ffff88000288fe50: ffff880002884990 ffffffffa0012423
#6 [ffff88000288fe58] kjournald at ffffffffa0012423
is not very readable. However, the backtrace I get with "bt -F" looks
quite useful when I annotate it with source code and comments. I just
copied code of the functions from kernel source and then guessed how
the stack contents maps to local variables:
crash> bt -F
PID: 122 TASK: ffff88007da99530 CPU: 0 COMMAND: "kjournald"
#0 [ffff88000288fb80] schedule at ffffffff812f97e6
ffff88000288fb88: 0000000000000246 0000000000000000
ffff88000288fb98: [kmalloc-192] 0000000000000001
ffff88000288fba8: 0000000000002cbe 000000000000f9e0
ffff88000288fbb8: ffff88000288ffd8 0000000000015780
ffff88000288fbc8: 0000000000015780 ffff88007da99530
ffff88000288fbd8: ffff88007da99828 00000000540b0200
ffff88000288fbe8: pvclock_clocksource_read+58 ffff88007da99828
ffff88000288fbf8: xen_clocksource ffffffffb365c74e
ffff88000288fc08: [task_delay_info] 00000000024832ea
ffff88000288fc18: 000000003a1a2946 sync_buffer
ffff88000288fc28: ffff88000328b780 ffff88000288fcd0
ffff88000288fc38: ffff88007da99530 ffff88000328bf98
ffff88000288fc48: 0000000000000002 sync_buffer
ffff88000288fc58: io_schedule+115
#1 [ffff88000288fc58] io_schedule at ffffffff812f9939
void __sched io_schedule(void)
{
struct rq *rq = raw_rq();
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
current->in_iowait = 1;
schedule();
current->in_iowait = 0;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
} ffff88000288fc60: 0000000000000000 ffff88000288fce0
ffff88000288fc70: ffff88000326ca00 sync_buffer+59
#2 [ffff88000288fc78] sync_buffer at ffffffff8110d7d5
static int sync_buffer(void *word)
{
struct block_device *bd;
struct buffer_head *bh
= container_of(word, struct buffer_head, b_state);
smp_mb();
bd = bh->b_bdev;
if (bd)
blk_run_address_space(bd->bd_inode->i_mapping);
io_schedule();
return 0;
}
ffff88000288fc80: _spin_unlock_irqrestore+13 __wait_on_bit+65
#3 [ffff88000288fc88] __wait_on_bit at ffffffff812f9e46
int __sched
__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
int (*action)(void *), unsigned mode)
{ /// action == sync_buffer
int ret = 0;
do {
prepare_to_wait(wq, &q->wait, mode);
if (test_bit(q->key.bit_nr, q->key.flags)) // checks if bit is set (bit 2 of buffer_head here)
ret = (*action)(q->key.flags);
} while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
finish_wait(wq, &q->wait);
return ret;
}
ffff88000288fc90: [buffer_head] [buffer_head]
ffff88000288fca0: 0000000000000072 0000000000000002
ffff88000288fcb0: 0000000000000002 sync_buffer
ffff88000288fcc0: [buffer_head] out_of_line_wait_on_bit+107
#4 [ffff88000288fcc8] out_of_line_wait_on_bit at ffffffff812f9ee0
int __sched out_of_line_wait_on_bit(void *word, int bit,
int (*action)(void *), unsigned mode)
{ // *zone = page_zone(virt_to_page(word));
wait_queue_head_t *wq = bit_waitqueue(word, bit); // &zone->wait_table[hash_long(val, zone->wait_table_bits)];
DEFINE_WAIT_BIT(wait, word, bit); // struct wait_bit_queue
struct wait_bit_queue {
struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit) == (buffer_head, 2)
wait_queue_t wait;
}
SIZE: 56
struct wait_bit_key {
void *flags = buffer_head
int bit_nr = 2
}
typedef struct __wait_queue {
unsigned int flags == 0?
void *private = current
wait_queue_func_t func == wake_bit_function
struct list_head task_list;
} wait_queue_t;
struct list_head {
struct list_head *next;
struct list_head *prev;
}
return __wait_on_bit(wq, &wait, action, mode);
}
crash> waitq ffff88000326ca08
PID: 122 TASK: ffff88007da99530 CPU: 0 COMMAND: "kjournald"
ffff88000288fcd0: [buffer_head] (=ffff8800502365b0) 0000000000000002
ffff88000288fce0: 0000000000000000 ffff88007da99530 (=current)
ffff88000288fcf0: wake_bit_function ffff88000326ca08 (=next)
ffff88000288fd00: ffff88000326ca08(=prev) [buffer_head]
ffff88000288fd10: 0000000000000001 [buffer_head]
ffff88000288fd20: [journal_head] [kmalloc-192]
ffff88000288fd30: [kmalloc-1024] journal_commit_transaction+1288
#5 [ffff88000288fd38] journal_commit_transaction at ffffffffa000f1d1
ffff88000288fd40: ffff88007da99530 ffff88000288ffd8
ffff88000288fd50: 0002055153745261 [kmalloc-1024]
ffff88000288fd60: [kmalloc-1024] init_task
ffff88000288fd70: ffff88007db3e000 ffff88000328b780
ffff88000288fd80: ffffffff00000000 00000001812f985f
ffff88000288fd90: [kmalloc-1024] [buffer_head]
ffff88000288fda0: ffff88007db3e000 000000728100dbe1
ffff88000288fdb0: xen_force_evtchn_callback+9 check_events+18
ffff88000288fdc0: 0000000000000001 [mm_struct]
ffff88000288fdd0: xen_restore_fl_direct_end ffff88000288e000
ffff88000288fde0: 00000000000102c4 00000000000102c4
ffff88000288fdf0: 0000000000000001 xen_force_evtchn_callback+9
ffff88000288fe00: ffff88007ff74000 xen_restore_fl_direct_end
ffff88000288fe10: _spin_unlock_irqrestore+13 000000005b832b59
ffff88000288fe20: [kmalloc-192] 0000000000000000
ffff88000288fe30: [kmalloc-1024] [kmalloc-1024]
ffff88000288fe40: ffff88000288fe80 [kmalloc-1024]
ffff88000288fe50: [kmalloc-1024] kjournald+223
#6 [ffff88000288fe58] kjournald at ffffffffa0012423
ffff88000288fe60: ffff88000288fe98 ffff88007da99530
ffff88000288fe70: 000000018100b422 [kmalloc-1024]
ffff88000288fe80: 0000000000000000 ffff88007da99530
ffff88000288fe90: autoremove_wake_function ffff88000288fe98
ffff88000288fea0: ffff88000288fe98 0000000000000000
ffff88000288feb0: 0000000000000000 ffff88000288fef8
ffff88000288fec0: [kmalloc-1024] [vm_area_struct]
ffff88000288fed0: kjournald 0000000000000001
ffff88000288fee0: 0000000000000000 kthread+121
#7 [ffff88000288fee8] kthread at ffffffff81064721
ffff88000288fef0: 0000000000000000 0000000000000000
ffff88000288ff00: ffff880000000000 ffff88000288ff08
ffff88000288ff10: ffff88000288ff08 ffff8800032a9780
ffff88000288ff20: ffff88007da99530 [vm_area_struct]
ffff88000288ff30: [vm_area_struct] [task_struct]
ffff88000288ff40: [task_struct] child_rip+10
#8 [ffff88000288ff48] kernel_thread at ffffffff81011baa
=> out_of_line_wait_on_bit is waiting for the BH_Lock bit of the state
variable of the struct buffer_head at ffff8800502365b0 to be zero. The
contents of this buffer_head matches postscript that the end user was
operating at the time of the crash so I think I'm on the right track
here:
crash> struct buffer_head ffff8800502365b0
struct buffer_head {
b_state = 65581,
b_this_page = 0xffff8800502365b0,
b_page = 0xffffea0000dcf2d8,
b_blocknr = 282844,
b_size = 4096,
b_data = 0xffff88003f20d000 ".010)\" ExtendedRoman ROM\n*Font Univers-Bold: Standard \"(501.008)\" ExtendedRoman ROM\n*Font Univers-BoldExt: Standard \"(501.010)\" ExtendedRoman ROM\n*Font Univers-BoldExtObl: Standard \"(501.010)\" ExtendedRoman ROM\n*Font Univers-BoldOblique: Standard \"(501"...,
b_bdev = 0xffff88007fa40000,
b_end_io = 0xffffffff8110da9f <end_buffer_write_sync>,
b_private = 0xffff8800502344d0,
b_assoc_buffers = {
next = 0xffff8800502365f8,
prev = 0xffff8800502365f8
},
b_assoc_map = 0x0,
b_count = {
counter = 3
}
}
crash> enum bh_state_bits
enum bh_state_bits {
BH_Uptodate = 0 *
BH_Dirty = 1
BH_Lock = 2 * <--- the bit we are waiting on
BH_Req = 3 *
BH_Uptodate_Lock = 4
BH_Mapped = 5 *
BH_New = 6
BH_Async_Read = 7
BH_Async_Write = 8
BH_Delay = 9
BH_Boundary = 10
BH_Write_EIO = 11
BH_Ordered = 12
BH_Eopnotsupp = 13
BH_Unwritten = 14
BH_Quiet = 15
BH_PrivateStart = 16 *
};
struct block_device ffff88007fa40000
bd_dev = 211812352, == /dev/xvda (correct)
I see the following possible theories:
1) nothing submits the buffer_head to the io queue
2) the buffer_head is submitted to the queue but nothing tries to
write it to xen
3) the write is done to xen but xen fails to complete it (many
possible reasons for this..)
4) xen writes the block but does not acknowledge it
5) xen acknowledges it but linux somehow does not manage to unlock the
buffer_head properly.
I tried reading the disk using
$ sudo dd if=/dev/xendisk/orbit_root bs=4096 skip=282844 count=1 2> /dev/null |strings -n5
9uLzb
=4,[/
1aC $
1I|rh_
U94!xe
[04Sy
.Kd,C"
T!bP`
`B3p!
Y:5SpQ
A^QA`J
u>4rJ
&3p(*@\
y(un
B2!d`
but this does not look like the postscript that is in the
buffer_head. Does this mean that the write never reached the disk? Or
am I misinterpreting the block numbers somehow?
Reply to: