[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Bug#648811: DRBD+OCFS2: general protection fault: 0000 [#1] SMP



Hi,
I can confirm this bug.
I tried DRBD (two primaries) with OCFS2 too. It seems I hit a very
similar bug after I did a reboot, renamed the nodes, and mounted ocfs2 by hand. The two
identical nodes, boot1 and boot2, are guests on libvirt/KVM (on different physical hosts):

Linux boot1 2.6.32-5-amd64 #1 SMP Sun May 6 04:00:17 UTC 2012 x86_64 GNU/Linux

i  linux-image-2.6.32-5-amd64        2.6.32-45                         Linux 2.6.32 for 64-bit PCs

Both virtual machines have a ttyS0 console configured (both attached), so I
copied and pasted what was on the consoles:

========================== boot1 ==============================

[ 9985.562785] general protection fault: 0000 [#1] SMP
[ 9985.563235] last sysfs file: /sys/fs/o2cb/interface_revision
[ 9985.563707] CPU 0
[ 9985.563893] Modules linked in: ocfs2 quota_tree drbd lru_cache cn nfsd exportfs nfs lockd fscache nfs_acl auth_rpcgss sunrpc ocfs2_dlmfs ocfs2_stack_o2cb ocfs2_dlm ocfs2_nodemanager ocfs2_stackglue configfs loop snd_pcm snd_timer snd soundcore snd_page_alloc psmouse virtio_balloon button serio_raw pcspkr processor evdev i2c_piix4 i2c_core ext4 mbcache jbd2 crc16 dm_mod ata_generic virtio_blk ata_piix uhci_hcd ehci_hcd 8139too libata floppy thermal thermal_sys virtio_pci virtio_ring virtio 8139cp mii usbcore nls_base scsi_mod [last unloaded: scsi_wait_scan]
[ 9985.566703] Pid: 0, comm: swapper Not tainted 2.6.32-5-amd64 #1 Bochs
[ 9985.566703] RIP: 0010:[<ffffffff8129eb2e>]  [<ffffffff8129eb2e>] fib_get_table+0x2b/0x3a
[ 9985.566703] RSP: 0018:ffff880001803bb8  EFLAGS: 00010286
[ 9985.566703] RAX: ffff880000000001 RBX: ffff880001803c20 RCX: ffff880001803c20
[ 9985.566703] RDX: c3f000ff53f000ff RSI: 00000000000000fe RDI: ffffffff816d35b0
[ 9985.566703] RBP: ffff880001803cd0 R08: ffffffff816d35b0 R09: 00000000011fa8c0
[ 9985.566703] R10: 0000000000000002 R11: 0000000000000000 R12: ffff88001f9d0480
[ 9985.566703] R13: 0000000000000000 R14: ffff880001803c20 R15: ffff88001f9d04f8
[ 9985.566703] FS:  0000000000000000(0000) GS:ffff880001800000(0000) knlGS:0000000000000000
[ 9985.566703] CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
[ 9985.566703] CR2: 00000000018c9808 CR3: 000000001dd98000 CR4: 00000000000006f0
[ 9985.566703] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 9985.566703] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 9985.566703] Process swapper (pid: 0, threadinfo ffffffff8142e000, task ffffffff814891f0)
[ 9985.566703] Stack:
[ 9985.566703]  ffffffff812a427e 0000000000000000 ffff88001f9db000 ffff880001803cd0
[ 9985.566703] <0> ffffffff8125e1b1 ffff8800182371a8 ffff880001803c90 0000000000000002
[ 9985.566703] <0> 00000000ffffffea 00000000011fa8c0 ffff8800181b1000 ffff880018057800
[ 9985.566703] Call Trace:
[ 9985.566703]  <IRQ>
[ 9985.566703]  [<ffffffff812a427e>] ? fib4_rule_action+0x35/0x54
[ 9985.566703]  [<ffffffff8125e1b1>] ? fib_rules_lookup+0x89/0xc3
[ 9985.566703]  [<ffffffff812a4474>] ? fib_lookup+0x2d/0x3d
[ 9985.566703]  [<ffffffff8103a311>] ? enqueue_task+0x5f/0x68
[ 9985.566703]  [<ffffffff81271d9f>] ? ip_route_input+0x59e/0xcbf
[ 9985.566703]  [<ffffffff8104a45a>] ? try_to_wake_up+0x289/0x29b
[ 9985.566703]  [<ffffffff8106504b>] ? autoremove_wake_function+0x9/0x2e
[ 9985.566703]  [<ffffffff8103aa06>] ? __wake_up_common+0x44/0x72
[ 9985.566703]  [<ffffffff810ad4e8>] ? cpupri_set+0x10c/0x135
[ 9985.566703]  [<ffffffff81295e3e>] ? arp_process+0x20f/0x60b
[ 9985.566703]  [<ffffffffa00737fb>] ? cp_rx_poll+0x2d8/0x3bd [8139cp]
[ 9985.566703]  [<ffffffff8125039f>] ? net_rx_action+0xae/0x1c9
[ 9985.566703]  [<ffffffff81053d6f>] ? __do_softirq+0xdd/0x1a6
[ 9985.566703]  [<ffffffff81011cac>] ? call_softirq+0x1c/0x30
[ 9985.566703]  [<ffffffff8101322b>] ? do_softirq+0x3f/0x7c
[ 9985.566703]  [<ffffffff81053bdf>] ? irq_exit+0x36/0x76
[ 9985.566703]  [<ffffffff81012922>] ? do_IRQ+0xa0/0xb6
[ 9985.566703]  [<ffffffff810114d3>] ? ret_from_intr+0x0/0x11
[ 9985.566703]  <EOI>
[ 9985.566703]  [<ffffffff8102c584>] ? native_safe_halt+0x2/0x3
[ 9985.566703]  [<ffffffff8101758d>] ? default_idle+0x34/0x51
[ 9985.566703]  [<ffffffff8100fe97>] ? cpu_idle+0xa2/0xda
[ 9985.566703]  [<ffffffff8151c140>] ? early_idt_handler+0x0/0x71
[ 9985.566703]  [<ffffffff8151ccdd>] ? start_kernel+0x3dc/0x3e8
[ 9985.566703]  [<ffffffff8151c3b7>] ? x86_64_start_kernel+0xf9/0x106
[ 9985.566703] Code: 85 f6 b8 fe 00 00 00 0f 44 f0 48 89 f0 83 e0 01 48 c1 e0 03 48 03 87 f0 01 00 00 48 8b 10 eb 03 48 8b 12 48 85 d2 75 03 31 c0 c3 <39> 72 10 48 8b 02 0f 18 08 48 89 d0 75 e7 c3 48 8b 42 10 48 85
[ 9985.566703] RIP  [<ffffffff8129eb2e>] fib_get_table+0x2b/0x3a
[ 9985.566703]  RSP <ffff880001803bb8>
[ 9985.594195] ---[ end trace 49c80a52371bee10 ]---
[ 9985.594581] Kernel panic - not syncing: Fatal exception in interrupt
[ 9985.595102] Pid: 0, comm: swapper Tainted: G      D    2.6.32-5-amd64 #1
[ 9985.595650] Call Trace:
[ 9985.595857]  <IRQ>  [<ffffffff812faf69>] ? panic+0x86/0x143
[ 9985.596356]  [<ffffffff81011673>] ? apic_timer_interrupt+0x13/0x20
[ 9985.596869]  [<ffffffff811b89f1>] ? vgacon_cursor+0x0/0x140
[ 9985.597342]  [<ffffffff812fdc62>] ? oops_end+0x64/0xb4
[ 9985.597764]  [<ffffffff812fdca5>] ? oops_end+0xa7/0xb4
[ 9985.598188]  [<ffffffff812fd155>] ? general_protection+0x25/0x30
[ 9985.598682]  [<ffffffff8129eb2e>] ? fib_get_table+0x2b/0x3a
[ 9985.599141]  [<ffffffff812a427e>] ? fib4_rule_action+0x35/0x54
[ 9985.599622]  [<ffffffff8125e1b1>] ? fib_rules_lookup+0x89/0xc3
[ 9985.600112]  [<ffffffff812a4474>] ? fib_lookup+0x2d/0x3d
[ 9985.600568]  [<ffffffff8103a311>] ? enqueue_task+0x5f/0x68
[ 9985.601017]  [<ffffffff81271d9f>] ? ip_route_input+0x59e/0xcbf
[ 9985.601498]  [<ffffffff8104a45a>] ? try_to_wake_up+0x289/0x29b
[ 9985.601980]  [<ffffffff8106504b>] ? autoremove_wake_function+0x9/0x2e
[ 9985.602522]  [<ffffffff8103aa06>] ? __wake_up_common+0x44/0x72
[ 9985.603011]  [<ffffffff810ad4e8>] ? cpupri_set+0x10c/0x135
[ 9985.603463]  [<ffffffff81295e3e>] ? arp_process+0x20f/0x60b
[ 9985.603925]  [<ffffffffa00737fb>] ? cp_rx_poll+0x2d8/0x3bd [8139cp]
[ 9985.604472]  [<ffffffff8125039f>] ? net_rx_action+0xae/0x1c9
[ 9985.604936]  [<ffffffff81053d6f>] ? __do_softirq+0xdd/0x1a6
[ 9985.605384]  [<ffffffff81011cac>] ? call_softirq+0x1c/0x30
[ 9985.605851]  [<ffffffff8101322b>] ? do_softirq+0x3f/0x7c
[ 9985.606290]  [<ffffffff81053bdf>] ? irq_exit+0x36/0x76
[ 9985.606709]  [<ffffffff81012922>] ? do_IRQ+0xa0/0xb6
[ 9985.607114]  [<ffffffff810114d3>] ? ret_from_intr+0x0/0x11
[ 9985.607559]  <EOI>  [<ffffffff8102c584>] ? native_safe_halt+0x2/0x3
[ 9985.608094]  [<ffffffff8101758d>] ? default_idle+0x34/0x51
[ 9985.608561]  [<ffffffff8100fe97>] ? cpu_idle+0xa2/0xda
[ 9985.608986]  [<ffffffff8151c140>] ? early_idt_handler+0x0/0x71
[ 9985.609465]  [<ffffffff8151ccdd>] ? start_kernel+0x3dc/0x3e8
[ 9985.609929]  [<ffffffff8151c3b7>] ? x86_64_start_kernel+0xf9/0x106


----- last messages in syslog -----

Sep 17 11:52:15 boot1 kernel: [ 9944.833721] OCFS2 1.5.0
Sep 17 11:52:15 boot1 kernel: [ 9944.835069] ocfs2_dlm: Nodes in domain ("612BC4D1190E45C9988476ECE94A89D3"): 0 
Sep 17 11:52:15 boot1 kernel: [ 9944.840480] ocfs2: Mounting device (147,0) on (node 0, slot 0) with ordered data mode.
Sep 17 11:52:15 boot1 kernel: [ 9944.840759] (4855,0):ocfs2_replay_journal:1607 Recovering node 1 from slot 1 on device (147,0)
Sep 17 11:52:16 boot1 kernel: [ 9946.073028] (4855,0):ocfs2_begin_quota_recovery:376 Beginning quota recovery in slot 1
Sep 17 11:52:16 boot1 kernel: [ 9946.078288] (4847,0):ocfs2_finish_quota_recovery:569 Finishing quota recovery in slot 1
Sep 17 11:52:31 boot1 kernel: [ 9961.377383] o2net: accepted connection from node boot2 (num 1) at 192.168.31.52:7777
Sep 17 11:52:35 boot1 kernel: [ 9965.413269] ocfs2_dlm: Node 1 joins domain 612BC4D1190E45C9988476ECE94A89D3
Sep 17 11:52:35 boot1 kernel: [ 9965.413272] ocfs2_dlm: Nodes in domain ("612BC4D1190E45C9988476ECE94A89D3"): 0 1 


========================== boot2 ==============================

[ 9529.404113] block drbd0: PingAck did not arrive in time.
[ 9529.404637] block drbd0: short read expecting header on sock: r=-512
[ 9577.212104] (833,0):o2net_connect_expired:1656 ERROR: no connection established with node 0 after 30.0 seconds, giving up and returning errors.
[ 9591.380104] general protection fault: 0000 [#1] SMP
[ 9591.380576] last sysfs file: /sys/fs/o2cb/interface_revision
[ 9591.381047] CPU 0
[ 9591.381226] Modules linked in: ocfs2 quota_tree drbd lru_cache cn nfsd exportfs nfs lockd fscache nfs_acl auth_rpcgss sunrpc ocfs2_dlmfs ocfs2_stack_o2cb ocfs2_dlm ocfs2_nodemanager ocfs2_stackglue configfs loop psmouse snd_pcm snd_timer snd soundcore snd_page_alloc i2c_piix4 i2c_core pcspkr serio_raw virtio_balloon processor evdev button ext4 mbcache jbd2 crc16 dm_mod ata_generic virtio_blk ata_piix uhci_hcd ehci_hcd 8139too libata floppy thermal thermal_sys virtio_pci virtio_ring virtio 8139cp mii usbcore nls_base scsi_mod [last unloaded: scsi_wait_scan]
[ 9591.384017] Pid: 0, comm: swapper Not tainted 2.6.32-5-amd64 #1 Bochs
[ 9591.384017] RIP: 0010:[<ffffffff8129eb2e>]  [<ffffffff8129eb2e>] fib_get_table+0x2b/0x3a
[ 9591.384017] RSP: 0018:ffff880001803a28  EFLAGS: 00010286
[ 9591.384017] RAX: ffff880000000000 RBX: ffff880001803a90 RCX: ffff880001803a90
[ 9591.384017] RDX: f000ff53f000ff53 RSI: 00000000000000fe RDI: ffffffff816d35b0
[ 9591.384017] RBP: ffff880001803b00 R08: ffffffff816d35b0 R09: ffff88001f967a48
[ 9591.384017] R10: ffff880017a38740 R11: 0000000000000000 R12: ffff88001f9d0480
[ 9591.384017] R13: 0000000000000000 R14: ffff880001803a90 R15: ffff88001f9d04f8
[ 9591.384017] FS:  0000000000000000(0000) GS:ffff880001800000(0000) knlGS:0000000000000000
[ 9591.384017] CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
[ 9591.384017] CR2: 0000000001827ff0 CR3: 000000001fb7c000 CR4: 00000000000006f0
[ 9591.384017] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 9591.384017] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 9591.384017] Process swapper (pid: 0, threadinfo ffffffff8142e000, task ffffffff814891f0)
[ 9591.384017] Stack:
[ 9591.384017]  ffffffff812a427e ffff880017901000 ffff88001f9db000 ffff880001803b00
[ 9591.384017] <0> ffffffff8125e1b1 0000000000000000 ffff880001803ae0 ffff880001803b00
[ 9591.384017] <0> 0000000000000000 ffffffff816d35b0 ffff880001803c30 ffff880017878000
[ 9591.384017] Call Trace:
[ 9591.384017]  <IRQ>
[ 9591.384017]  [<ffffffff812a427e>] ? fib4_rule_action+0x35/0x54
[ 9591.384017]  [<ffffffff8125e1b1>] ? fib_rules_lookup+0x89/0xc3
[ 9591.384017]  [<ffffffff812a4474>] ? fib_lookup+0x2d/0x3d
[ 9591.384017]  [<ffffffff8127108c>] ? __ip_route_output_key+0x38b/0x8a6
[ 9591.384017]  [<ffffffff812715c9>] ? ip_route_output_flow+0x22/0x1cb
[ 9591.384017]  [<ffffffff8129a56f>] ? inet_sk_rebuild_header+0x126/0x358
[ 9591.384017]  [<ffffffff810e60a1>] ? virt_to_head_page+0x9/0x2a
[ 9591.384017]  [<ffffffff81275ddc>] ? ip_cork_release+0x2e/0x3b
[ 9591.384017]  [<ffffffff81289e95>] ? tcp_retransmit_skb+0x91/0x5aa
[ 9591.384017]  [<ffffffff8126e9e0>] ? rt_del+0x7d/0xa4
[ 9591.384017]  [<ffffffff8128b7db>] ? tcp_retransmit_timer+0x46f/0x56b
[ 9591.384017]  [<ffffffff8128f159>] ? tcp_v4_err+0x352/0x4cb
[ 9591.384017]  [<ffffffff812973fd>] ? icmp_rcv+0x1ea/0x220
[ 9591.384017]  [<ffffffff81273c40>] ? ip_local_deliver_finish+0x146/0x1e9
[ 9591.384017]  [<ffffffff8127378f>] ? ip_rcv_finish+0x373/0x38d
[ 9591.384017]  [<ffffffff8124fdb9>] ? process_backlog+0x81/0xb4
[ 9591.384017]  [<ffffffff8106938f>] ? sched_clock_local+0x13/0x74
[ 9591.384017]  [<ffffffff8125039f>] ? net_rx_action+0xae/0x1c9
[ 9591.384017]  [<ffffffff81053d6f>] ? __do_softirq+0xdd/0x1a6
[ 9591.384017]  [<ffffffff8102462a>] ? lapic_next_event+0x18/0x1d
[ 9591.384017]  [<ffffffff81011cac>] ? call_softirq+0x1c/0x30
[ 9591.384017]  [<ffffffff8101322b>] ? do_softirq+0x3f/0x7c
[ 9591.384017]  [<ffffffff81053bdf>] ? irq_exit+0x36/0x76
[ 9591.384017]  [<ffffffff810250f8>] ? smp_apic_timer_interrupt+0x87/0x95
[ 9591.384017]  [<ffffffff81011673>] ? apic_timer_interrupt+0x13/0x20
[ 9591.384017]  <EOI>
[ 9591.384017]  [<ffffffff8102c584>] ? native_safe_halt+0x2/0x3
[ 9591.384017]  [<ffffffff8101758d>] ? default_idle+0x34/0x51
[ 9591.384017]  [<ffffffff8100fe97>] ? cpu_idle+0xa2/0xda
[ 9591.384017]  [<ffffffff8151c140>] ? early_idt_handler+0x0/0x71
[ 9591.384017]  [<ffffffff8151ccdd>] ? start_kernel+0x3dc/0x3e8
[ 9591.384017]  [<ffffffff8151c3b7>] ? x86_64_start_kernel+0xf9/0x106
[ 9591.384017] Code: 85 f6 b8 fe 00 00 00 0f 44 f0 48 89 f0 83 e0 01 48 c1 e0 03 48 03 87 f0 01 00 00 48 8b 10 eb 03 48 8b 12 48 85 d2 75 03 31 c0 c3 <39> 72 10 48 8b 02 0f 18 08 48 89 d0 75 e7 c3 48 8b 42 10 48 85
[ 9591.384017] RIP  [<ffffffff8129eb2e>] fib_get_table+0x2b/0x3a
[ 9591.384017]  RSP <ffff880001803a28>
[ 9591.415967] ---[ end trace 392e24ab78cc83a4 ]---
[ 9591.416395] Kernel panic - not syncing: Fatal exception in interrupt
[ 9591.416969] Pid: 0, comm: swapper Tainted: G      D    2.6.32-5-amd64 #1
[ 9591.417546] Call Trace:
[ 9591.417755]  <IRQ>  [<ffffffff812faf69>] ? panic+0x86/0x143
[ 9591.418260]  [<ffffffff81011673>] ? apic_timer_interrupt+0x13/0x20
[ 9591.418807]  [<ffffffff811b89f1>] ? vgacon_cursor+0x0/0x140
[ 9591.419286]  [<ffffffff812fdc62>] ? oops_end+0x64/0xb4
[ 9591.419729]  [<ffffffff812fdca5>] ? oops_end+0xa7/0xb4
[ 9591.420190]  [<ffffffff812fd155>] ? general_protection+0x25/0x30
[ 9591.420738]  [<ffffffff8129eb2e>] ? fib_get_table+0x2b/0x3a
[ 9591.421210]  [<ffffffff812a427e>] ? fib4_rule_action+0x35/0x54
[ 9591.421713]  [<ffffffff8125e1b1>] ? fib_rules_lookup+0x89/0xc3
[ 9591.422212]  [<ffffffff812a4474>] ? fib_lookup+0x2d/0x3d
[ 9591.422689]  [<ffffffff8127108c>] ? __ip_route_output_key+0x38b/0x8a6
[ 9591.423220]  [<ffffffff812715c9>] ? ip_route_output_flow+0x22/0x1cb
[ 9591.423760]  [<ffffffff8129a56f>] ? inet_sk_rebuild_header+0x126/0x358
[ 9591.424339]  [<ffffffff810e60a1>] ? virt_to_head_page+0x9/0x2a
[ 9591.424875]  [<ffffffff81275ddc>] ? ip_cork_release+0x2e/0x3b
[ 9591.425350]  [<ffffffff81289e95>] ? tcp_retransmit_skb+0x91/0x5aa
[ 9591.425848]  [<ffffffff8126e9e0>] ? rt_del+0x7d/0xa4
[ 9591.426261]  [<ffffffff8128b7db>] ? tcp_retransmit_timer+0x46f/0x56b
[ 9591.426807]  [<ffffffff8128f159>] ? tcp_v4_err+0x352/0x4cb
[ 9591.427246]  [<ffffffff812973fd>] ? icmp_rcv+0x1ea/0x220
[ 9591.427694]  [<ffffffff81273c40>] ? ip_local_deliver_finish+0x146/0x1e9
[ 9591.428246]  [<ffffffff8127378f>] ? ip_rcv_finish+0x373/0x38d
[ 9591.428770]  [<ffffffff8124fdb9>] ? process_backlog+0x81/0xb4
[ 9591.429250]  [<ffffffff8106938f>] ? sched_clock_local+0x13/0x74
[ 9591.429763]  [<ffffffff8125039f>] ? net_rx_action+0xae/0x1c9
[ 9591.430344]  [<ffffffff81053d6f>] ? __do_softirq+0xdd/0x1a6
[ 9591.430826]  [<ffffffff8102462a>] ? lapic_next_event+0x18/0x1d
[ 9591.431310]  [<ffffffff81011cac>] ? call_softirq+0x1c/0x30
[ 9591.431790]  [<ffffffff8101322b>] ? do_softirq+0x3f/0x7c
[ 9591.432261]  [<ffffffff81053bdf>] ? irq_exit+0x36/0x76
[ 9591.432747]  [<ffffffff810250f8>] ? smp_apic_timer_interrupt+0x87/0x95
[ 9591.433280]  [<ffffffff81011673>] ? apic_timer_interrupt+0x13/0x20
[ 9591.433797]  <EOI>  [<ffffffff8102c584>] ? native_safe_halt+0x2/0x3
[ 9591.434336]  [<ffffffff8101758d>] ? default_idle+0x34/0x51
[ 9591.434805]  [<ffffffff8100fe97>] ? cpu_idle+0xa2/0xda
[ 9591.435221]  [<ffffffff8151c140>] ? early_idt_handler+0x0/0x71
[ 9591.435712]  [<ffffffff8151ccdd>] ? start_kernel+0x3dc/0x3e8
[ 9591.436191]  [<ffffffff8151c3b7>] ? x86_64_start_kernel+0xf9/0x106


----- last messages in syslog -----

Sep 17 11:52:32 boot2 kernel: [ 9494.857236] o2net: connected to node boot1 (num 0) at 192.168.31.51:7777
Sep 17 11:52:36 boot2 kernel: [ 9498.890803] OCFS2 1.5.0
Sep 17 11:52:36 boot2 kernel: [ 9498.893072] ocfs2_dlm: Nodes in domain ("612BC4D1190E45C9988476ECE94A89D3"): 0 1
Sep 17 11:52:36 boot2 kernel: [ 9498.903794] ocfs2: Mounting device (147,0) on (node 1, slot 1) with ordered data mode.
Sep 17 11:53:06 boot2 kernel: [ 9529.404113] block drbd0: PingAck did not arrive in time.
Sep 17 11:53:06 boot2 kernel: [ 9529.404614] block drbd0: peer( Primary -> Unknown ) conn( Connected -> NetworkFailure ) pdsk( UpToDate -> DUnknown )
Sep 17 11:53:06 boot2 kernel: [ 9529.404621] block drbd0: asender terminated
Sep 17 11:53:06 boot2 kernel: [ 9529.404622] block drbd0: Terminating drbd0_asender
Sep 17 11:53:06 boot2 kernel: [ 9529.404637] block drbd0: short read expecting header on sock: r=-512
Sep 17 11:53:06 boot2 kernel: [ 9529.405213] block drbd0: Creating new current UUID
Sep 17 11:53:06 boot2 kernel: [ 9529.405625] block drbd0: Connection closed
Sep 17 11:53:06 boot2 kernel: [ 9529.405629] block drbd0: conn( NetworkFailure -> Unconnected )
Sep 17 11:53:06 boot2 kernel: [ 9529.405632] block drbd0: receiver terminated
Sep 17 11:53:06 boot2 kernel: [ 9529.405633] block drbd0: Restarting drbd0_receiver
Sep 17 11:53:06 boot2 kernel: [ 9529.405634] block drbd0: receiver (re)started
Sep 17 11:53:06 boot2 kernel: [ 9529.405645] block drbd0: conn( Unconnected -> WFConnection )
Sep 17 11:53:24 boot2 kernel: [ 9547.212045] o2net: connection to node boot1 (num 0) at 192.168.31.51:7777 has been idle for 30.0 seconds, shutting it down.
Sep 17 11:53:24 boot2 kernel: [ 9547.212055] (0,0):o2net_idle_timer:1495 here are some times that might help debug the situation: (tmr 1347875574.463110 now 1347875604.462549 dr 1347875574.463065 adv 1347875574.463110:1347875574.463111 func (343e2184:505) 1347875556.162729:1347875556.162730)
Sep 17 11:53:24 boot2 kernel: [ 9547.212158] o2net: no longer connected to node boot1 (num 0) at 192.168.31.51:7777
Sep 17 11:53:54 boot2 kernel: [ 9577.212104] (833,0):o2net_connect_expired:1656 ERROR: no connection established with node 0 after 30.0 seconds, giving up and returning errors.


Kind regards
-- 
Zito
global {
	usage-count yes;
	# minor-count dialog-refresh disable-ip-verification
}

common {
	protocol C;

	handlers {
		pri-on-incon-degr "/usr/lib/drbd/notify-pri-on-incon-degr.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
		pri-lost-after-sb "/usr/lib/drbd/notify-pri-lost-after-sb.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
		local-io-error "/usr/lib/drbd/notify-io-error.sh; /usr/lib/drbd/notify-emergency-shutdown.sh; echo o > /proc/sysrq-trigger ; halt -f";
		# fence-peer "/usr/lib/drbd/crm-fence-peer.sh";
		# split-brain "/usr/lib/drbd/notify-split-brain.sh root";
		# out-of-sync "/usr/lib/drbd/notify-out-of-sync.sh root";
		# before-resync-target "/usr/lib/drbd/snapshot-resync-target-lvm.sh -p 15 -- -c 16k";
		# after-resync-target /usr/lib/drbd/unsnapshot-resync-target-lvm.sh;
	}

	startup {
		# wfc-timeout degr-wfc-timeout outdated-wfc-timeout wait-after-sb;
		wfc-timeout 180;
		degr-wfc-timeout 180;
	}

	disk {
		# on-io-error fencing use-bmbv no-disk-barrier no-disk-flushes
		# no-disk-drain no-md-flushes max-bio-bvecs   
		on-io-error call-local-io-error;
	}

	net {
		# sndbuf-size rcvbuf-size timeout connect-int ping-int ping-timeout max-buffers
		# max-epoch-size ko-count allow-two-primaries cram-hmac-alg shared-secret
		# after-sb-0pri after-sb-1pri after-sb-2pri data-integrity-alg no-tcp-cork
	}

	syncer {
		# rate after al-extents use-rle cpu-mask verify-alg csums-alg
		rate	20M;
	}
}
resource srv {

	meta-disk internal;
	device minor 0;

	on boot1 {
		disk /dev/boot1/srv;
		address 192.168.31.51:7788;
	}

	on boot2 {
		disk /dev/boot2/srv;
		address 192.168.31.52:7788;
	}

	net {
		allow-two-primaries;
		after-sb-0pri discard-zero-changes;
		after-sb-1pri discard-secondary;
		after-sb-2pri disconnect;
	}

	startup {
		become-primary-on both;
	}

}
node:
	ip_port = 7777
	ip_address = 192.168.31.51
	number = 0
	name = boot1
	cluster = ocfs2

node:
	ip_port = 7777
	ip_address = 192.168.31.52
	number = 1
	name = boot2
	cluster = ocfs2

cluster:
	node_count = 2
	name = ocfs2

Reply to: