[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Bug#410807: Patch for "kernel BUG at drivers/xen/core/evtchn.c:481"



Hi,

I believe this issue is fixed by
http://xenbits.xensource.com/xen-unstable.hg?rev/914304b3a3da

Without the patch (2.6.18.dfsg.1-18etch1) I was able to reproduce the
problem by repeatedly starting 4 guest domains in parallel[0] in a
around 5-6 hours or 288 iterations (first time I repro'd it was a couple
of hours, I wasn't counting time or iterations that time).

With the patch applied on top of the svn kernel
(2.6.18.dfsg.1-21.0.hellion0) the test survived overnight, around 16
hours and 850 iterations.

I've attached a version of this backported to the Etch kernel (some of
the function names differ). I also attached the svn diff-of-a-diff
against revision in r11588, 2.6.18.dfsg.1-22.

This issue is already fixed in the Lenny/Sid kernels which use the
paravirt_ops kernel port which has completely reimplemented event
channel code.

Cheers,
Ian.

[0] n=0 ; while : ; do date ; echo iteration $n ; xm create
debian-x86_32p-1 & xm create debian-x86_32p-2 & xm create
rhel44-x86_32p-1 kernel=/boot/vmlinuz-2.6.18.8-x86_32p-xenU ramdisk= &
xm create rhel41-x86_32p-1 kernel=/boot/vmlinuz-2.6.18.8-x86_32p-xenU
ramdisk= & sleep 1m ; xm dest debian-1 ; xm dest debian-2 ; xm dest
rhel41-1 ; xm dest rhel44-1 ; sleep 5s ; n=$(($n + 1)) ; done

-- 
Ian Campbell

I haven't lost my mind; I know exactly where I left it.
Index: debian/patches/series/22-extra
===================================================================
--- debian/patches/series/22-extra	(revision 0)
+++ debian/patches/series/22-extra	(revision 0)
@@ -0,0 +1 @@
++ features/all/xen/evtchn-BUG.patch *_xen *_xen-vserver
Index: debian/patches/features/all/xen/evtchn-BUG.patch
===================================================================
--- debian/patches/features/all/xen/evtchn-BUG.patch	(revision 0)
+++ debian/patches/features/all/xen/evtchn-BUG.patch	(revision 0)
@@ -0,0 +1,128 @@
+# HG changeset patch
+# User kfraser@localhost.localdomain
+# Date 1169559560 0
+# Node ID 914304b3a3da6fc5bad12f742bae3893b53d20bc
+# Parent  ee7c422c5f7b79e0cc0ae5670af81b400a72357f
+linux: Fix enable_irq() crash by removing a BUG_ON() assumption in our
+event-channel retrigger() function. Also clean up bitmap usages.
+Signed-off-by: Keir Fraser <keir@xensource.com>
+
+Index: source-amd64-xen/drivers/xen/core/evtchn.c
+===================================================================
+--- source-amd64-xen.orig/drivers/xen/core/evtchn.c	2008-06-03 21:11:44.000000000 +0100
++++ source-amd64-xen/drivers/xen/core/evtchn.c	2008-06-03 21:21:24.000000000 +0100
+@@ -103,7 +103,7 @@
+ static int irq_bindcount[NR_IRQS];
+ 
+ /* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
+-static unsigned long pirq_needs_eoi[NR_PIRQS/sizeof(unsigned long)];
++static DECLARE_BITMAP(pirq_needs_eoi, NR_PIRQS);
+ 
+ #ifdef CONFIG_SMP
+ 
+@@ -472,14 +472,19 @@
+ 	rebind_irq_to_cpu(irq, tcpu);
+ }
+ 
+-static int retrigger(unsigned int irq)
++static int resend_irq_on_evtchn(unsigned int i)
+ {
+-	int evtchn = evtchn_from_irq(irq);
++	int masked, evtchn = evtchn_from_irq(i);
+ 	shared_info_t *s = HYPERVISOR_shared_info;
++
+ 	if (!VALID_EVTCHN(evtchn))
+ 		return 1;
+-	BUG_ON(!synch_test_bit(evtchn, &s->evtchn_mask[0]));
+-	synch_set_bit(evtchn, &s->evtchn_pending[0]);
++
++	masked = synch_test_and_set_bit(evtchn, s->evtchn_mask);
++	synch_set_bit(evtchn, s->evtchn_pending);
++	if (!masked)
++		unmask_evtchn(evtchn);
++
+ 	return 1;
+ }
+ 
+@@ -551,13 +556,13 @@
+ #ifdef CONFIG_SMP
+ 	.set_affinity	= set_affinity_irq,
+ #endif
+-	.retrigger	= retrigger,
++	.retrigger	= resend_irq_on_evtchn,
+ };
+ 
+ static inline void pirq_unmask_notify(int pirq)
+ {
+ 	struct physdev_eoi eoi = { .irq = pirq };
+-	if (unlikely(test_bit(pirq, &pirq_needs_eoi[0])))
++	if (unlikely(test_bit(pirq, pirq_needs_eoi)))
+ 		(void)HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
+ }
+ 
+@@ -679,7 +684,7 @@
+ #ifdef CONFIG_SMP
+ 	.set_affinity	= set_affinity_irq,
+ #endif
+-	.retrigger	= retrigger,
++	.retrigger	= resend_irq_on_evtchn,
+ };
+ 
+ int irq_ignore_unhandled(unsigned int irq)
+@@ -693,16 +698,6 @@
+ 	return !!(irq_status.flags & XENIRQSTAT_shared);
+ }
+ 
+-void resend_irq_on_evtchn(unsigned int i)
+-{
+-	int evtchn = evtchn_from_irq(i);
+-	shared_info_t *s = HYPERVISOR_shared_info;
+-	if (!VALID_EVTCHN(evtchn))
+-		return;
+-	BUG_ON(!synch_test_bit(evtchn, &s->evtchn_mask[0]));
+-	synch_set_bit(evtchn, &s->evtchn_pending[0]);
+-}
+-
+ void notify_remote_via_irq(int irq)
+ {
+ 	int evtchn = evtchn_from_irq(irq);
+@@ -715,7 +710,7 @@
+ void mask_evtchn(int port)
+ {
+ 	shared_info_t *s = HYPERVISOR_shared_info;
+-	synch_set_bit(port, &s->evtchn_mask[0]);
++	synch_set_bit(port, s->evtchn_mask);
+ }
+ EXPORT_SYMBOL_GPL(mask_evtchn);
+ 
+@@ -734,14 +729,10 @@
+ 		return;
+ 	}
+ 
+-	synch_clear_bit(port, &s->evtchn_mask[0]);
++	synch_clear_bit(port, s->evtchn_mask);
+ 
+-	/*
+-	 * The following is basically the equivalent of 'hw_resend_irq'. Just
+-	 * like a real IO-APIC we 'lose the interrupt edge' if the channel is
+-	 * masked.
+-	 */
+-	if (synch_test_bit(port, &s->evtchn_pending[0]) &&
++	/* Did we miss an interrupt 'edge'? Re-fire if so. */
++	if (synch_test_bit(port, s->evtchn_pending) &&
+ 	    !synch_test_and_set_bit(port / BITS_PER_LONG,
+ 				    &vcpu_info->evtchn_pending_sel))
+ 		vcpu_info->evtchn_upcall_pending = 1;
+Index: source-amd64-xen/include/xen/evtchn.h
+===================================================================
+--- source-amd64-xen.orig/include/xen/evtchn.h	2008-06-03 21:11:48.000000000 +0100
++++ source-amd64-xen/include/xen/evtchn.h	2008-06-03 21:18:30.000000000 +0100
+@@ -95,7 +95,7 @@
+ static inline void clear_evtchn(int port)
+ {
+ 	shared_info_t *s = HYPERVISOR_shared_info;
+-	synch_clear_bit(port, &s->evtchn_pending[0]);
++	synch_clear_bit(port, s->evtchn_pending);
+ }
+ 
+ static inline void notify_remote_via_evtchn(int port)
Index: debian/changelog
===================================================================
--- debian/changelog	(revision 11588)
+++ debian/changelog	(working copy)
@@ -1,7 +1,12 @@
 linux-2.6 (2.6.18.dfsg.1-22) UNRELEASED; urgency=high
 
+  [ dann frazier ]
   * Merge in changes from 2.6.18.dfsg.1-18etch6
 
+  [ Ian Campbell ]
+  * Backport http://xenbits.xensource.com/xen-unstable.hg?rev/914304b3a3da,
+    fixing kernel BUG at drivers/xen/core/evtchn.c:481 (closes: 410807).
+
  -- dann frazier <dannf@debian.org>  Mon, 09 Jun 2008 01:21:13 -0600
 
 linux-2.6 (2.6.18.dfsg.1-21) stable; urgency=high
# HG changeset patch
# User kfraser@localhost.localdomain
# Date 1169559560 0
# Node ID 914304b3a3da6fc5bad12f742bae3893b53d20bc
# Parent  ee7c422c5f7b79e0cc0ae5670af81b400a72357f
linux: Fix enable_irq() crash by removing a BUG_ON() assumption in our
event-channel retrigger() function. Also clean up bitmap usages.
Signed-off-by: Keir Fraser <keir@xensource.com>

Index: source-amd64-xen/drivers/xen/core/evtchn.c
===================================================================
--- source-amd64-xen.orig/drivers/xen/core/evtchn.c	2008-06-03 21:11:44.000000000 +0100
+++ source-amd64-xen/drivers/xen/core/evtchn.c	2008-06-03 21:21:24.000000000 +0100
@@ -103,7 +103,7 @@
 static int irq_bindcount[NR_IRQS];
 
 /* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
-static unsigned long pirq_needs_eoi[NR_PIRQS/sizeof(unsigned long)];
+static DECLARE_BITMAP(pirq_needs_eoi, NR_PIRQS);
 
 #ifdef CONFIG_SMP
 
@@ -472,14 +472,19 @@
 	rebind_irq_to_cpu(irq, tcpu);
 }
 
-static int retrigger(unsigned int irq)
+static int resend_irq_on_evtchn(unsigned int i)
 {
-	int evtchn = evtchn_from_irq(irq);
+	int masked, evtchn = evtchn_from_irq(i);
 	shared_info_t *s = HYPERVISOR_shared_info;
+
 	if (!VALID_EVTCHN(evtchn))
 		return 1;
-	BUG_ON(!synch_test_bit(evtchn, &s->evtchn_mask[0]));
-	synch_set_bit(evtchn, &s->evtchn_pending[0]);
+
+	masked = synch_test_and_set_bit(evtchn, s->evtchn_mask);
+	synch_set_bit(evtchn, s->evtchn_pending);
+	if (!masked)
+		unmask_evtchn(evtchn);
+
 	return 1;
 }
 
@@ -551,13 +556,13 @@
 #ifdef CONFIG_SMP
 	.set_affinity	= set_affinity_irq,
 #endif
-	.retrigger	= retrigger,
+	.retrigger	= resend_irq_on_evtchn,
 };
 
 static inline void pirq_unmask_notify(int pirq)
 {
 	struct physdev_eoi eoi = { .irq = pirq };
-	if (unlikely(test_bit(pirq, &pirq_needs_eoi[0])))
+	if (unlikely(test_bit(pirq, pirq_needs_eoi)))
 		(void)HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
 }
 
@@ -679,7 +684,7 @@
 #ifdef CONFIG_SMP
 	.set_affinity	= set_affinity_irq,
 #endif
-	.retrigger	= retrigger,
+	.retrigger	= resend_irq_on_evtchn,
 };
 
 int irq_ignore_unhandled(unsigned int irq)
@@ -693,16 +698,6 @@
 	return !!(irq_status.flags & XENIRQSTAT_shared);
 }
 
-void resend_irq_on_evtchn(unsigned int i)
-{
-	int evtchn = evtchn_from_irq(i);
-	shared_info_t *s = HYPERVISOR_shared_info;
-	if (!VALID_EVTCHN(evtchn))
-		return;
-	BUG_ON(!synch_test_bit(evtchn, &s->evtchn_mask[0]));
-	synch_set_bit(evtchn, &s->evtchn_pending[0]);
-}
-
 void notify_remote_via_irq(int irq)
 {
 	int evtchn = evtchn_from_irq(irq);
@@ -715,7 +710,7 @@
 void mask_evtchn(int port)
 {
 	shared_info_t *s = HYPERVISOR_shared_info;
-	synch_set_bit(port, &s->evtchn_mask[0]);
+	synch_set_bit(port, s->evtchn_mask);
 }
 EXPORT_SYMBOL_GPL(mask_evtchn);
 
@@ -734,14 +729,10 @@
 		return;
 	}
 
-	synch_clear_bit(port, &s->evtchn_mask[0]);
+	synch_clear_bit(port, s->evtchn_mask);
 
-	/*
-	 * The following is basically the equivalent of 'hw_resend_irq'. Just
-	 * like a real IO-APIC we 'lose the interrupt edge' if the channel is
-	 * masked.
-	 */
-	if (synch_test_bit(port, &s->evtchn_pending[0]) &&
+	/* Did we miss an interrupt 'edge'? Re-fire if so. */
+	if (synch_test_bit(port, s->evtchn_pending) &&
 	    !synch_test_and_set_bit(port / BITS_PER_LONG,
 				    &vcpu_info->evtchn_pending_sel))
 		vcpu_info->evtchn_upcall_pending = 1;
Index: source-amd64-xen/include/xen/evtchn.h
===================================================================
--- source-amd64-xen.orig/include/xen/evtchn.h	2008-06-03 21:11:48.000000000 +0100
+++ source-amd64-xen/include/xen/evtchn.h	2008-06-03 21:18:30.000000000 +0100
@@ -95,7 +95,7 @@
 static inline void clear_evtchn(int port)
 {
 	shared_info_t *s = HYPERVISOR_shared_info;
-	synch_clear_bit(port, &s->evtchn_pending[0]);
+	synch_clear_bit(port, s->evtchn_pending);
 }
 
 static inline void notify_remote_via_evtchn(int port)

Attachment: signature.asc
Description: This is a digitally signed message part


Reply to: