Hi, I believe this issue is fixed by http://xenbits.xensource.com/xen-unstable.hg?rev/914304b3a3da Without the patch (2.6.18.dfsg.1-18etch1) I was able to reproduce the problem by repeatedly starting 4 guest domains in parallel[0] in a around 5-6 hours or 288 iterations (first time I repro'd it was a couple of hours, I wasn't counting time or iterations that time). With the patch applied on top of the svn kernel (2.6.18.dfsg.1-21.0.hellion0) the test survived overnight, around 16 hours and 850 iterations. I've attached a version of this backported to the Etch kernel (some of the function names differ). I also attached the svn diff-of-a-diff against revision in r11588, 2.6.18.dfsg.1-22. This issue is already fixed in the Lenny/Sid kernels which use the paravirt_ops kernel port which has completely reimplemented event channel code. Cheers, Ian. [0] n=0 ; while : ; do date ; echo iteration $n ; xm create debian-x86_32p-1 & xm create debian-x86_32p-2 & xm create rhel44-x86_32p-1 kernel=/boot/vmlinuz-2.6.18.8-x86_32p-xenU ramdisk= & xm create rhel41-x86_32p-1 kernel=/boot/vmlinuz-2.6.18.8-x86_32p-xenU ramdisk= & sleep 1m ; xm dest debian-1 ; xm dest debian-2 ; xm dest rhel41-1 ; xm dest rhel44-1 ; sleep 5s ; n=$(($n + 1)) ; done -- Ian Campbell I haven't lost my mind; I know exactly where I left it.
Index: debian/patches/series/22-extra
===================================================================
--- debian/patches/series/22-extra (revision 0)
+++ debian/patches/series/22-extra (revision 0)
@@ -0,0 +1 @@
++ features/all/xen/evtchn-BUG.patch *_xen *_xen-vserver
Index: debian/patches/features/all/xen/evtchn-BUG.patch
===================================================================
--- debian/patches/features/all/xen/evtchn-BUG.patch (revision 0)
+++ debian/patches/features/all/xen/evtchn-BUG.patch (revision 0)
@@ -0,0 +1,128 @@
+# HG changeset patch
+# User kfraser@localhost.localdomain
+# Date 1169559560 0
+# Node ID 914304b3a3da6fc5bad12f742bae3893b53d20bc
+# Parent ee7c422c5f7b79e0cc0ae5670af81b400a72357f
+linux: Fix enable_irq() crash by removing a BUG_ON() assumption in our
+event-channel retrigger() function. Also clean up bitmap usages.
+Signed-off-by: Keir Fraser <keir@xensource.com>
+
+Index: source-amd64-xen/drivers/xen/core/evtchn.c
+===================================================================
+--- source-amd64-xen.orig/drivers/xen/core/evtchn.c 2008-06-03 21:11:44.000000000 +0100
++++ source-amd64-xen/drivers/xen/core/evtchn.c 2008-06-03 21:21:24.000000000 +0100
+@@ -103,7 +103,7 @@
+ static int irq_bindcount[NR_IRQS];
+
+ /* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
+-static unsigned long pirq_needs_eoi[NR_PIRQS/sizeof(unsigned long)];
++static DECLARE_BITMAP(pirq_needs_eoi, NR_PIRQS);
+
+ #ifdef CONFIG_SMP
+
+@@ -472,14 +472,19 @@
+ rebind_irq_to_cpu(irq, tcpu);
+ }
+
+-static int retrigger(unsigned int irq)
++static int resend_irq_on_evtchn(unsigned int i)
+ {
+- int evtchn = evtchn_from_irq(irq);
++ int masked, evtchn = evtchn_from_irq(i);
+ shared_info_t *s = HYPERVISOR_shared_info;
++
+ if (!VALID_EVTCHN(evtchn))
+ return 1;
+- BUG_ON(!synch_test_bit(evtchn, &s->evtchn_mask[0]));
+- synch_set_bit(evtchn, &s->evtchn_pending[0]);
++
++ masked = synch_test_and_set_bit(evtchn, s->evtchn_mask);
++ synch_set_bit(evtchn, s->evtchn_pending);
++ if (!masked)
++ unmask_evtchn(evtchn);
++
+ return 1;
+ }
+
+@@ -551,13 +556,13 @@
+ #ifdef CONFIG_SMP
+ .set_affinity = set_affinity_irq,
+ #endif
+- .retrigger = retrigger,
++ .retrigger = resend_irq_on_evtchn,
+ };
+
+ static inline void pirq_unmask_notify(int pirq)
+ {
+ struct physdev_eoi eoi = { .irq = pirq };
+- if (unlikely(test_bit(pirq, &pirq_needs_eoi[0])))
++ if (unlikely(test_bit(pirq, pirq_needs_eoi)))
+ (void)HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
+ }
+
+@@ -679,7 +684,7 @@
+ #ifdef CONFIG_SMP
+ .set_affinity = set_affinity_irq,
+ #endif
+- .retrigger = retrigger,
++ .retrigger = resend_irq_on_evtchn,
+ };
+
+ int irq_ignore_unhandled(unsigned int irq)
+@@ -693,16 +698,6 @@
+ return !!(irq_status.flags & XENIRQSTAT_shared);
+ }
+
+-void resend_irq_on_evtchn(unsigned int i)
+-{
+- int evtchn = evtchn_from_irq(i);
+- shared_info_t *s = HYPERVISOR_shared_info;
+- if (!VALID_EVTCHN(evtchn))
+- return;
+- BUG_ON(!synch_test_bit(evtchn, &s->evtchn_mask[0]));
+- synch_set_bit(evtchn, &s->evtchn_pending[0]);
+-}
+-
+ void notify_remote_via_irq(int irq)
+ {
+ int evtchn = evtchn_from_irq(irq);
+@@ -715,7 +710,7 @@
+ void mask_evtchn(int port)
+ {
+ shared_info_t *s = HYPERVISOR_shared_info;
+- synch_set_bit(port, &s->evtchn_mask[0]);
++ synch_set_bit(port, s->evtchn_mask);
+ }
+ EXPORT_SYMBOL_GPL(mask_evtchn);
+
+@@ -734,14 +729,10 @@
+ return;
+ }
+
+- synch_clear_bit(port, &s->evtchn_mask[0]);
++ synch_clear_bit(port, s->evtchn_mask);
+
+- /*
+- * The following is basically the equivalent of 'hw_resend_irq'. Just
+- * like a real IO-APIC we 'lose the interrupt edge' if the channel is
+- * masked.
+- */
+- if (synch_test_bit(port, &s->evtchn_pending[0]) &&
++ /* Did we miss an interrupt 'edge'? Re-fire if so. */
++ if (synch_test_bit(port, s->evtchn_pending) &&
+ !synch_test_and_set_bit(port / BITS_PER_LONG,
+ &vcpu_info->evtchn_pending_sel))
+ vcpu_info->evtchn_upcall_pending = 1;
+Index: source-amd64-xen/include/xen/evtchn.h
+===================================================================
+--- source-amd64-xen.orig/include/xen/evtchn.h 2008-06-03 21:11:48.000000000 +0100
++++ source-amd64-xen/include/xen/evtchn.h 2008-06-03 21:18:30.000000000 +0100
+@@ -95,7 +95,7 @@
+ static inline void clear_evtchn(int port)
+ {
+ shared_info_t *s = HYPERVISOR_shared_info;
+- synch_clear_bit(port, &s->evtchn_pending[0]);
++ synch_clear_bit(port, s->evtchn_pending);
+ }
+
+ static inline void notify_remote_via_evtchn(int port)
Index: debian/changelog
===================================================================
--- debian/changelog (revision 11588)
+++ debian/changelog (working copy)
@@ -1,7 +1,12 @@
linux-2.6 (2.6.18.dfsg.1-22) UNRELEASED; urgency=high
+ [ dann frazier ]
* Merge in changes from 2.6.18.dfsg.1-18etch6
+ [ Ian Campbell ]
+ * Backport http://xenbits.xensource.com/xen-unstable.hg?rev/914304b3a3da,
+ fixing kernel BUG at drivers/xen/core/evtchn.c:481 (closes: 410807).
+
-- dann frazier <dannf@debian.org> Mon, 09 Jun 2008 01:21:13 -0600
linux-2.6 (2.6.18.dfsg.1-21) stable; urgency=high
# HG changeset patch
# User kfraser@localhost.localdomain
# Date 1169559560 0
# Node ID 914304b3a3da6fc5bad12f742bae3893b53d20bc
# Parent ee7c422c5f7b79e0cc0ae5670af81b400a72357f
linux: Fix enable_irq() crash by removing a BUG_ON() assumption in our
event-channel retrigger() function. Also clean up bitmap usages.
Signed-off-by: Keir Fraser <keir@xensource.com>
Index: source-amd64-xen/drivers/xen/core/evtchn.c
===================================================================
--- source-amd64-xen.orig/drivers/xen/core/evtchn.c 2008-06-03 21:11:44.000000000 +0100
+++ source-amd64-xen/drivers/xen/core/evtchn.c 2008-06-03 21:21:24.000000000 +0100
@@ -103,7 +103,7 @@
static int irq_bindcount[NR_IRQS];
/* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
-static unsigned long pirq_needs_eoi[NR_PIRQS/sizeof(unsigned long)];
+static DECLARE_BITMAP(pirq_needs_eoi, NR_PIRQS);
#ifdef CONFIG_SMP
@@ -472,14 +472,19 @@
rebind_irq_to_cpu(irq, tcpu);
}
-static int retrigger(unsigned int irq)
+static int resend_irq_on_evtchn(unsigned int i)
{
- int evtchn = evtchn_from_irq(irq);
+ int masked, evtchn = evtchn_from_irq(i);
shared_info_t *s = HYPERVISOR_shared_info;
+
if (!VALID_EVTCHN(evtchn))
return 1;
- BUG_ON(!synch_test_bit(evtchn, &s->evtchn_mask[0]));
- synch_set_bit(evtchn, &s->evtchn_pending[0]);
+
+ masked = synch_test_and_set_bit(evtchn, s->evtchn_mask);
+ synch_set_bit(evtchn, s->evtchn_pending);
+ if (!masked)
+ unmask_evtchn(evtchn);
+
return 1;
}
@@ -551,13 +556,13 @@
#ifdef CONFIG_SMP
.set_affinity = set_affinity_irq,
#endif
- .retrigger = retrigger,
+ .retrigger = resend_irq_on_evtchn,
};
static inline void pirq_unmask_notify(int pirq)
{
struct physdev_eoi eoi = { .irq = pirq };
- if (unlikely(test_bit(pirq, &pirq_needs_eoi[0])))
+ if (unlikely(test_bit(pirq, pirq_needs_eoi)))
(void)HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
}
@@ -679,7 +684,7 @@
#ifdef CONFIG_SMP
.set_affinity = set_affinity_irq,
#endif
- .retrigger = retrigger,
+ .retrigger = resend_irq_on_evtchn,
};
int irq_ignore_unhandled(unsigned int irq)
@@ -693,16 +698,6 @@
return !!(irq_status.flags & XENIRQSTAT_shared);
}
-void resend_irq_on_evtchn(unsigned int i)
-{
- int evtchn = evtchn_from_irq(i);
- shared_info_t *s = HYPERVISOR_shared_info;
- if (!VALID_EVTCHN(evtchn))
- return;
- BUG_ON(!synch_test_bit(evtchn, &s->evtchn_mask[0]));
- synch_set_bit(evtchn, &s->evtchn_pending[0]);
-}
-
void notify_remote_via_irq(int irq)
{
int evtchn = evtchn_from_irq(irq);
@@ -715,7 +710,7 @@
void mask_evtchn(int port)
{
shared_info_t *s = HYPERVISOR_shared_info;
- synch_set_bit(port, &s->evtchn_mask[0]);
+ synch_set_bit(port, s->evtchn_mask);
}
EXPORT_SYMBOL_GPL(mask_evtchn);
@@ -734,14 +729,10 @@
return;
}
- synch_clear_bit(port, &s->evtchn_mask[0]);
+ synch_clear_bit(port, s->evtchn_mask);
- /*
- * The following is basically the equivalent of 'hw_resend_irq'. Just
- * like a real IO-APIC we 'lose the interrupt edge' if the channel is
- * masked.
- */
- if (synch_test_bit(port, &s->evtchn_pending[0]) &&
+ /* Did we miss an interrupt 'edge'? Re-fire if so. */
+ if (synch_test_bit(port, s->evtchn_pending) &&
!synch_test_and_set_bit(port / BITS_PER_LONG,
&vcpu_info->evtchn_pending_sel))
vcpu_info->evtchn_upcall_pending = 1;
Index: source-amd64-xen/include/xen/evtchn.h
===================================================================
--- source-amd64-xen.orig/include/xen/evtchn.h 2008-06-03 21:11:48.000000000 +0100
+++ source-amd64-xen/include/xen/evtchn.h 2008-06-03 21:18:30.000000000 +0100
@@ -95,7 +95,7 @@
static inline void clear_evtchn(int port)
{
shared_info_t *s = HYPERVISOR_shared_info;
- synch_clear_bit(port, &s->evtchn_pending[0]);
+ synch_clear_bit(port, s->evtchn_pending);
}
static inline void notify_remote_via_evtchn(int port)
Attachment:
signature.asc
Description: This is a digitally signed message part