You found some scheduling changes made post-2.6.28 which seemed to deal with this bug. However, that cannot be the full story, because they modify code which was added after 2.6.26.

The attached patch combines a bug fix made between 2.6.26 and 2.6.28 with most of the changes you identified. Based on my reading of the commit messages and discussion, I think this makes a coherent set of changes. However, there have been many other changes to the scheduler in this time, and I cannot say for sure whether any of those are also required, as I do not have any great knowledge of this code.

Please try rebuilding the "lenny" kernel (linux-source-2.6.26) with this patch applied and report whether it fixes the bug for you.

Ben.

--
Ben Hutchings
Logic doesn't apply to the real world. - Marvin Minsky
Combination of these scheduler fixes:

commit 1af5f730fc1bf7c62ec9fb2d307206e18bf40a69
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date:   Fri Oct 24 11:06:13 2008 +0200

    sched: more accurate min_vruntime accounting

[part of:]
commit 6bc912b71b6f33b041cfde93ca3f019cbaa852bc
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date:   Thu Jan 15 14:53:38 2009 +0100

    sched: SCHED_OTHER vs SCHED_IDLE isolation

commit cce7ade803699463ecc62a065ca522004f7ccb3d
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date:   Thu Jan 15 14:53:37 2009 +0100

    sched: SCHED_IDLE weight change

commit e17036dac189dd034c092a91df56aa740db7146d
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date:   Thu Jan 15 14:53:39 2009 +0100

    sched: fix update_min_vruntime
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1390,8 +1390,8 @@
* slice expiry etc.
*/
-#define WEIGHT_IDLEPRIO 2
-#define WMULT_IDLEPRIO (1 << 31)
+#define WEIGHT_IDLEPRIO 3
+#define WMULT_IDLEPRIO 1431655765
/*
* Nice levels are multiplicative, with a gentle 10% change for every
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -221,6 +221,27 @@
return se->vruntime - cfs_rq->min_vruntime;
}
+static void update_min_vruntime(struct cfs_rq *cfs_rq)
+{
+ u64 vruntime = cfs_rq->min_vruntime;
+
+ if (cfs_rq->curr)
+ vruntime = cfs_rq->curr->vruntime;
+
+ if (cfs_rq->rb_leftmost) {
+ struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
+ struct sched_entity,
+ run_node);
+
+ if (!cfs_rq->curr)
+ vruntime = se->vruntime;
+ else
+ vruntime = min_vruntime(vruntime, se->vruntime);
+ }
+
+ cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+}
+
/*
* Enqueue an entity into the rb-tree:
*/
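
The update_min_vruntime() helper added above builds on the existing min_vruntime()/max_vruntime() helpers in kernel/sched_fair.c, which compare 64-bit virtual runtimes using wrap-safe signed arithmetic. A minimal userspace sketch of that comparison (illustration only; later_vruntime is a hypothetical name, assuming the 2.6.26 helpers behave as in mainline):

#include <stdio.h>
#include <stdint.h>

/*
 * Illustration only: pick the "later" of two virtual runtimes, in the
 * spirit of max_vruntime(). Comparing the signed difference still gives
 * the right answer when the u64 virtual clock wraps, which is what lets
 * update_min_vruntime() keep cfs_rq->min_vruntime monotonically
 * non-decreasing.
 */
static uint64_t later_vruntime(uint64_t min_vruntime, uint64_t vruntime)
{
	int64_t delta = (int64_t)(vruntime - min_vruntime);

	if (delta > 0)
		min_vruntime = vruntime;
	return min_vruntime;
}

int main(void)
{
	/* once the clock has wrapped, 5 is "after" UINT64_MAX - 2 */
	printf("%llu\n",
	       (unsigned long long)later_vruntime(UINT64_MAX - 2, 5));
	return 0;
}
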
@@ -254,15 +275,8 @@
* Maintain a cache of leftmost tree entries (it is frequently
* used):
*/
- if (leftmost) {
+ if (leftmost)
cfs_rq->rb_leftmost = &se->run_node;
- /*
- * maintain cfs_rq->min_vruntime to be a monotonic increasing
- * value tracking the leftmost vruntime in the tree.
- */
- cfs_rq->min_vruntime =
- max_vruntime(cfs_rq->min_vruntime, se->vruntime);
- }
rb_link_node(&se->run_node, parent, link);
rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -272,18 +286,9 @@
{
if (cfs_rq->rb_leftmost == &se->run_node) {
struct rb_node *next_node;
- struct sched_entity *next;
next_node = rb_next(&se->run_node);
cfs_rq->rb_leftmost = next_node;
-
- if (next_node) {
- next = rb_entry(next_node,
- struct sched_entity, run_node);
- cfs_rq->min_vruntime =
- max_vruntime(cfs_rq->min_vruntime,
- next->vruntime);
- }
}
if (cfs_rq->next == se)
@@ -425,6 +430,7 @@
&curr->load);
}
curr->vruntime += delta_exec_weighted;
+ update_min_vruntime(cfs_rq);
}
static void update_curr(struct cfs_rq *cfs_rq)
@@ -590,13 +596,7 @@
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
- u64 vruntime;
-
- if (first_fair(cfs_rq)) {
- vruntime = min_vruntime(cfs_rq->min_vruntime,
- __pick_next_entity(cfs_rq)->vruntime);
- } else
- vruntime = cfs_rq->min_vruntime;
+ u64 vruntime = cfs_rq->min_vruntime;
/*
* The 'current' period is already promised to the current tasks,
@@ -680,6 +680,7 @@
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
account_entity_dequeue(cfs_rq, se);
+ update_min_vruntime(cfs_rq);
}
/*
@@ -1184,12 +1185,18 @@
cfs_rq_of(pse)->next = pse;
/*
- * Batch tasks do not preempt (their preemption is driven by
+ * Batch and idle tasks do not preempt (their preemption is driven by
* the tick):
*/
- if (unlikely(p->policy == SCHED_BATCH))
+ if (unlikely(p->policy != SCHED_NORMAL))
return;
+ /* Idle tasks are by definition preempted by everybody. */
+ if (unlikely(curr->policy == SCHED_IDLE)) {
+ resched_task(curr);
+ return;
+ }
+
if (!sched_feat(WAKEUP_PREEMPT))
return;
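
To summarize what the last hunk changes, here is a simplified standalone model of the wakeup-preemption policy checks after this patch (hypothetical helper and names, not the kernel's actual function):

#include <stdio.h>

/* Illustration only: a toy model of the policy checks added above. */
enum policy { POLICY_NORMAL, POLICY_BATCH, POLICY_IDLE };

/*
 * Returns 1 if the waking task preempts the current task immediately,
 * 0 if it never preempts on wakeup, and -1 if the decision falls through
 * to the usual vruntime-based checks (a convention for this sketch only).
 */
static int wakeup_preempt_policy(enum policy curr, enum policy waking)
{
	/* Batch and idle wakers do not preempt; they wait for the tick. */
	if (waking != POLICY_NORMAL)
		return 0;

	/* An idle current task is preempted by any normal waker. */
	if (curr == POLICY_IDLE)
		return 1;

	return -1;
}

int main(void)
{
	printf("%d\n", wakeup_preempt_policy(POLICY_IDLE, POLICY_NORMAL));   /* 1 */
	printf("%d\n", wakeup_preempt_policy(POLICY_NORMAL, POLICY_IDLE));   /* 0 */
	printf("%d\n", wakeup_preempt_policy(POLICY_NORMAL, POLICY_NORMAL)); /* -1 */
	return 0;
}
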