
Bug#517449: linux-image-2.6.26-1-amd64: SCHED_IDLE issues (tasks blocked for more than 120 seconds)



You found some scheduling changes made after 2.6.28 which seemed to
deal with this bug.  However, they cannot be the full story, because
they modify code that was itself only added after 2.6.26.

The attached patch combines a bug fix made between 2.6.26 and 2.6.28
with most of the changes you identified.  Based on my reading of the
commit messages and related discussion, I think this makes a coherent
set of changes.  However, there have been many other changes to the
scheduler in this period, and I cannot say for sure whether any of
those are also required, as I do not have any great knowledge of this
code.

Please try rebuilding the "lenny" kernel (linux-source-2.6.26) with this
patch applied and report whether it fixes the bug for you.
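
If you want a quick way to exercise the affected code while testing,
the sketch below puts the calling process into SCHED_IDLE and spins;
running a few copies alongside normal CPU load should reproduce the
starvation that the "blocked for more than 120 seconds" warnings point
at.  This is only my guess at a minimal test, not your exact workload,
and older libc headers may not define SCHED_IDLE, hence the fallback
(the value is 5 on Linux):

	/* Minimal SCHED_IDLE spinner (sketch, builds with plain gcc) */
	#include <sched.h>
	#include <stdio.h>

	#ifndef SCHED_IDLE
	#define SCHED_IDLE 5
	#endif

	int main(void)
	{
		struct sched_param sp = { .sched_priority = 0 };

		/* pid 0 = this process; SCHED_IDLE requires priority 0 */
		if (sched_setscheduler(0, SCHED_IDLE, &sp)) {
			perror("sched_setscheduler");
			return 1;
		}
		for (;;)
			;	/* burn CPU at idle priority */
	}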

Ben.

-- 
Ben Hutchings
Logic doesn't apply to the real world. - Marvin Minsky
This patch is a combination of the following scheduler fixes:

commit 1af5f730fc1bf7c62ec9fb2d307206e18bf40a69
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date:   Fri Oct 24 11:06:13 2008 +0200

    sched: more accurate min_vruntime accounting

[part of:]
commit 6bc912b71b6f33b041cfde93ca3f019cbaa852bc
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date:   Thu Jan 15 14:53:38 2009 +0100

    sched: SCHED_OTHER vs SCHED_IDLE isolation

commit cce7ade803699463ecc62a065ca522004f7ccb3d
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date:   Thu Jan 15 14:53:37 2009 +0100

    sched: SCHED_IDLE weight change

commit e17036dac189dd034c092a91df56aa740db7146d
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date:   Thu Jan 15 14:53:39 2009 +0100

    sched: fix update_min_vruntime
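
For reference, the new WMULT_IDLEPRIO constant in the first hunk below
is just the inverse of the new weight: the scheduler avoids dividing by
a task's load weight by instead multiplying with wmult = 2^32 / weight,
and 2^32 / 3 = 1431655765; the old pair was weight 2 with wmult 2^31.
A trivial userspace check, nothing more:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t weight = 3;	/* new WEIGHT_IDLEPRIO */
		uint64_t wmult = (1ULL << 32) / weight;

		printf("%llu\n", (unsigned long long)wmult);	/* 1431655765 */
		return 0;
	}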

--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1390,8 +1390,8 @@
  * slice expiry etc.
  */
 
-#define WEIGHT_IDLEPRIO		2
-#define WMULT_IDLEPRIO		(1 << 31)
+#define WEIGHT_IDLEPRIO                3
+#define WMULT_IDLEPRIO         1431655765
 
 /*
  * Nice levels are multiplicative, with a gentle 10% change for every
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -221,6 +221,27 @@
 	return se->vruntime - cfs_rq->min_vruntime;
 }
 
+static void update_min_vruntime(struct cfs_rq *cfs_rq)
+{
+	u64 vruntime = cfs_rq->min_vruntime;
+
+	if (cfs_rq->curr)
+		vruntime = cfs_rq->curr->vruntime;
+
+	if (cfs_rq->rb_leftmost) {
+		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
+						   struct sched_entity,
+						   run_node);
+
+		if (!cfs_rq->curr)
+			vruntime = se->vruntime;
+		else
+			vruntime = min_vruntime(vruntime, se->vruntime);
+	}
+
+	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+}
+
 /*
  * Enqueue an entity into the rb-tree:
  */
@@ -254,15 +275,8 @@
 	 * Maintain a cache of leftmost tree entries (it is frequently
 	 * used):
 	 */
-	if (leftmost) {
+	if (leftmost)
 		cfs_rq->rb_leftmost = &se->run_node;
-		/*
-		 * maintain cfs_rq->min_vruntime to be a monotonic increasing
-		 * value tracking the leftmost vruntime in the tree.
-		 */
-		cfs_rq->min_vruntime =
-			max_vruntime(cfs_rq->min_vruntime, se->vruntime);
-	}
 
 	rb_link_node(&se->run_node, parent, link);
 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -272,18 +286,9 @@
 {
 	if (cfs_rq->rb_leftmost == &se->run_node) {
 		struct rb_node *next_node;
-		struct sched_entity *next;
 
 		next_node = rb_next(&se->run_node);
 		cfs_rq->rb_leftmost = next_node;
-
-		if (next_node) {
-			next = rb_entry(next_node,
-					struct sched_entity, run_node);
-			cfs_rq->min_vruntime =
-				max_vruntime(cfs_rq->min_vruntime,
-					     next->vruntime);
-		}
 	}
 
 	if (cfs_rq->next == se)
@@ -425,6 +430,7 @@
 							&curr->load);
 	}
 	curr->vruntime += delta_exec_weighted;
+	update_min_vruntime(cfs_rq);
 }
 
 static void update_curr(struct cfs_rq *cfs_rq)
@@ -590,13 +596,7 @@
 static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
-	u64 vruntime;
-
-	if (first_fair(cfs_rq)) {
-		vruntime = min_vruntime(cfs_rq->min_vruntime,
-				__pick_next_entity(cfs_rq)->vruntime);
-	} else
-		vruntime = cfs_rq->min_vruntime;
+	u64 vruntime = cfs_rq->min_vruntime;
 
 	/*
 	 * The 'current' period is already promised to the current tasks,
@@ -680,6 +680,7 @@
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	account_entity_dequeue(cfs_rq, se);
+	update_min_vruntime(cfs_rq);
 }
 
 /*
@@ -1184,12 +1185,18 @@
 	cfs_rq_of(pse)->next = pse;
 
 	/*
-	 * Batch tasks do not preempt (their preemption is driven by
+	 * Batch and idle tasks do not preempt (their preemption is driven by
 	 * the tick):
 	 */
-	if (unlikely(p->policy == SCHED_BATCH))
+	if (unlikely(p->policy != SCHED_NORMAL))
 		return;
 
+	/* Idle tasks are by definition preempted by everybody. */
+	if (unlikely(curr->policy == SCHED_IDLE)) {
+		resched_task(curr);
+		return;
+	}
+
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
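
A note for anyone reading the update_min_vruntime() hunk above:
vruntime is an unsigned 64-bit counter, so the min_vruntime() and
max_vruntime() helpers it calls compare through a signed delta to stay
correct across wraparound, and taking the max against the old
cfs_rq->min_vruntime is what keeps the value monotonic.  A userspace
sketch of the same idiom, mirroring max_vruntime() in
kernel/sched_fair.c:

	#include <stdio.h>
	#include <stdint.h>

	/* Prefer the later of two values, tolerating u64 wraparound */
	static uint64_t max_vruntime(uint64_t min_vruntime, uint64_t vruntime)
	{
		int64_t delta = (int64_t)(vruntime - min_vruntime);

		if (delta > 0)
			min_vruntime = vruntime;
		return min_vruntime;
	}

	int main(void)
	{
		/* near wraparound the "later" value still wins: prints 10 */
		printf("%llu\n", (unsigned long long)
		       max_vruntime(UINT64_MAX - 5, 10));
		return 0;
	}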
