Merge "sched/fair: Block delayed tasks on throttled hierarchy during dequeue"

Zuul, 2025-10-26 00:34:41 +00:00 (committed by Gerrit Code Review)
5 changed files with 196 additions and 131 deletions

0015-sched-fair-Block-delayed-tasks-on-throttled-hierarch.patch

@@ -0,0 +1,97 @@
From 753594c69939fb8fbf6971e7f4858052e0978bf1 Mon Sep 17 00:00:00 2001
From: Jiping Ma <jiping.ma2@windriver.com>
Date: Mon, 13 Oct 2025 07:28:52 +0000
Subject: [PATCH] sched/fair: Block delayed tasks on throttled hierarchy during
dequeue

Dequeuing a fair task on a throttled hierarchy returns early on
encountering a throttled cfs_rq since the throttle path has already
dequeued the hierarchy above and has adjusted the h_nr_* accounting up
to the root cfs_rq.

dequeue_entities() crucially misses calling __block_task() for delayed
tasks being dequeued on throttled hierarchies, but this was mostly
harmless until commit b7ca5743a260 ("sched/core: Tweak
wait_task_inactive() to force dequeue sched_delayed tasks") since all
existing cases would re-enqueue the task if task_on_rq_queued() returned
true, and the task would eventually be blocked at pick after the
hierarchy was unthrottled.

wait_task_inactive() is special as it expects the delayed task on a
throttled hierarchy to reach the blocked state on dequeue, but since
__block_task() is never called, task_on_rq_queued() continues to return
true. Furthermore, since the task is now off the hierarchy, the pick
never reaches it to fully block the task even after unthrottle, leading
to wait_task_inactive() looping endlessly.

Remedy this by calling __block_task() if a delayed task is being
dequeued on a throttled hierarchy.

This fix is only required for stable kernels implementing delay dequeue
(>= v6.12) before v6.18, since upstream commit e1fad12dcb66 ("sched/fair:
Switch to task based throttle model") indirectly fixes this by removing
the early return conditions in dequeue_entities() as part of the
per-task throttle feature.

Cc: stable@vger.kernel.org
Reported-by: Matt Fleming <matt@readmodwrite.com>
Closes: https://lore.kernel.org/all/20250925133310.1843863-1-matt@readmodwrite.com/
Fixes: b7ca5743a260 ("sched/core: Tweak wait_task_inactive() to force dequeue sched_delayed tasks")
Tested-by: Matt Fleming <mfleming@cloudflare.com>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Jiping Ma <jiping.ma2@windriver.com>
---
kernel/sched/fair.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d894ccc8cb8f..64fd09e5bb79 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7187,6 +7187,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
         int h_nr_delayed = 0;
         struct cfs_rq *cfs_rq;
         u64 slice = 0;
+        int ret = 0;
 
         if (entity_is_task(se)) {
                 p = task_of(se);
@@ -7218,7 +7219,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 
                 /* end evaluation on encountering a throttled cfs_rq */
                 if (cfs_rq_throttled(cfs_rq))
-                        return 0;
+                        goto out;
 
                 /* Don't dequeue parent if it has other entities besides us */
                 if (cfs_rq->load.weight) {
@@ -7261,7 +7262,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 
                 /* end evaluation on encountering a throttled cfs_rq */
                 if (cfs_rq_throttled(cfs_rq))
-                        return 0;
+                        goto out;
         }
 
         sub_nr_running(rq, h_nr_queued);
@@ -7273,6 +7274,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
         if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
                 rq->next_balance = jiffies;
 
+        ret = 1;
+out:
         if (p && task_delayed) {
                 SCHED_WARN_ON(!task_sleep);
                 SCHED_WARN_ON(p->on_rq != 1);
@@ -7288,7 +7291,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
                 __block_task(rq, p);
         }
 
-        return 1;
+        return ret;
 }
 
 /*
--
2.49.0
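
To make the failure mode above concrete, here is a standalone toy model
(plain C, not kernel source; the names and the fixed flag are purely
illustrative) of the interaction the commit message describes:
wait_task_inactive() keeps polling task_on_rq_queued(), which only
__block_task() can turn false, so an early return on a throttled cfs_rq
that skips __block_task() leaves the waiter spinning.

/* Toy model of the livelock; illustrative only, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct task { int on_rq; bool sched_delayed; };

/* What __block_task() achieves: mark the task fully blocked. */
static void block_task(struct task *p)
{
        p->on_rq = 0;
}

/* Models dequeue_entities() for a delayed task: the early return on a
 * throttled cfs_rq skips block_task() unless the fix is applied. */
static void dequeue_delayed_task(struct task *p, bool throttled, bool fixed)
{
        if (throttled && !fixed)
                return;                 /* bug: on_rq stays 1 */
        p->sched_delayed = false;
        block_task(p);                  /* fix: always block the task */
}

/* What wait_task_inactive() effectively polls via task_on_rq_queued(). */
static bool task_on_rq_queued(const struct task *p)
{
        return p->on_rq == 1;
}

int main(void)
{
        struct task p = { .on_rq = 1, .sched_delayed = true };

        dequeue_delayed_task(&p, /*throttled=*/true, /*fixed=*/true);

        while (task_on_rq_queued(&p))   /* spins forever if fixed=false */
                ;

        puts("task blocked; wait_task_inactive() can return");
        return 0;
}

With fixed set to false the while loop never exits, which is the
endless loop reported against b7ca5743a260; the patch makes the blocked
state reachable on the throttled path as well.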

0015-sched-fair-Fix-DELAY_DEQUEUE-issue-related-to-cgroup.patch

@@ -1,130 +0,0 @@
From 76e4466a6737de53f0f9e53ddb2b72cad8fc8d4d Mon Sep 17 00:00:00 2001
From: Jiping Ma <jiping.ma2@windriver.com>
Date: Sat, 16 Aug 2025 00:26:26 +0000
Subject: [PATCH] sched/fair: Fix DELAY_DEQUEUE issue related to cgroup
throttling

When both CPU cgroup and memory cgroup are enabled with the parent
cgroup's resource limits much smaller than the child cgroup's, the
system frequently hangs with a NULL pointer dereference:

Unable to handle kernel NULL pointer dereference at virtual address 0000000000000051
Internal error: Oops: 0000000096000006 [#1] PREEMPT_RT SMP
pc : pick_task_fair+0x68/0x150
Call trace:
pick_task_fair+0x68/0x150
pick_next_task_fair+0x30/0x3b8
__schedule+0x180/0xb98
preempt_schedule+0x48/0x60
rt_mutex_slowunlock+0x298/0x340
rt_spin_unlock+0x84/0xa0
page_vma_mapped_walk+0x1c8/0x478
folio_referenced_one+0xdc/0x490
rmap_walk_file+0x11c/0x200
folio_referenced+0x160/0x1e8
shrink_folio_list+0x5c4/0xc60
shrink_lruvec+0x5f8/0xb88
shrink_node+0x308/0x940
do_try_to_free_pages+0xd4/0x540
try_to_free_mem_cgroup_pages+0x12c/0x2c0

The issue can be mitigated by increasing the parent cgroup's CPU
resources, or completely resolved by disabling the DELAY_DEQUEUE
feature:

SCHED_FEAT(DELAY_DEQUEUE, false)

With CONFIG_SCHED_DEBUG enabled, the following warning appears:

WARNING: CPU: 1 PID: 27 at kernel/sched/fair.c:704 update_entity_lag+0xa8/0xd0
!se->on_rq
Call trace:
update_entity_lag+0xa8/0xd0
dequeue_entity+0x90/0x538
dequeue_entities+0xd0/0x490
dequeue_task_fair+0xcc/0x230
rt_mutex_setprio+0x2ec/0x4d8
rtlock_slowlock_locked+0x6c8/0xce8

The warning indicates that se->on_rq is 0, meaning dequeue_entity() was
entered at least twice and executed update_entity_lag().

Root cause analysis:

In rt_mutex_setprio(), there are two dequeue_task() calls:
1. First call: dequeue the task immediately if it is delay-dequeued
2. Second call: dequeue the running task

Through debugging, we observed that for the same task, both
dequeue_task() calls are actually executed. The task is a sched_delayed
task on a cfs_rq, which confirms our analysis that dequeue_entity() is
entered at least twice.

Semantically, rt_mutex handles scheduling and priority inheritance, and
should only dequeue/enqueue running tasks. A sched_delayed task is
essentially non-running, so the second dequeue_task() should not
execute.

Further analysis of dequeue_entities() shows multiple cfs_rq_throttled()
checks. At the function's end, __block_task() updates sched_delayed
tasks to the non-running state. However, when cgroup throttling occurs,
the function returns early without executing __block_task(), leaving the
sched_delayed task in the running state. This causes the unexpected
second dequeue_task() in rt_mutex_setprio(), leading to the system
crash.

We initially tried modifying the two cfs_rq_throttled() return points in
dequeue_entities() to jump to the __block_task() condition check, which
resolved the issue completely.

This patch takes a cleaner approach by moving the __block_task()
operation from dequeue_entities() to finish_delayed_dequeue_entity(),
ensuring sched_delayed tasks are properly marked as non-running
regardless of cgroup throttling status.

Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
Signed-off-by: Han Guangjiang <hanguangjiang@lixiang.com>
Signed-off-by: Jiping Ma <jiping.ma2@windriver.com>
---
kernel/sched/fair.c | 21 ++++++---------------
1 file changed, 6 insertions(+), 15 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d894ccc8cb8f..5fa7df1aa3d9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5546,6 +5546,12 @@ static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
         clear_delayed(se);
         if (sched_feat(DELAY_ZERO) && se->vlag > 0)
                 se->vlag = 0;
+
+        if (entity_is_task(se)) {
+                struct task_struct *p = task_of(se);
+
+                __block_task(task_rq(p), p);
+        }
 }
 
 static bool
@@ -7273,21 +7279,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
         if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
                 rq->next_balance = jiffies;
 
-        if (p && task_delayed) {
-                SCHED_WARN_ON(!task_sleep);
-                SCHED_WARN_ON(p->on_rq != 1);
-
-                /* Fix-up what dequeue_task_fair() skipped */
-                hrtick_update(rq);
-
-                /*
-                 * Fix-up what block_task() skipped.
-                 *
-                 * Must be last, @p might not be valid after this.
-                 */
-                __block_task(rq, p);
-        }
-
         return 1;
 }
 
--
2.49.0
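
A similar standalone sketch (illustrative C, not kernel source; the
throttled/fixed flags and the assertion stand in for the real machinery)
of the double dequeue this patch describes: rt_mutex_setprio() only
issues its second dequeue_task() because p->on_rq was never cleared, and
that second pass re-enters dequeue_entity() on an entity that is already
off the queue, matching the !se->on_rq warning above.

/* Toy model of the rt_mutex_setprio() double dequeue; illustrative only. */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct task { int on_rq; bool sched_delayed; bool se_on_rq; };

/* Models dequeue_task() -> dequeue_entities() for this scenario. */
static void dequeue_task(struct task *p, bool throttled, bool fixed)
{
        /* dequeue_entity()/update_entity_lag() expect the entity to be
         * queued; the kernel warns "!se->on_rq" when this does not hold. */
        assert(p->se_on_rq);
        p->se_on_rq = false;

        /* With the fix, __block_task() runs from
         * finish_delayed_dequeue_entity() even on a throttled hierarchy. */
        if (p->sched_delayed && (fixed || !throttled)) {
                p->sched_delayed = false;
                p->on_rq = 0;           /* task is now non-running */
        }
}

int main(void)
{
        struct task p = { .on_rq = 1, .sched_delayed = true, .se_on_rq = true };

        /* rt_mutex_setprio(): first dequeue, for the delay-dequeued task. */
        dequeue_task(&p, /*throttled=*/true, /*fixed=*/true);

        /* Second dequeue, meant for running tasks only. With fixed=false,
         * on_rq would still be 1 and the assert above would fire. */
        if (p.on_rq == 1)
                dequeue_task(&p, true, true);

        puts("no double dequeue; task marked non-running");
        return 0;
}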


@@ -38,4 +38,4 @@ zl3073x-backport/0003-devlink-introduce-devlink_nl_put_u64.patch
 zl3073x-backport/0004-dpll-zl3073x-Fix-missing-header-build-error-on-older.patch
 zl3073x-backport/0005-dpll-add-phase-offset-monitor-feature-to-netlink-spe.patch
 zl3073x-backport/0006-dpll-add-phase_offset_monitor_get-set-callback-ops.patch
-0015-sched-fair-Fix-DELAY_DEQUEUE-issue-related-to-cgroup.patch
+0015-sched-fair-Block-delayed-tasks-on-throttled-hierarch.patch

0014-sched-fair-Block-delayed-tasks-on-throttled-hierarch.patch

@@ -0,0 +1,97 @@
From 753594c69939fb8fbf6971e7f4858052e0978bf1 Mon Sep 17 00:00:00 2001
From: Jiping Ma <jiping.ma2@windriver.com>
Date: Mon, 13 Oct 2025 07:28:52 +0000
Subject: [PATCH] sched/fair: Block delayed tasks on throttled hierarchy during
dequeue

Dequeuing a fair task on a throttled hierarchy returns early on
encountering a throttled cfs_rq since the throttle path has already
dequeued the hierarchy above and has adjusted the h_nr_* accounting up
to the root cfs_rq.

dequeue_entities() crucially misses calling __block_task() for delayed
tasks being dequeued on throttled hierarchies, but this was mostly
harmless until commit b7ca5743a260 ("sched/core: Tweak
wait_task_inactive() to force dequeue sched_delayed tasks") since all
existing cases would re-enqueue the task if task_on_rq_queued() returned
true, and the task would eventually be blocked at pick after the
hierarchy was unthrottled.

wait_task_inactive() is special as it expects the delayed task on a
throttled hierarchy to reach the blocked state on dequeue, but since
__block_task() is never called, task_on_rq_queued() continues to return
true. Furthermore, since the task is now off the hierarchy, the pick
never reaches it to fully block the task even after unthrottle, leading
to wait_task_inactive() looping endlessly.

Remedy this by calling __block_task() if a delayed task is being
dequeued on a throttled hierarchy.

This fix is only required for stable kernels implementing delay dequeue
(>= v6.12) before v6.18, since upstream commit e1fad12dcb66 ("sched/fair:
Switch to task based throttle model") indirectly fixes this by removing
the early return conditions in dequeue_entities() as part of the
per-task throttle feature.

Cc: stable@vger.kernel.org
Reported-by: Matt Fleming <matt@readmodwrite.com>
Closes: https://lore.kernel.org/all/20250925133310.1843863-1-matt@readmodwrite.com/
Fixes: b7ca5743a260 ("sched/core: Tweak wait_task_inactive() to force dequeue sched_delayed tasks")
Tested-by: Matt Fleming <mfleming@cloudflare.com>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Jiping Ma <jiping.ma2@windriver.com>
---
kernel/sched/fair.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d894ccc8cb8f..64fd09e5bb79 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7187,6 +7187,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
         int h_nr_delayed = 0;
         struct cfs_rq *cfs_rq;
         u64 slice = 0;
+        int ret = 0;
 
         if (entity_is_task(se)) {
                 p = task_of(se);
@@ -7218,7 +7219,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 
                 /* end evaluation on encountering a throttled cfs_rq */
                 if (cfs_rq_throttled(cfs_rq))
-                        return 0;
+                        goto out;
 
                 /* Don't dequeue parent if it has other entities besides us */
                 if (cfs_rq->load.weight) {
@@ -7261,7 +7262,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 
                 /* end evaluation on encountering a throttled cfs_rq */
                 if (cfs_rq_throttled(cfs_rq))
-                        return 0;
+                        goto out;
         }
 
         sub_nr_running(rq, h_nr_queued);
@@ -7273,6 +7274,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
         if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
                 rq->next_balance = jiffies;
 
+        ret = 1;
+out:
         if (p && task_delayed) {
                 SCHED_WARN_ON(!task_sleep);
                 SCHED_WARN_ON(p->on_rq != 1);
@@ -7288,7 +7291,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
                 __block_task(rq, p);
         }
 
-        return 1;
+        return ret;
 }
 
 /*
--
2.49.0


@@ -37,3 +37,4 @@ zl3073x-backport/0003-devlink-introduce-devlink_nl_put_u64.patch
 zl3073x-backport/0004-dpll-zl3073x-Fix-missing-header-build-error-on-older.patch
 zl3073x-backport/0005-dpll-add-phase-offset-monitor-feature-to-netlink-spe.patch
 zl3073x-backport/0006-dpll-add-phase_offset_monitor_get-set-callback-ops.patch
+0014-sched-fair-Block-delayed-tasks-on-throttled-hierarch.patch