Merge "sched/fair: Block delayed tasks on throttled hierarchy during dequeue"
@@ -0,0 +1,97 @@
From 753594c69939fb8fbf6971e7f4858052e0978bf1 Mon Sep 17 00:00:00 2001
From: Jiping Ma <jiping.ma2@windriver.com>
Date: Mon, 13 Oct 2025 07:28:52 +0000
Subject: [PATCH] sched/fair: Block delayed tasks on throttled hierarchy during
 dequeue

Dequeuing a fair task on a throttled hierarchy returns early on
encountering a throttled cfs_rq since the throttle path has already
dequeued the hierarchy above and has adjusted the h_nr_* accounting till
the root cfs_rq.

dequeue_entities() crucially misses calling __block_task() for delayed
tasks being dequeued on the throttled hierarchies, but this was mostly
harmless until commit b7ca5743a260 ("sched/core: Tweak
wait_task_inactive() to force dequeue sched_delayed tasks") since all
existing cases would re-enqueue the task if task_on_rq_queued() returned
true and the task would eventually be blocked at pick after the
hierarchy was unthrottled.

wait_task_inactive() is special as it expects the delayed task on
throttled hierarchy to reach the blocked state on dequeue but since
__block_task() is never called, task_on_rq_queued() continues to return
true. Furthermore, since the task is now off the hierarchy, the pick
never reaches it to fully block the task even after unthrottle, leading
to wait_task_inactive() looping endlessly.

Remedy this by calling __block_task() if a delayed task is being
dequeued on a throttled hierarchy.

This fix is only required for stable kernels implementing delay dequeue
(>= v6.12) before v6.18 since upstream commit e1fad12dcb66 ("sched/fair:
Switch to task based throttle model") indirectly fixes this by removing
the early return conditions in dequeue_entities() as part of the per-task
throttle feature.

Cc: stable@vger.kernel.org
Reported-by: Matt Fleming <matt@readmodwrite.com>
Closes: https://lore.kernel.org/all/20250925133310.1843863-1-matt@readmodwrite.com/
Fixes: b7ca5743a260 ("sched/core: Tweak wait_task_inactive() to force dequeue sched_delayed tasks")
Tested-by: Matt Fleming <mfleming@cloudflare.com>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Jiping Ma <jiping.ma2@windriver.com>
---
 kernel/sched/fair.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d894ccc8cb8f..64fd09e5bb79 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7187,6 +7187,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
int h_nr_delayed = 0;
struct cfs_rq *cfs_rq;
u64 slice = 0;
+ int ret = 0;

if (entity_is_task(se)) {
p = task_of(se);
@@ -7218,7 +7219,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)

/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
- return 0;
+ goto out;

/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -7261,7 +7262,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)

/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
- return 0;
+ goto out;
}

sub_nr_running(rq, h_nr_queued);
@@ -7273,6 +7274,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;

+ ret = 1;
+out:
if (p && task_delayed) {
SCHED_WARN_ON(!task_sleep);
SCHED_WARN_ON(p->on_rq != 1);
@@ -7288,7 +7291,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
__block_task(rq, p);
}

- return 1;
+ return ret;
}

/*
--
2.49.0

@@ -1,130 +0,0 @@
From 76e4466a6737de53f0f9e53ddb2b72cad8fc8d4d Mon Sep 17 00:00:00 2001
From: Jiping Ma <jiping.ma2@windriver.com>
Date: Sat, 16 Aug 2025 00:26:26 +0000
Subject: [PATCH] sched/fair: Fix DELAY_DEQUEUE issue related to cgroup
 throttling

When both CPU cgroup and memory cgroup are enabled with parent cgroup
resource limits much smaller than child cgroup's, the system frequently
hangs with NULL pointer dereference:

Unable to handle kernel NULL pointer dereference
at virtual address 0000000000000051
Internal error: Oops: 0000000096000006 [#1] PREEMPT_RT SMP
pc : pick_task_fair+0x68/0x150
Call trace:
pick_task_fair+0x68/0x150
pick_next_task_fair+0x30/0x3b8
__schedule+0x180/0xb98
preempt_schedule+0x48/0x60
rt_mutex_slowunlock+0x298/0x340
rt_spin_unlock+0x84/0xa0
page_vma_mapped_walk+0x1c8/0x478
folio_referenced_one+0xdc/0x490
rmap_walk_file+0x11c/0x200
folio_referenced+0x160/0x1e8
shrink_folio_list+0x5c4/0xc60
shrink_lruvec+0x5f8/0xb88
shrink_node+0x308/0x940
do_try_to_free_pages+0xd4/0x540
try_to_free_mem_cgroup_pages+0x12c/0x2c0

The issue can be mitigated by increasing parent cgroup's CPU resources,
or completely resolved by disabling DELAY_DEQUEUE feature.

SCHED_FEAT(DELAY_DEQUEUE, false)

With CONFIG_SCHED_DEBUG enabled, the following warning appears:

WARNING: CPU: 1 PID: 27 at kernel/sched/fair.c:704 update_entity_lag+0xa8/0xd0
!se->on_rq
Call trace:
update_entity_lag+0xa8/0xd0
dequeue_entity+0x90/0x538
dequeue_entities+0xd0/0x490
dequeue_task_fair+0xcc/0x230
rt_mutex_setprio+0x2ec/0x4d8
rtlock_slowlock_locked+0x6c8/0xce8

The warning indicates se->on_rq is 0, meaning dequeue_entity() was
entered at least twice and executed update_entity_lag().

Root cause analysis:
In rt_mutex_setprio(), there are two dequeue_task() calls:
1. First call: dequeue immediately if task is delay-dequeued
2. Second call: dequeue running tasks

Through debugging, we observed that for the same task, both dequeue_task()
calls are actually executed. The task is a sched_delayed task on cfs_rq,
which confirms our analysis that dequeue_entity() is entered at least
twice.

Semantically, rt_mutex handles scheduling and priority inheritance, and
should only dequeue/enqueue running tasks. A sched_delayed task is
essentially non-running, so the second dequeue_task() should not execute.

Further analysis of dequeue_entities() shows multiple cfs_rq_throttled()
checks. At the function's end, __block_task() updates sched_delayed
tasks to non-running state. However, when cgroup throttling occurs, the
function returns early without executing __block_task(), leaving the
sched_delayed task in running state. This causes the unexpected second
dequeue_task() in rt_mutex_setprio(), leading to system crash.

We initially tried modifying the two cfs_rq_throttled() return points in
dequeue_entities() to jump to the __block_task() condition check, which
resolved the issue completely.

This patch takes a cleaner approach by moving the __block_task()
operation from dequeue_entities() to finish_delayed_dequeue_entity(),
ensuring sched_delayed tasks are properly marked as non-running
regardless of cgroup throttling status.

Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
Signed-off-by: Han Guangjiang <hanguangjiang@lixiang.com>
Signed-off-by: Jiping Ma <jiping.ma2@windriver.com>
---
 kernel/sched/fair.c | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d894ccc8cb8f..5fa7df1aa3d9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5546,6 +5546,12 @@ static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
clear_delayed(se);
if (sched_feat(DELAY_ZERO) && se->vlag > 0)
se->vlag = 0;
+
+ if (entity_is_task(se)) {
+ struct task_struct *p = task_of(se);
+
+ __block_task(task_rq(p), p);
+ }
}

static bool
@@ -7273,21 +7279,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;

- if (p && task_delayed) {
- SCHED_WARN_ON(!task_sleep);
- SCHED_WARN_ON(p->on_rq != 1);
-
- /* Fix-up what dequeue_task_fair() skipped */
- hrtick_update(rq);
-
- /*
- * Fix-up what block_task() skipped.
- *
- * Must be last, @p might not be valid after this.
- */
- __block_task(rq, p);
- }
-
return 1;
}

--
2.49.0

@@ -38,4 +38,4 @@ zl3073x-backport/0003-devlink-introduce-devlink_nl_put_u64.patch
zl3073x-backport/0004-dpll-zl3073x-Fix-missing-header-build-error-on-older.patch
zl3073x-backport/0005-dpll-add-phase-offset-monitor-feature-to-netlink-spe.patch
zl3073x-backport/0006-dpll-add-phase_offset_monitor_get-set-callback-ops.patch
0015-sched-fair-Fix-DELAY_DEQUEUE-issue-related-to-cgroup.patch
0015-sched-fair-Block-delayed-tasks-on-throttled-hierarch.patch
@@ -37,3 +37,4 @@ zl3073x-backport/0003-devlink-introduce-devlink_nl_put_u64.patch
zl3073x-backport/0004-dpll-zl3073x-Fix-missing-header-build-error-on-older.patch
zl3073x-backport/0005-dpll-add-phase-offset-monitor-feature-to-netlink-spe.patch
zl3073x-backport/0006-dpll-add-phase_offset_monitor_get-set-callback-ops.patch
0014-sched-fair-Block-delayed-tasks-on-throttled-hierarch.patch