kernel-rt:dm-snapshot: fix 'scheduling while atomic' on rt kernel
We observed "BUG: scheduling while atomic: lvm/1380/0x00000003" on the rt
kernel during the upgrade-and-rollback test on 6.6, 6.12 and mainline. The
issue is caused by dm_exception_table_lock(&lock), which disables preemption
twice (one hlist_bl_lock() for the complete slot and another for the pending
slot). If the code between dm_exception_table_lock(&lock) and
dm_exception_table_unlock(&lock) then takes an rt_spin_lock, it triggers a
splat such as "BUG: scheduling while atomic: kworker/u72:11/349/0x00000003",
because the preempt count is 3 at that point. Several places in dm-snap.c hit
the same issue, such as dm_add_exception(), pending_complete() and
snapshot_map(). Fix this by converting the hlist_bl bit spinlock into a
regular spinlock.

Cherry-pick the upstream commit
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8581b19eb2c5
("dm-snapshot: fix 'scheduling while atomic' on real-time kernels").

This change also removes
0015-sched-fair-Block-delayed-tasks-on-throttled-hierarch.patch, which is no
longer required after the kernel upgrade to 6.12.57.

Verification:
- Built the ISO successfully for rt and std.
- Ran the upgrade and rollback tests for more than one day.

Closes-Bug: 2136084
Change-Id: I7a7d094c60f8f8fd9f4da3c441fcda89f6048241
Signed-off-by: Jiping Ma <jiping.ma2@windriver.com>
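For context, a minimal sketch of the failure mode on PREEMPT_RT (illustrative
only; demo_slot, demo_inner_lock and rt_failure_sketch() are made-up names,
not the actual dm-snap.c call chain). hlist_bl_lock() is a bit spinlock that
disables preemption, while a regular spinlock_t becomes a sleeping
rtmutex-based lock on RT, so nesting the latter inside the former schedules
while atomic:

#include <linux/list_bl.h>
#include <linux/spinlock.h>

static struct hlist_bl_head demo_slot;    /* bucket head; bit 0 doubles as the lock */
static DEFINE_SPINLOCK(demo_inner_lock);  /* ordinary spinlock_t */

static void rt_failure_sketch(void)
{
	hlist_bl_lock(&demo_slot);        /* bit_spin_lock() -> preempt_disable() */

	/*
	 * On PREEMPT_RT a spinlock_t is backed by an rtmutex and may sleep;
	 * taking it here, with preemption disabled by the bit spinlock
	 * above, produces "BUG: scheduling while atomic".
	 */
	spin_lock(&demo_inner_lock);
	spin_unlock(&demo_inner_lock);

	hlist_bl_unlock(&demo_slot);
}

The upstream fix below avoids this by giving each hash bucket its own
spinlock_t, which on RT is itself a sleeping lock and can safely nest with
the other spinlocks taken under it.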
@@ -0,0 +1,236 @@
From e94787a61d0f7617801bafaf5e63474683d76cd8 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 1 Dec 2025 22:13:10 +0100
Subject: [PATCH] dm-snapshot: fix 'scheduling while atomic' on real-time
 kernels

There is reported 'scheduling while atomic' bug when using dm-snapshot on
real-time kernels. The reason for the bug is that the hlist_bl code does
preempt_disable() when taking the lock and the kernel attempts to take
other spinlocks while holding the hlist_bl lock.

Fix this by converting a hlist_bl spinlock into a regular spinlock.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reported-by: Jiping Ma <jiping.ma2@windriver.com>
(cherry picked from commit 8581b19eb2c5ccf06c195d3b5468c3c9d17a5020)
Signed-off-by: Jiping Ma <jiping.ma2@windriver.com>
---
 drivers/md/dm-exception-store.h |  2 +-
 drivers/md/dm-snap.c            | 73 +++++++++++++++------------------
 2 files changed, 35 insertions(+), 40 deletions(-)

diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index b67976637538..061b4d310813 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -29,7 +29,7 @@ typedef sector_t chunk_t;
 * chunk within the device.
 */
struct dm_exception {
- struct hlist_bl_node hash_list;
+ struct hlist_node hash_list;

chunk_t old_chunk;
chunk_t new_chunk;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index f40c18da4000..dbd148967de4 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -40,10 +40,15 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \
(DM_TRACKED_CHUNK_HASH_SIZE - 1))

+struct dm_hlist_head {
+ struct hlist_head head;
+ spinlock_t lock;
+};
+
struct dm_exception_table {
uint32_t hash_mask;
unsigned int hash_shift;
- struct hlist_bl_head *table;
+ struct dm_hlist_head *table;
};

struct dm_snapshot {
@@ -628,8 +633,8 @@ static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk);

/* Lock to protect access to the completed and pending exception hash tables. */
struct dm_exception_table_lock {
- struct hlist_bl_head *complete_slot;
- struct hlist_bl_head *pending_slot;
+ spinlock_t *complete_slot;
+ spinlock_t *pending_slot;
};

static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk,
@@ -638,20 +643,20 @@ static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk,
struct dm_exception_table *complete = &s->complete;
struct dm_exception_table *pending = &s->pending;

- lock->complete_slot = &complete->table[exception_hash(complete, chunk)];
- lock->pending_slot = &pending->table[exception_hash(pending, chunk)];
+ lock->complete_slot = &complete->table[exception_hash(complete, chunk)].lock;
+ lock->pending_slot = &pending->table[exception_hash(pending, chunk)].lock;
}

static void dm_exception_table_lock(struct dm_exception_table_lock *lock)
{
- hlist_bl_lock(lock->complete_slot);
- hlist_bl_lock(lock->pending_slot);
+ spin_lock_nested(lock->complete_slot, 1);
+ spin_lock_nested(lock->pending_slot, 2);
}

static void dm_exception_table_unlock(struct dm_exception_table_lock *lock)
{
- hlist_bl_unlock(lock->pending_slot);
- hlist_bl_unlock(lock->complete_slot);
+ spin_unlock(lock->pending_slot);
+ spin_unlock(lock->complete_slot);
}

static int dm_exception_table_init(struct dm_exception_table *et,
@@ -661,13 +666,15 @@ static int dm_exception_table_init(struct dm_exception_table *et,

et->hash_shift = hash_shift;
et->hash_mask = size - 1;
- et->table = kvmalloc_array(size, sizeof(struct hlist_bl_head),
+ et->table = kvmalloc_array(size, sizeof(struct dm_hlist_head),
GFP_KERNEL);
if (!et->table)
return -ENOMEM;

- for (i = 0; i < size; i++)
-  INIT_HLIST_BL_HEAD(et->table + i);
+ for (i = 0; i < size; i++) {
+  INIT_HLIST_HEAD(&et->table[i].head);
+  spin_lock_init(&et->table[i].lock);
+ }

return 0;
}
@@ -675,16 +682,17 @@ static int dm_exception_table_init(struct dm_exception_table *et,
static void dm_exception_table_exit(struct dm_exception_table *et,
struct kmem_cache *mem)
{
- struct hlist_bl_head *slot;
+ struct dm_hlist_head *slot;
struct dm_exception *ex;
- struct hlist_bl_node *pos, *n;
+ struct hlist_node *pos;
int i, size;

size = et->hash_mask + 1;
for (i = 0; i < size; i++) {
slot = et->table + i;

-  hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list) {
+  hlist_for_each_entry_safe(ex, pos, &slot->head, hash_list) {
+   hlist_del(&ex->hash_list);
kmem_cache_free(mem, ex);
cond_resched();
}
@@ -700,7 +708,7 @@ static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)

static void dm_remove_exception(struct dm_exception *e)
{
- hlist_bl_del(&e->hash_list);
+ hlist_del(&e->hash_list);
}

/*
@@ -710,12 +718,11 @@ static void dm_remove_exception(struct dm_exception *e)
static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
chunk_t chunk)
{
- struct hlist_bl_head *slot;
- struct hlist_bl_node *pos;
+ struct hlist_head *slot;
struct dm_exception *e;

- slot = &et->table[exception_hash(et, chunk)];
- hlist_bl_for_each_entry(e, pos, slot, hash_list)
+ slot = &et->table[exception_hash(et, chunk)].head;
+ hlist_for_each_entry(e, slot, hash_list)
if (chunk >= e->old_chunk &&
chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
return e;
@@ -762,18 +769,17 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
static void dm_insert_exception(struct dm_exception_table *eh,
struct dm_exception *new_e)
{
- struct hlist_bl_head *l;
- struct hlist_bl_node *pos;
+ struct hlist_head *l;
struct dm_exception *e = NULL;

- l = &eh->table[exception_hash(eh, new_e->old_chunk)];
+ l = &eh->table[exception_hash(eh, new_e->old_chunk)].head;

/* Add immediately if this table doesn't support consecutive chunks */
if (!eh->hash_shift)
goto out;

/* List is ordered by old_chunk */
- hlist_bl_for_each_entry(e, pos, l, hash_list) {
+ hlist_for_each_entry(e, l, hash_list) {
/* Insert after an existing chunk? */
if (new_e->old_chunk == (e->old_chunk +
dm_consecutive_chunk_count(e) + 1) &&
@@ -804,13 +810,13 @@ static void dm_insert_exception(struct dm_exception_table *eh,
 * Either the table doesn't support consecutive chunks or slot
 * l is empty.
 */
-  hlist_bl_add_head(&new_e->hash_list, l);
+  hlist_add_head(&new_e->hash_list, l);
} else if (new_e->old_chunk < e->old_chunk) {
/* Add before an existing exception */
-  hlist_bl_add_before(&new_e->hash_list, &e->hash_list);
+  hlist_add_before(&new_e->hash_list, &e->hash_list);
} else {
/* Add to l's tail: e is the last exception in this slot */
-  hlist_bl_add_behind(&new_e->hash_list, &e->hash_list);
+  hlist_add_behind(&new_e->hash_list, &e->hash_list);
}
}

@@ -820,7 +826,6 @@ static void dm_insert_exception(struct dm_exception_table *eh,
 */
static int dm_add_exception(void *context, chunk_t old, chunk_t new)
{
- struct dm_exception_table_lock lock;
struct dm_snapshot *s = context;
struct dm_exception *e;

@@ -833,17 +838,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
/* Consecutive_count is implicitly initialised to zero */
e->new_chunk = new;

- /*
-  * Although there is no need to lock access to the exception tables
-  * here, if we don't then hlist_bl_add_head(), called by
-  * dm_insert_exception(), will complain about accessing the
-  * corresponding list without locking it first.
-  */
- dm_exception_table_lock_init(s, old, &lock);
-
- dm_exception_table_lock(&lock);
dm_insert_exception(&s->complete, e);
- dm_exception_table_unlock(&lock);

return 0;
}
@@ -873,7 +868,7 @@ static int calc_max_buckets(void)
/* use a fixed size of 2MB */
unsigned long mem = 2 * 1024 * 1024;

- mem /= sizeof(struct hlist_bl_head);
+ mem /= sizeof(struct dm_hlist_head);

return mem;
}
--
2.49.0

@@ -1,97 +0,0 @@
From 753594c69939fb8fbf6971e7f4858052e0978bf1 Mon Sep 17 00:00:00 2001
From: Jiping Ma <jiping.ma2@windriver.com>
Date: Mon, 13 Oct 2025 07:28:52 +0000
Subject: [PATCH] sched/fair: Block delayed tasks on throttled hierarchy during
 dequeue

Dequeuing a fair task on a throttled hierarchy returns early on
encountering a throttled cfs_rq since the throttle path has already
dequeued the hierarchy above and has adjusted the h_nr_* accounting till
the root cfs_rq.

dequeue_entities() crucially misses calling __block_task() for delayed
tasks being dequeued on the throttled hierarchies, but this was mostly
harmless until commit b7ca5743a260 ("sched/core: Tweak
wait_task_inactive() to force dequeue sched_delayed tasks") since all
existing cases would re-enqueue the task if task_on_rq_queued() returned
true and the task would eventually be blocked at pick after the
hierarchy was unthrottled.

wait_task_inactive() is special as it expects the delayed task on
throttled hierarchy to reach the blocked state on dequeue but since
__block_task() is never called, task_on_rq_queued() continues to return
true. Furthermore, since the task is now off the hierarchy, the pick
never reaches it to fully block the task even after unthrottle leading
to wait_task_inactive() looping endlessly.

Remedy this by calling __block_task() if a delayed task is being
dequeued on a throttled hierarchy.

This fix is only required for stabled kernels implementing delay dequeue
(>= v6.12) before v6.18 since upstream commit e1fad12dcb66 ("sched/fair:
Switch to task based throttle model") indirectly fixes this by removing
the early return conditions in dequeue_entities() as part of the per-task
throttle feature.

Cc: stable@vger.kernel.org
Reported-by: Matt Fleming <matt@readmodwrite.com>
Closes: https://lore.kernel.org/all/20250925133310.1843863-1-matt@readmodwrite.com/
Fixes: b7ca5743a260 ("sched/core: Tweak wait_task_inactive() to force dequeue sched_delayed tasks")
Tested-by: Matt Fleming <mfleming@cloudflare.com>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Jiping Ma <jiping.ma2@windriver.com>
---
 kernel/sched/fair.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d894ccc8cb8f..64fd09e5bb79 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7187,6 +7187,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
int h_nr_delayed = 0;
struct cfs_rq *cfs_rq;
u64 slice = 0;
+ int ret = 0;

if (entity_is_task(se)) {
p = task_of(se);
@@ -7218,7 +7219,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)

/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
-  return 0;
+  goto out;

/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -7261,7 +7262,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)

/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
-  return 0;
+  goto out;
}

sub_nr_running(rq, h_nr_queued);
@@ -7273,6 +7274,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;

+ ret = 1;
+out:
if (p && task_delayed) {
SCHED_WARN_ON(!task_sleep);
SCHED_WARN_ON(p->on_rq != 1);
@@ -7288,7 +7291,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
__block_task(rq, p);
}

- return 1;
+ return ret;
}

/*
--
2.49.0

@@ -38,3 +38,4 @@ zl3073x-backport/0003-devlink-introduce-devlink_nl_put_u64.patch
zl3073x-backport/0004-dpll-zl3073x-Fix-missing-header-build-error-on-older.patch
zl3073x-backport/0005-dpll-add-phase-offset-monitor-feature-to-netlink-spe.patch
zl3073x-backport/0006-dpll-add-phase_offset_monitor_get-set-callback-ops.patch
0015-dm-snapshot-fix-scheduling-while-atomic-on-real-time.patch