kernel-rt: beware of __put_task_struct() calling context
Under PREEMPT_RT, __put_task_struct() indirectly acquires sleeping
locks. Therefore, it can't be called from a non-preemptible context.
Instead of calling __put_task_struct() directly, we defer it using
call_rcu(). A more natural approach would use a workqueue, but since
in PREEMPT_RT we can't allocate dynamic memory from atomic context,
the code would become more complex because we would need to put the
work_struct instance in the task_struct and initialize it when we
allocate a new task_struct.

We hit the same panic five times: __put_task_struct() was called while
the task was holding a lock, which triggered the kernel BUG_ON. The
call trace is shown below.

We also need to cherry-pick the following commits, because the
necessary context is missing in 5.10.18x (for example,
DEFINE_WAIT_OVERRIDE_MAP is not defined there):

* commit 5f2962401c6e ("locking/lockdep: Exclude local_lock_t from IRQ inversions")
* commit 175b1a60e880 ("locking/lockdep: Clean up check_redundant() a bit")
* commit bc2dd71b2836 ("locking/lockdep: Add a skip() function to __bfs()")
* commit 0cce06ba859a ("debugobjects,locking: Annotate debug_object_fill_pool() wait type violation")

kernel BUG at kernel/locking/rtmutex.c:1331!
invalid opcode: 0000 [#1] PREEMPT_RT SMP NOPTI
......
Call Trace:
 rt_spin_lock_slowlock_locked+0xb2/0x2a0
 ? update_load_avg+0x80/0x690
 rt_spin_lock_slowlock+0x50/0x80
 ? update_load_avg+0x80/0x690
 rt_spin_lock+0x2a/0x30
 free_unref_page+0xc5/0x280
 __vunmap+0x17f/0x240
 put_task_stack+0xc6/0x130
 __put_task_struct+0x3d/0x180
 rt_mutex_adjust_prio_chain+0x365/0x7b0
 task_blocks_on_rt_mutex+0x1eb/0x370
 rt_spin_lock_slowlock_locked+0xb2/0x2a0
 rt_spin_lock_slowlock+0x50/0x80
 rt_spin_lock+0x2a/0x30
 free_unref_page_list+0x128/0x5e0
 release_pages+0x2b4/0x320
 tlb_flush_mmu+0x44/0x150
 tlb_finish_mmu+0x3c/0x70
 zap_page_range+0x12a/0x170
 ? find_vma+0x16/0x70
 do_madvise+0x99d/0xba0
 ? do_epoll_wait+0xa2/0xe0
 ? __x64_sys_madvise+0x26/0x30
 __x64_sys_madvise+0x26/0x30
 do_syscall_64+0x33/0x40
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Verification:
- build-pkgs; build-iso; install and boot up on an AIO-SX lab.
- Could not reproduce the issue during a stress-ng test of almost 24 hours:
  while true; do sudo stress-ng --sched rr --mmapfork 23 -t 20; done
  while true; do sudo stress-ng --sched fifo --mmapfork 23 -t 20; done

Closes-Bug: 2031597
Signed-off-by: Jiping Ma <jiping.ma2@windriver.com>
Change-Id: If022441d61492eaec88eede8603a6cb052af99d1
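Before the patches themselves, a minimal C sketch of the deferral pattern they introduce (simplified from the first backported patch below, not the exact upstream hunk): when the last reference is dropped from a non-preemptible context on PREEMPT_RT, the teardown is handed off to an RCU callback running in process context instead of being done inline.

    static inline void put_task_struct(struct task_struct *t)
    {
            if (!refcount_dec_and_test(&t->usage))
                    return;

            /*
             * __put_task_struct() may take sleeping locks on PREEMPT_RT,
             * so from a non-preemptible context the final free is deferred
             * to process context via call_rcu(); __put_task_struct_rcu_cb()
             * is the callback added by the patch.
             */
            if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible())
                    call_rcu(&t->rcu, __put_task_struct_rcu_cb);
            else
                    __put_task_struct(t);
    }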
@@ -0,0 +1,126 @@
From 1aafe836ca8f801c0d9b6577ea9d5f598cec13f1 Mon Sep 17 00:00:00 2001
From: Wander Lairson Costa <wander@redhat.com>
Date: Wed, 14 Jun 2023 09:23:21 -0300
Subject: [PATCH 69/74] kernel/fork: beware of __put_task_struct() calling
 context

Under PREEMPT_RT, __put_task_struct() indirectly acquires sleeping
locks. Therefore, it can't be called from an non-preemptible context.

One practical example is splat inside inactive_task_timer(), which is
called in a interrupt context:

CPU: 1 PID: 2848 Comm: life Kdump: loaded Tainted: G W ---------
Hardware name: HP ProLiant DL388p Gen8, BIOS P70 07/15/2012
Call Trace:
dump_stack_lvl+0x57/0x7d
mark_lock_irq.cold+0x33/0xba
mark_lock+0x1e7/0x400
mark_usage+0x11d/0x140
__lock_acquire+0x30d/0x930
lock_acquire.part.0+0x9c/0x210
rt_spin_lock+0x27/0xe0
refill_obj_stock+0x3d/0x3a0
kmem_cache_free+0x357/0x560
inactive_task_timer+0x1ad/0x340
__run_hrtimer+0x8a/0x1a0
__hrtimer_run_queues+0x91/0x130
hrtimer_interrupt+0x10f/0x220
__sysvec_apic_timer_interrupt+0x7b/0xd0
sysvec_apic_timer_interrupt+0x4f/0xd0
asm_sysvec_apic_timer_interrupt+0x12/0x20
RIP: 0033:0x7fff196bf6f5

Instead of calling __put_task_struct() directly, we defer it using
call_rcu(). A more natural approach would use a workqueue, but since
in PREEMPT_RT, we can't allocate dynamic memory from atomic context,
the code would become more complex because we would need to put the
work_struct instance in the task_struct and initialize it when we
allocate a new task_struct.

The issue is reproducible with stress-ng:

while true; do
    stress-ng --sched deadline --sched-period 1000000000 \
        --sched-runtime 800000000 --sched-deadline \
        1000000000 --mmapfork 23 -t 20
done

Reported-by: Hu Chunyu <chuhu@redhat.com>
Suggested-by: Oleg Nesterov <oleg@redhat.com>
Suggested-by: Valentin Schneider <vschneid@redhat.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Wander Lairson Costa <wander@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230614122323.37957-2-wander@redhat.com
(cherry picked from https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=d243b34459cea30cfe5f3a9b2feb44e7daff9938)
Signed-off-by: Jiping Ma <jiping.ma2@windriver.com>
---
 include/linux/sched/task.h | 28 +++++++++++++++++++++++++++-
 kernel/fork.c | 8 ++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 2832cc6be062..0485fc77edb8 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -110,10 +110,36 @@ static inline struct task_struct *get_task_struct(struct task_struct *t)
}

extern void __put_task_struct(struct task_struct *t);
+extern void __put_task_struct_rcu_cb(struct rcu_head *rhp);

static inline void put_task_struct(struct task_struct *t)
{
- if (refcount_dec_and_test(&t->usage))
+ if (!refcount_dec_and_test(&t->usage))
+ return;
+
+ /*
+ * under PREEMPT_RT, we can't call put_task_struct
+ * in atomic context because it will indirectly
+ * acquire sleeping locks.
+ *
+ * call_rcu() will schedule delayed_put_task_struct_rcu()
+ * to be called in process context.
+ *
+ * __put_task_struct() is called when
+ * refcount_dec_and_test(&t->usage) succeeds.
+ *
+ * This means that it can't "conflict" with
+ * put_task_struct_rcu_user() which abuses ->rcu the same
+ * way; rcu_users has a reference so task->usage can't be
+ * zero after rcu_users 1 -> 0 transition.
+ *
+ * delayed_free_task() also uses ->rcu, but it is only called
+ * when it fails to fork a process. Therefore, there is no
+ * way it can conflict with put_task_struct().
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible())
+ call_rcu(&t->rcu, __put_task_struct_rcu_cb);
+ else
__put_task_struct(t);
}

diff --git a/kernel/fork.c b/kernel/fork.c
index ffbfef082b3e..a315080180b4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -768,6 +768,14 @@ void __put_task_struct(struct task_struct *tsk)
}
EXPORT_SYMBOL_GPL(__put_task_struct);

+void __put_task_struct_rcu_cb(struct rcu_head *rhp)
+{
+ struct task_struct *task = container_of(rhp, struct task_struct, rcu);
+
+ __put_task_struct(task);
+}
+EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb);
+
void __init __weak arch_task_cache_init(void) { }

/*
--
2.40.0

@@ -0,0 +1,67 @@
From e69b50780144737cbb4f359369f7c151d2fbc365 Mon Sep 17 00:00:00 2001
From: Wander Lairson Costa <wander@redhat.com>
Date: Wed, 14 Jun 2023 09:23:22 -0300
Subject: [PATCH 70/74] sched: avoid false lockdep splat in put_task_struct()

In put_task_struct(), a spin_lock is indirectly acquired under the kernel
stock. When running the kernel in real-time (RT) configuration, the
operation is dispatched to a preemptible context call to ensure
guaranteed preemption. However, if PROVE_RAW_LOCK_NESTING is enabled
and __put_task_struct() is called while holding a raw_spinlock, lockdep
incorrectly reports an "Invalid lock context" in the stock kernel.

This false splat occurs because lockdep is unaware of the different
route taken under RT. To address this issue, override the inner wait
type to prevent the false lockdep splat.

Suggested-by: Oleg Nesterov <oleg@redhat.com>
Suggested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Wander Lairson Costa <wander@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230614122323.37957-3-wander@redhat.com
(cherry picked from https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=893cdaaa3977be6afb3a7f756fbfd7be83f68d8c)
Signed-off-by: Jiping Ma <jiping.ma2@windriver.com>
---
 include/linux/sched/task.h | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 0485fc77edb8..82a9fcd553dd 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -117,6 +117,19 @@ static inline void put_task_struct(struct task_struct *t)
if (!refcount_dec_and_test(&t->usage))
return;

+ /*
+ * In !RT, it is always safe to call __put_task_struct().
+ * Under RT, we can only call it in preemptible context.
+ */
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
+ static DEFINE_WAIT_OVERRIDE_MAP(put_task_map, LD_WAIT_SLEEP);
+
+ lock_map_acquire_try(&put_task_map);
+ __put_task_struct(t);
+ lock_map_release(&put_task_map);
+ return;
+ }
+
/*
* under PREEMPT_RT, we can't call put_task_struct
* in atomic context because it will indirectly
@@ -137,10 +150,7 @@ static inline void put_task_struct(struct task_struct *t)
* when it fails to fork a process. Therefore, there is no
* way it can conflict with put_task_struct().
*/
- if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible())
- call_rcu(&t->rcu, __put_task_struct_rcu_cb);
- else
- __put_task_struct(t);
+ call_rcu(&t->rcu, __put_task_struct_rcu_cb);
}

static inline void put_task_struct_many(struct task_struct *t, int nr)
--
2.40.0

@@ -0,0 +1,146 @@
From f50db548598294f22cf44369d1a568774cc04144 Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Thu, 10 Dec 2020 11:02:40 +0100
Subject: [PATCH 71/74] locking/lockdep: Add a skip() function to __bfs()

Some __bfs() walks will have additional iteration constraints (beyond
the path being strong). Provide an additional function to allow
terminating graph walks.

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
(cherry picked from commit bc2dd71b283665f0a409d5b6fc603d5a6fdc219e)
Signed-off-by: Jiping Ma <jiping.ma2@windriver.com>
---
 kernel/locking/lockdep.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index eb42322d6183..0dd6a6fe48bd 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1679,6 +1679,7 @@ static inline struct lock_list *__bfs_next(struct lock_list *lock, int offset)
static enum bfs_result __bfs(struct lock_list *source_entry,
void *data,
bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
struct lock_list **target_entry,
int offset)
{
@@ -1739,7 +1740,12 @@ static enum bfs_result __bfs(struct lock_list *source_entry,
/*
* Step 3: we haven't visited this and there is a strong
* dependency path to this, so check with @match.
+ * If @skip is provide and returns true, we skip this
+ * lock (and any path this lock is in).
*/
+ if (skip && skip(lock, data))
+ continue;
+
if (match(lock, data)) {
*target_entry = lock;
return BFS_RMATCH;
@@ -1782,9 +1788,10 @@ static inline enum bfs_result
__bfs_forwards(struct lock_list *src_entry,
void *data,
bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
struct lock_list **target_entry)
{
- return __bfs(src_entry, data, match, target_entry,
+ return __bfs(src_entry, data, match, skip, target_entry,
offsetof(struct lock_class, locks_after));

}
@@ -1793,9 +1800,10 @@ static inline enum bfs_result
__bfs_backwards(struct lock_list *src_entry,
void *data,
bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
struct lock_list **target_entry)
{
- return __bfs(src_entry, data, match, target_entry,
+ return __bfs(src_entry, data, match, skip, target_entry,
offsetof(struct lock_class, locks_before));

}
@@ -2026,7 +2034,7 @@ static unsigned long __lockdep_count_forward_deps(struct lock_list *this)
unsigned long count = 0;
struct lock_list *target_entry;

- __bfs_forwards(this, (void *)&count, noop_count, &target_entry);
+ __bfs_forwards(this, (void *)&count, noop_count, NULL, &target_entry);

return count;
}
@@ -2051,7 +2059,7 @@ static unsigned long __lockdep_count_backward_deps(struct lock_list *this)
unsigned long count = 0;
struct lock_list *target_entry;

- __bfs_backwards(this, (void *)&count, noop_count, &target_entry);
+ __bfs_backwards(this, (void *)&count, noop_count, NULL, &target_entry);

return count;
}
@@ -2079,11 +2087,12 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
static noinline enum bfs_result
check_path(struct held_lock *target, struct lock_list *src_entry,
bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
struct lock_list **target_entry)
{
enum bfs_result ret;

- ret = __bfs_forwards(src_entry, target, match, target_entry);
+ ret = __bfs_forwards(src_entry, target, match, skip, target_entry);

if (unlikely(bfs_error(ret)))
print_bfs_bug(ret);
@@ -2110,7 +2119,7 @@ check_noncircular(struct held_lock *src, struct held_lock *target,

debug_atomic_inc(nr_cyclic_checks);

- ret = check_path(target, &src_entry, hlock_conflict, &target_entry);
+ ret = check_path(target, &src_entry, hlock_conflict, NULL, &target_entry);

if (unlikely(ret == BFS_RMATCH)) {
if (!*trace) {
@@ -2159,7 +2168,7 @@ check_redundant(struct held_lock *src, struct held_lock *target)

debug_atomic_inc(nr_redundant_checks);

- ret = check_path(target, &src_entry, hlock_equal, &target_entry);
+ ret = check_path(target, &src_entry, hlock_equal, NULL, &target_entry);

if (ret == BFS_RMATCH)
debug_atomic_inc(nr_redundant);
@@ -2253,7 +2262,7 @@ find_usage_forwards(struct lock_list *root, unsigned long usage_mask,

debug_atomic_inc(nr_find_usage_forwards_checks);

- result = __bfs_forwards(root, &usage_mask, usage_match, target_entry);
+ result = __bfs_forwards(root, &usage_mask, usage_match, NULL, target_entry);

return result;
}
@@ -2270,7 +2279,7 @@ find_usage_backwards(struct lock_list *root, unsigned long usage_mask,

debug_atomic_inc(nr_find_usage_backwards_checks);

- result = __bfs_backwards(root, &usage_mask, usage_match, target_entry);
+ result = __bfs_backwards(root, &usage_mask, usage_match, NULL, target_entry);

return result;
}
@@ -2739,7 +2748,7 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
*/
bfs_init_rootb(&this, prev);

- ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL);
+ ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL, NULL);
if (bfs_error(ret)) {
print_bfs_bug(ret);
return 0;
--
2.40.0

@@ -0,0 +1,143 @@
From 74f3214f75924dfcc8400dc503a547b2669e52a6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 10 Dec 2020 11:16:34 +0100
Subject: [PATCH 72/74] locking/lockdep: Clean up check_redundant() a bit

In preparation for adding an TRACE_IRQFLAGS dependent skip function to
check_redundant(), move it below the TRACE_IRQFLAGS #ifdef.

While there, provide a stub function to reduce #ifdef usage.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
(cherry picked from commit 175b1a60e8805617d74aefe17ce0d3a32eceb55c)
Signed-off-by: Jiping Ma <jiping.ma2@windriver.com>
---
 kernel/locking/lockdep.c | 91 +++++++++++++++++++++-------------------
 1 file changed, 49 insertions(+), 42 deletions(-)

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 0dd6a6fe48bd..8689f48893bd 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2137,46 +2137,6 @@ check_noncircular(struct held_lock *src, struct held_lock *target,
return ret;
}

-#ifdef CONFIG_LOCKDEP_SMALL
-/*
- * Check that the dependency graph starting at <src> can lead to
- * <target> or not. If it can, <src> -> <target> dependency is already
- * in the graph.
- *
- * Return BFS_RMATCH if it does, or BFS_RMATCH if it does not, return BFS_E* if
- * any error appears in the bfs search.
- */
-static noinline enum bfs_result
-check_redundant(struct held_lock *src, struct held_lock *target)
-{
- enum bfs_result ret;
- struct lock_list *target_entry;
- struct lock_list src_entry;
-
- bfs_init_root(&src_entry, src);
- /*
- * Special setup for check_redundant().
- *
- * To report redundant, we need to find a strong dependency path that
- * is equal to or stronger than <src> -> <target>. So if <src> is E,
- * we need to let __bfs() only search for a path starting at a -(E*)->,
- * we achieve this by setting the initial node's ->only_xr to true in
- * that case. And if <prev> is S, we set initial ->only_xr to false
- * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant.
- */
- src_entry.only_xr = src->read == 0;
-
- debug_atomic_inc(nr_redundant_checks);
-
- ret = check_path(target, &src_entry, hlock_equal, NULL, &target_entry);
-
- if (ret == BFS_RMATCH)
- debug_atomic_inc(nr_redundant);
-
- return ret;
-}
-#endif
-
#ifdef CONFIG_TRACE_IRQFLAGS

/*
@@ -2827,6 +2787,55 @@ static inline int check_irq_usage(struct task_struct *curr,
}
#endif /* CONFIG_TRACE_IRQFLAGS */

+#ifdef CONFIG_LOCKDEP_SMALL
+/*
+ * Check that the dependency graph starting at <src> can lead to
+ * <target> or not. If it can, <src> -> <target> dependency is already
+ * in the graph.
+ *
+ * Return BFS_RMATCH if it does, or BFS_RMATCH if it does not, return BFS_E* if
+ * any error appears in the bfs search.
+ */
+static noinline enum bfs_result
+check_redundant(struct held_lock *src, struct held_lock *target)
+{
+ enum bfs_result ret;
+ struct lock_list *target_entry;
+ struct lock_list src_entry;
+
+ bfs_init_root(&src_entry, src);
+ /*
+ * Special setup for check_redundant().
+ *
+ * To report redundant, we need to find a strong dependency path that
+ * is equal to or stronger than <src> -> <target>. So if <src> is E,
+ * we need to let __bfs() only search for a path starting at a -(E*)->,
+ * we achieve this by setting the initial node's ->only_xr to true in
+ * that case. And if <prev> is S, we set initial ->only_xr to false
+ * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant.
+ */
+ src_entry.only_xr = src->read == 0;
+
+ debug_atomic_inc(nr_redundant_checks);
+
+ ret = check_path(target, &src_entry, hlock_equal, NULL, &target_entry);
+
+ if (ret == BFS_RMATCH)
+ debug_atomic_inc(nr_redundant);
+
+ return ret;
+}
+
+#else
+
+static inline enum bfs_result
+check_redundant(struct held_lock *src, struct held_lock *target)
+{
+ return BFS_RNOMATCH;
+}
+
+#endif
+
static void inc_chains(int irq_context)
{
if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT)
@@ -3047,7 +3056,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
}
}

-#ifdef CONFIG_LOCKDEP_SMALL
/*
* Is the <prev> -> <next> link redundant?
*/
@@ -3056,7 +3064,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
return 0;
else if (ret == BFS_RMATCH)
return 2;
-#endif

if (!*trace) {
*trace = save_trace();
--
2.40.0

@@ -0,0 +1,159 @@
From 7da912796e3f992c6dd9a6cde5eb4aa600536862 Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Thu, 10 Dec 2020 11:15:00 +0100
Subject: [PATCH 73/74] locking/lockdep: Exclude local_lock_t from IRQ
 inversions

The purpose of local_lock_t is to abstract: preempt_disable() /
local_bh_disable() / local_irq_disable(). These are the traditional
means of gaining access to per-cpu data, but are fundamentally
non-preemptible.

local_lock_t provides a per-cpu lock, that on !PREEMPT_RT reduces to
no-ops, just like regular spinlocks do on UP.

This gives rise to:

	CPU0			CPU1

	local_lock(B)		spin_lock_irq(A)
				<IRQ>
	spin_lock(A)		local_lock(B)

Where lockdep then figures things will lock up; which would be true if
B were any other kind of lock. However this is a false positive, no
such deadlock actually exists.

For !RT the above local_lock(B) is preempt_disable(), and there's
obviously no deadlock; alternatively, CPU0's B != CPU1's B.

For RT the argument is that since local_lock() nests inside
spin_lock(), it cannot be used in hardirq context, and therefore CPU0
cannot in fact happen. Even though B is a real lock, it is a
preemptible lock and any threaded-irq would simply schedule out and
let the preempted task (which holds B) continue such that the task on
CPU1 can make progress, after which the threaded-irq resumes and can
finish.

This means that we can never form an IRQ inversion on a local_lock
dependency, so terminate the graph walk when looking for IRQ
inversions when we encounter one.

One consequence is that (for LOCKDEP_SMALL) when we look for redundant
dependencies, A -> B is not redundant in the presence of A -> L -> B.

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
[peterz: Changelog]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
(cherry picked from commit 5f2962401c6e195222f320d12b3a55377b2d4653)
Signed-off-by: Jiping Ma <jiping.ma2@windriver.com>
---
 kernel/locking/lockdep.c | 57 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 53 insertions(+), 4 deletions(-)

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 8689f48893bd..8b5d7b45a9f3 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2207,6 +2207,44 @@ static inline bool usage_match(struct lock_list *entry, void *mask)
return !!((entry->class->usage_mask & LOCKF_IRQ) & *(unsigned long *)mask);
}

+static inline bool usage_skip(struct lock_list *entry, void *mask)
+{
+ /*
+ * Skip local_lock() for irq inversion detection.
+ *
+ * For !RT, local_lock() is not a real lock, so it won't carry any
+ * dependency.
+ *
+ * For RT, an irq inversion happens when we have lock A and B, and on
+ * some CPU we can have:
+ *
+ * lock(A);
+ * <interrupted>
+ * lock(B);
+ *
+ * where lock(B) cannot sleep, and we have a dependency B -> ... -> A.
+ *
+ * Now we prove local_lock() cannot exist in that dependency. First we
+ * have the observation for any lock chain L1 -> ... -> Ln, for any
+ * 1 <= i <= n, Li.inner_wait_type <= L1.inner_wait_type, otherwise
+ * wait context check will complain. And since B is not a sleep lock,
+ * therefore B.inner_wait_type >= 2, and since the inner_wait_type of
+ * local_lock() is 3, which is greater than 2, therefore there is no
+ * way the local_lock() exists in the dependency B -> ... -> A.
+ *
+ * As a result, we will skip local_lock(), when we search for irq
+ * inversion bugs.
+ */
+ if (entry->class->lock_type == LD_LOCK_PERCPU) {
+ if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
+ return false;
+
+ return true;
+ }
+
+ return false;
+}
+
/*
* Find a node in the forwards-direction dependency sub-graph starting
* at @root->class that matches @bit.
@@ -2222,7 +2260,7 @@ find_usage_forwards(struct lock_list *root, unsigned long usage_mask,

debug_atomic_inc(nr_find_usage_forwards_checks);

- result = __bfs_forwards(root, &usage_mask, usage_match, NULL, target_entry);
+ result = __bfs_forwards(root, &usage_mask, usage_match, usage_skip, target_entry);

return result;
}
@@ -2239,7 +2277,7 @@ find_usage_backwards(struct lock_list *root, unsigned long usage_mask,

debug_atomic_inc(nr_find_usage_backwards_checks);

- result = __bfs_backwards(root, &usage_mask, usage_match, NULL, target_entry);
+ result = __bfs_backwards(root, &usage_mask, usage_match, usage_skip, target_entry);

return result;
}
@@ -2708,7 +2746,7 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
*/
bfs_init_rootb(&this, prev);

- ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL, NULL);
+ ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, usage_skip, NULL);
if (bfs_error(ret)) {
print_bfs_bug(ret);
return 0;
@@ -2785,6 +2823,12 @@ static inline int check_irq_usage(struct task_struct *curr,
{
return 1;
}
+
+static inline bool usage_skip(struct lock_list *entry, void *mask)
+{
+ return false;
+}
+
#endif /* CONFIG_TRACE_IRQFLAGS */

#ifdef CONFIG_LOCKDEP_SMALL
@@ -2818,7 +2862,12 @@ check_redundant(struct held_lock *src, struct held_lock *target)

debug_atomic_inc(nr_redundant_checks);

- ret = check_path(target, &src_entry, hlock_equal, NULL, &target_entry);
+ /*
+ * Note: we skip local_lock() for redundant check, because as the
+ * comment in usage_skip(), A -> local_lock() -> B and A -> B are not
+ * the same.
+ */
+ ret = check_path(target, &src_entry, hlock_equal, usage_skip, &target_entry);

if (ret == BFS_RMATCH)
debug_atomic_inc(nr_redundant);
--
2.40.0

@@ -0,0 +1,176 @@
From ea37a6b8cfffbd7f76f486ffdb3701816b8ecb01 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 25 Apr 2023 17:03:13 +0200
Subject: [PATCH 74/74] debugobjects,locking: Annotate debug_object_fill_pool()
 wait type violation

There is an explicit wait-type violation in debug_object_fill_pool()
for PREEMPT_RT=n kernels which allows them to more easily fill the
object pool and reduce the chance of allocation failures.

Lockdep's wait-type checks are designed to check the PREEMPT_RT
locking rules even for PREEMPT_RT=n kernels and object to this, so
create a lockdep annotation to allow this to stand.

Specifically, create a 'lock' type that overrides the inner wait-type
while it is held -- allowing one to temporarily raise it, such that
the violation is hidden.

Reported-by: Vlastimil Babka <vbabka@suse.cz>
Reported-by: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Qi Zheng <zhengqi.arch@bytedance.com>
Link: https://lkml.kernel.org/r/20230429100614.GA1489784@hirez.programming.kicks-ass.net
(cherry picked from commit 0cce06ba859a515bd06224085d3addb870608b6d)
Signed-off-by: Jiping Ma <jiping.ma2@windriver.com>
---
 include/linux/lockdep.h | 14 ++++++++++++++
 include/linux/lockdep_types.h | 1 +
 kernel/locking/lockdep.c | 28 +++++++++++++++++++-------
 lib/debugobjects.c | 15 +++++++++++++--
 4 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 02dab569a2a0..aa8c41c9d44f 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -329,6 +329,16 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);
#define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c))
#define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c))

+/*
+ * Must use lock_map_aquire_try() with override maps to avoid
+ * lockdep thinking they participate in the block chain.
+ */
+#define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \
+ struct lockdep_map _name = { \
+ .name = #_name "-wait-type-override", \
+ .wait_type_inner = _wait_type, \
+ .lock_type = LD_LOCK_WAIT_OVERRIDE, }
+
#else /* !CONFIG_LOCKDEP */

static inline void lockdep_init_task(struct task_struct *task)
@@ -406,6 +416,9 @@ static inline void lockdep_unregister_key(struct lock_class_key *key)
#define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0)
#define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0)

+#define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \
+ struct lockdep_map __maybe_unused _name = {}
+
#endif /* !LOCKDEP */

enum xhlock_context_t {
@@ -548,6 +561,7 @@ do { \
#define rwsem_release(l, i) lock_release(l, i)

#define lock_map_acquire(l) lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_)
+#define lock_map_acquire_try(l) lock_acquire_exclusive(l, 0, 1, NULL, _THIS_IP_)
#define lock_map_acquire_read(l) lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_)
#define lock_map_acquire_tryread(l) lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_)
#define lock_map_release(l) lock_release(l, _THIS_IP_)
diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
index 2ec9ff5a7fff..92cc151bff3b 100644
--- a/include/linux/lockdep_types.h
+++ b/include/linux/lockdep_types.h
@@ -33,6 +33,7 @@ enum lockdep_wait_type {
enum lockdep_lock_type {
LD_LOCK_NORMAL = 0, /* normal, catch all */
LD_LOCK_PERCPU, /* percpu */
+ LD_LOCK_WAIT_OVERRIDE, /* annotation */
LD_LOCK_MAX,
};

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 8b5d7b45a9f3..a694db917fae 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2209,6 +2209,9 @@ static inline bool usage_match(struct lock_list *entry, void *mask)

static inline bool usage_skip(struct lock_list *entry, void *mask)
{
+ if (entry->class->lock_type == LD_LOCK_NORMAL)
+ return false;
+
/*
* Skip local_lock() for irq inversion detection.
*
@@ -2235,14 +2238,16 @@ static inline bool usage_skip(struct lock_list *entry, void *mask)
* As a result, we will skip local_lock(), when we search for irq
* inversion bugs.
*/
- if (entry->class->lock_type == LD_LOCK_PERCPU) {
- if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
- return false;
+ if (entry->class->lock_type == LD_LOCK_PERCPU &&
+ DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
+ return false;

- return true;
- }
+ /*
+ * Skip WAIT_OVERRIDE for irq inversion detection -- it's not actually
+ * a lock and only used to override the wait_type.
+ */

- return false;
+ return true;
}

/*
@@ -4716,7 +4721,8 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)

for (; depth < curr->lockdep_depth; depth++) {
struct held_lock *prev = curr->held_locks + depth;
- u8 prev_inner = hlock_class(prev)->wait_type_inner;
+ struct lock_class *class = hlock_class(prev);
+ u8 prev_inner = class->wait_type_inner;

if (prev_inner) {
/*
@@ -4726,6 +4732,14 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
* Also due to trylocks.
*/
curr_inner = min(curr_inner, prev_inner);
+
+ /*
+ * Allow override for annotations -- this is typically
+ * only valid/needed for code that only exists when
+ * CONFIG_PREEMPT_RT=n.
+ */
+ if (unlikely(class->lock_type == LD_LOCK_WAIT_OVERRIDE))
+ curr_inner = prev_inner;
}
}

diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 9ed09d04f16b..d07cb6bdf0b7 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -594,10 +594,21 @@ static void debug_objects_fill_pool(void)
{
/*
* On RT enabled kernels the pool refill must happen in preemptible
- * context:
+ * context -- for !RT kernels we rely on the fact that spinlock_t and
+ * raw_spinlock_t are basically the same type and this lock-type
+ * inversion works just fine.
*/
- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
+ /*
+ * Annotate away the spinlock_t inside raw_spinlock_t warning
+ * by temporarily raising the wait-type to WAIT_SLEEP, matching
+ * the preemptible() condition above.
+ */
+ static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_SLEEP);
+ lock_map_acquire_try(&fill_pool_map);
fill_pool();
+ lock_map_release(&fill_pool_map);
+ }
}

static void
--
2.40.0

@@ -66,3 +66,9 @@
0069-perf-x86-rapl-Only-check-lower-32bits-for-RAPL-energ.patch
0070-perf-x86-rapl-Fix-psys-energy-event-on-Intel-SPR-pla.patch
0071-perf-x86-rapl-Use-standard-Energy-Unit-for-SPR-Dram-.patch
0072-kernel-fork-beware-of-__put_task_struct-calling-cont.patch
0073-sched-avoid-false-lockdep-splat-in-put_task_struct.patch
0074-locking-lockdep-Add-a-skip-function-to-__bfs.patch
0075-locking-lockdep-Clean-up-check_redundant-a-bit.patch
0076-locking-lockdep-Exclude-local_lock_t-from-IRQ-invers.patch
0077-debugobjects-locking-Annotate-debug_object_fill_pool.patch