From 4087e16b033140cf2ce509ec23503bddec818a16 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 14 Feb 2025 16:07:46 +0100 Subject: [PATCH 01/16] x86/locking: Use ALT_OUTPUT_SP() for percpu_{,try_}cmpxchg{64,128}_op() percpu_{,try_}cmpxchg{64,128}() macros use CALL instruction inside asm statement in one of their alternatives. Use ALT_OUTPUT_SP() macro to add required dependence on %esp register. ALT_OUTPUT_SP() implements the above dependence by adding ASM_CALL_CONSTRAINT to its arguments. This constraint should be used for any inline asm which has a CALL instruction, otherwise the compiler may schedule the asm before the frame pointer gets set up by the containing function, causing objtool to print a "call without frame pointer save/setup" warning. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250214150929.5780-1-ubizjak@gmail.com --- arch/x86/include/asm/percpu.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index e525cd85f999..0ab991fba7de 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -350,9 +350,9 @@ do { \ \ asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ - : [var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high) \ + : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), \ + "+d" (old__.high)) \ : "b" (new__.low), \ "c" (new__.high), \ "S" (&(_var)) \ @@ -381,10 +381,10 @@ do { \ asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ CC_SET(z) \ - : CC_OUT(z) (success), \ - [var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high) \ + : ALT_OUTPUT_SP(CC_OUT(z) (success), \ + [var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), \ + "+d" (old__.high)) \ : "b" (new__.low), \ "c" (new__.high), \ "S" (&(_var)) \ @@ -421,9 +421,9 @@ do { \ \ asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ - : [var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high) \ + : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), \ + "+d" (old__.high)) \ : "b" (new__.low), \ "c" (new__.high), \ "S" (&(_var)) \ @@ -452,10 +452,10 @@ do { \ asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ CC_SET(z) \ - : CC_OUT(z) (success), \ - [var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high) \ + : ALT_OUTPUT_SP(CC_OUT(z) (success), \ + [var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), \ + "+d" (old__.high)) \ : "b" (new__.low), \ "c" (new__.high), \ "S" (&(_var)) \ From 2d352ec9fcb5d965318e7855b2406a7a14e9ae13 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 14 Feb 2025 16:07:47 +0100 Subject: [PATCH 02/16] x86/locking: Use asm_inline for {,try_}cmpxchg{64,128} emulations According to: https://gcc.gnu.org/onlinedocs/gcc/Size-of-an-asm.html the usage of asm pseudo directives in the asm template can confuse the compiler to wrongly estimate the size of the generated code. 
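As a rough illustration (not from the patch, and assuming a GCC 9+ toolchain where the "asm inline" qualifier behind asm_inline exists; older compilers fall back to plain asm), the difference is only in how the statement is costed, not in the code that is emitted:

    /*
     * Plain asm: GCC counts every line of the template, including the
     * .pushsection/.popsection pseudo directives that ALTERNATIVE()
     * expands to, toward the estimated size of the statement, which
     * skews inlining, scheduling and alignment decisions.
     */
    static inline unsigned int inc_plain(unsigned int x)
    {
            asm("addl $1, %0" : "+r" (x));
            return x;
    }

    /*
     * asm inline: the statement is costed as a single minimum-size
     * instruction, no matter how long the template text is.
     */
    static inline unsigned int inc_tiny(unsigned int x)
    {
            asm inline("addl $1, %0" : "+r" (x));
            return x;
    }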
The ALTERNATIVE macro expands to several asm pseudo directives, so its usage in {,try_}cmpxchg{64,128} causes instruction length estimate to fail by an order of magnitude (the specially instrumented compiler reports the estimated length of these asm templates to be more than 20 instructions long). This incorrect estimate further causes unoptimal inlining decisions, unoptimal instruction scheduling and unoptimal code block alignments for functions that use these locking primitives. Use asm_inline instead: https://gcc.gnu.org/pipermail/gcc-patches/2018-December/512349.html which is a feature that makes GCC pretend some inline assembler code is tiny (while it would think it is huge), instead of just asm. For code size estimation, the size of the asm is then taken as the minimum size of one instruction, ignoring how many instructions compiler thinks it is. The effect of this patch on x86_64 target is minor, since 128-bit functions are rarely used on this target. The code size of the resulting defconfig object file stays the same: text data bss dec hex filename 27456612 4638523 814148 32909283 1f627e3 vmlinux-old.o 27456612 4638523 814148 32909283 1f627e3 vmlinux-new.o but the patch has minor effect on code layout due to the different scheduling decisions in functions containing changed macros. There is no effect on the x64_32 target, the code size of the resulting defconfig object file and the code layout stays the same: text data bss dec hex filename 18883870 2679275 1707916 23271061 1631695 vmlinux-old.o 18883870 2679275 1707916 23271061 1631695 vmlinux-new.o Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250214150929.5780-2-ubizjak@gmail.com --- arch/x86/include/asm/cmpxchg_32.h | 32 +++++++------ arch/x86/include/asm/percpu.h | 77 +++++++++++++++---------------- 2 files changed, 55 insertions(+), 54 deletions(-) diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index fd1282a783dd..95b5f990ca88 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -91,12 +91,14 @@ static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, union __u64_halves o = { .full = (_old), }, \ n = { .full = (_new), }; \ \ - asm volatile(ALTERNATIVE(_lock_loc \ - "call cmpxchg8b_emu", \ - _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ - : ALT_OUTPUT_SP("+a" (o.low), "+d" (o.high)) \ - : "b" (n.low), "c" (n.high), [ptr] "S" (_ptr) \ - : "memory"); \ + asm_inline volatile( \ + ALTERNATIVE(_lock_loc \ + "call cmpxchg8b_emu", \ + _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ + : ALT_OUTPUT_SP("+a" (o.low), "+d" (o.high)) \ + : "b" (n.low), "c" (n.high), \ + [ptr] "S" (_ptr) \ + : "memory"); \ \ o.full; \ }) @@ -119,14 +121,16 @@ static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 n = { .full = (_new), }; \ bool ret; \ \ - asm volatile(ALTERNATIVE(_lock_loc \ - "call cmpxchg8b_emu", \ - _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ - CC_SET(e) \ - : ALT_OUTPUT_SP(CC_OUT(e) (ret), \ - "+a" (o.low), "+d" (o.high)) \ - : "b" (n.low), "c" (n.high), [ptr] "S" (_ptr) \ - : "memory"); \ + asm_inline volatile( \ + ALTERNATIVE(_lock_loc \ + "call cmpxchg8b_emu", \ + _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ + CC_SET(e) \ + : ALT_OUTPUT_SP(CC_OUT(e) (ret), \ + "+a" (o.low), "+d" (o.high)) \ + : "b" (n.low), "c" (n.high), \ + [ptr] "S" (_ptr) \ + : "memory"); \ \ if (unlikely(!ret)) \ *(_oldp) = o.full; \ diff --git a/arch/x86/include/asm/percpu.h 
b/arch/x86/include/asm/percpu.h index 0ab991fba7de..08f5f61690b7 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -348,15 +348,14 @@ do { \ old__.var = _oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ - "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ - : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high)) \ - : "b" (new__.low), \ - "c" (new__.high), \ - "S" (&(_var)) \ - : "memory"); \ + asm_inline qual ( \ + ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ + "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ + : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), "+d" (old__.high)) \ + : "b" (new__.low), "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ \ old__.var; \ }) @@ -378,17 +377,16 @@ do { \ old__.var = *_oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ - "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ - CC_SET(z) \ - : ALT_OUTPUT_SP(CC_OUT(z) (success), \ - [var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high)) \ - : "b" (new__.low), \ - "c" (new__.high), \ - "S" (&(_var)) \ - : "memory"); \ + asm_inline qual ( \ + ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ + "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ + CC_SET(z) \ + : ALT_OUTPUT_SP(CC_OUT(z) (success), \ + [var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), "+d" (old__.high)) \ + : "b" (new__.low), "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ if (unlikely(!success)) \ *_oval = old__.var; \ \ @@ -419,15 +417,14 @@ do { \ old__.var = _oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ - "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ - : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high)) \ - : "b" (new__.low), \ - "c" (new__.high), \ - "S" (&(_var)) \ - : "memory"); \ + asm_inline qual ( \ + ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ + "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ + : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), "+d" (old__.high)) \ + : "b" (new__.low), "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ \ old__.var; \ }) @@ -449,19 +446,19 @@ do { \ old__.var = *_oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ - "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ - CC_SET(z) \ - : ALT_OUTPUT_SP(CC_OUT(z) (success), \ - [var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high)) \ - : "b" (new__.low), \ - "c" (new__.high), \ - "S" (&(_var)) \ - : "memory"); \ + asm_inline qual ( \ + ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ + "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ + CC_SET(z) \ + : ALT_OUTPUT_SP(CC_OUT(z) (success), \ + [var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), "+d" (old__.high)) \ + : "b" (new__.low), "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ if (unlikely(!success)) \ *_oval = old__.var; \ + \ likely(success); \ }) From 337369f8ce9e20226402cf139c4f0d3ada7d1705 Mon Sep 17 00:00:00 2001 From: Yunhui Cui Date: Sun, 26 Jan 2025 11:32:43 +0800 Subject: [PATCH 03/16] locking/mutex: Add MUTEX_WARN_ON() into fast path Scenario: In platform_device_register, the driver misuses struct device as platform_data, making kmemdup duplicate a device. Accessing the duplicate may cause list corruption due to its mutex magic or list holding old content. 
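A hypothetical sketch of that misuse pattern (the driver function and device name are made up; only the platform_device_register_data() call and its kmemdup() of the passed data are real kernel behaviour):

    #include <linux/err.h>
    #include <linux/platform_device.h>

    /*
     * 'parent' is a live struct device, so the platform core's kmemdup()
     * of it produces a copy whose mutex magic and list pointers still
     * refer to the original object; using the copy corrupts those lists.
     */
    static int register_bad_child(struct device *parent)
    {
            struct platform_device *pdev;

            pdev = platform_device_register_data(parent, "bad-child", -1,
                                                 parent, sizeof(*parent));
            return PTR_ERR_OR_ZERO(pdev);
    }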
It recurs randomly as the first mutex-getting process skips the slow path and mutex check. Adding MUTEX_WARN_ON(lock->magic != lock) in __mutex_trylock_fast() makes it always happen. Signed-off-by: Yunhui Cui Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250126033243.53069-1-cuiyunhui@bytedance.com --- kernel/locking/mutex.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index b36f23de48f1..19b636f60a24 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -143,6 +143,8 @@ static __always_inline bool __mutex_trylock_fast(struct mutex *lock) unsigned long curr = (unsigned long)current; unsigned long zero = 0UL; + MUTEX_WARN_ON(lock->magic != lock); + if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr)) return true; From 023f3290b02552ea006c1a2013e373750d2cbff6 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 28 Feb 2025 09:51:15 +0100 Subject: [PATCH 04/16] x86/locking: Remove semicolon from "lock" prefix The minimum version of binutils required to compile the kernel is 2.25. This version correctly handles the "lock" prefix, so it is possible to remove the semicolon, which was used to support ancient versions of GNU as. Due to the semicolon, the compiler considers "lock; insn" as two separate instructions. Removing the semicolon makes asm length calculations more accurate, consequently making scheduling and inlining decisions of the compiler more accurate. Removing the semicolon also enables assembler checks involving the lock prefix. Trying to assemble e.g. "lock andl %eax, %ebx" results in: Error: expecting lockable instruction after `lock' Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250228085149.2478245-1-ubizjak@gmail.com --- arch/x86/include/asm/alternative.h | 2 +- arch/x86/include/asm/barrier.h | 8 ++++---- arch/x86/include/asm/cmpxchg.h | 4 ++-- arch/x86/include/asm/cmpxchg_32.h | 4 ++-- arch/x86/include/asm/edac.h | 2 +- arch/x86/include/asm/sync_bitops.h | 12 ++++++------ 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index e3903b731305..3b3d3aa19acd 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -48,7 +48,7 @@ ".popsection\n" \ "671:" -#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; " +#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock " #else /* !
CONFIG_SMP */ #define LOCK_PREFIX_HERE "" diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 7b44b3c4cce1..db70832232d4 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -12,11 +12,11 @@ */ #ifdef CONFIG_X86_32 -#define mb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "mfence", \ +#define mb() asm volatile(ALTERNATIVE("lock addl $0,-4(%%esp)", "mfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") -#define rmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "lfence", \ +#define rmb() asm volatile(ALTERNATIVE("lock addl $0,-4(%%esp)", "lfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") -#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "sfence", \ +#define wmb() asm volatile(ALTERNATIVE("lock addl $0,-4(%%esp)", "sfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") #else #define __mb() asm volatile("mfence":::"memory") @@ -50,7 +50,7 @@ #define __dma_rmb() barrier() #define __dma_wmb() barrier() -#define __smp_mb() asm volatile("lock; addl $0,-4(%%" _ASM_SP ")" ::: "memory", "cc") +#define __smp_mb() asm volatile("lock addl $0,-4(%%" _ASM_SP ")" ::: "memory", "cc") #define __smp_rmb() dma_rmb() #define __smp_wmb() barrier() diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 5612648b0202..fd8afc1f5f6b 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -134,7 +134,7 @@ extern void __add_wrong_size(void) __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX) #define __sync_cmpxchg(ptr, old, new, size) \ - __raw_cmpxchg((ptr), (old), (new), (size), "lock; ") + __raw_cmpxchg((ptr), (old), (new), (size), "lock ") #define __cmpxchg_local(ptr, old, new, size) \ __raw_cmpxchg((ptr), (old), (new), (size), "") @@ -222,7 +222,7 @@ extern void __add_wrong_size(void) __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX) #define __sync_try_cmpxchg(ptr, pold, new, size) \ - __raw_try_cmpxchg((ptr), (pold), (new), (size), "lock; ") + __raw_try_cmpxchg((ptr), (pold), (new), (size), "lock ") #define __try_cmpxchg_local(ptr, pold, new, size) \ __raw_try_cmpxchg((ptr), (pold), (new), (size), "") diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 95b5f990ca88..8806c646d452 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -105,7 +105,7 @@ static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, static __always_inline u64 arch_cmpxchg64(volatile u64 *ptr, u64 old, u64 new) { - return __arch_cmpxchg64_emu(ptr, old, new, LOCK_PREFIX_HERE, "lock; "); + return __arch_cmpxchg64_emu(ptr, old, new, LOCK_PREFIX_HERE, "lock "); } #define arch_cmpxchg64 arch_cmpxchg64 @@ -140,7 +140,7 @@ static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 static __always_inline bool arch_try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new) { - return __arch_try_cmpxchg64_emu(ptr, oldp, new, LOCK_PREFIX_HERE, "lock; "); + return __arch_try_cmpxchg64_emu(ptr, oldp, new, LOCK_PREFIX_HERE, "lock "); } #define arch_try_cmpxchg64 arch_try_cmpxchg64 diff --git a/arch/x86/include/asm/edac.h b/arch/x86/include/asm/edac.h index 426fc53ff803..dfbd1ebb9f10 100644 --- a/arch/x86/include/asm/edac.h +++ b/arch/x86/include/asm/edac.h @@ -13,7 +13,7 @@ static inline void edac_atomic_scrub(void *va, u32 size) * are interrupt, DMA and SMP safe. 
*/ for (i = 0; i < size / 4; i++, virt_addr++) - asm volatile("lock; addl $0, %0"::"m" (*virt_addr)); + asm volatile("lock addl $0, %0"::"m" (*virt_addr)); } #endif /* _ASM_X86_EDAC_H */ diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h index 6d8d6bc183b7..cd21a0405ac5 100644 --- a/arch/x86/include/asm/sync_bitops.h +++ b/arch/x86/include/asm/sync_bitops.h @@ -31,7 +31,7 @@ */ static inline void sync_set_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; " __ASM_SIZE(bts) " %1,%0" + asm volatile("lock " __ASM_SIZE(bts) " %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -49,7 +49,7 @@ static inline void sync_set_bit(long nr, volatile unsigned long *addr) */ static inline void sync_clear_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; " __ASM_SIZE(btr) " %1,%0" + asm volatile("lock " __ASM_SIZE(btr) " %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -66,7 +66,7 @@ static inline void sync_clear_bit(long nr, volatile unsigned long *addr) */ static inline void sync_change_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; " __ASM_SIZE(btc) " %1,%0" + asm volatile("lock " __ASM_SIZE(btc) " %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -82,7 +82,7 @@ static inline void sync_change_bit(long nr, volatile unsigned long *addr) */ static inline bool sync_test_and_set_bit(long nr, volatile unsigned long *addr) { - return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(bts), *addr, c, "Ir", nr); + return GEN_BINARY_RMWcc("lock " __ASM_SIZE(bts), *addr, c, "Ir", nr); } /** @@ -95,7 +95,7 @@ static inline bool sync_test_and_set_bit(long nr, volatile unsigned long *addr) */ static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) { - return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(btr), *addr, c, "Ir", nr); + return GEN_BINARY_RMWcc("lock " __ASM_SIZE(btr), *addr, c, "Ir", nr); } /** @@ -108,7 +108,7 @@ static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) */ static inline int sync_test_and_change_bit(long nr, volatile unsigned long *addr) { - return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(btc), *addr, c, "Ir", nr); + return GEN_BINARY_RMWcc("lock " __ASM_SIZE(btc), *addr, c, "Ir", nr); } #define sync_test_bit(nr, addr) test_bit(nr, addr) From a1b65f3f7c6f7f0a08a7dba8be458c6415236487 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Nov 2024 14:39:10 +0100 Subject: [PATCH 05/16] lockdep/mm: Fix might_fault() lockdep check of current->mm->mmap_lock Turns out that this commit, about 10 years ago: 9ec23531fd48 ("sched/preempt, mm/fault: Trigger might_sleep() in might_fault() with disabled pagefaults") ... accidentally (and unnessecarily) put the lockdep part of __might_fault() under CONFIG_DEBUG_ATOMIC_SLEEP=y. This is potentially notable because large distributions such as Ubuntu are running with !CONFIG_DEBUG_ATOMIC_SLEEP. Restore the debug check. [ mingo: Update changelog. 
] Fixes: 9ec23531fd48 ("sched/preempt, mm/fault: Trigger might_sleep() in might_fault() with disabled pagefaults") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: Andrew Morton Link: https://lore.kernel.org/r/20241104135517.536628371@infradead.org --- mm/memory.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 539c0f7c6d54..1dfad4536cfe 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6835,10 +6835,8 @@ void __might_fault(const char *file, int line) if (pagefault_disabled()) return; __might_sleep(file, line); -#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) if (current->mm) might_lock_read(¤t->mm->mmap_lock); -#endif } EXPORT_SYMBOL(__might_fault); #endif From c929d08df8bee855528b9d15b853c892c54e1eee Mon Sep 17 00:00:00 2001 From: Maksim Davydov Date: Wed, 15 Jan 2025 16:17:04 +0300 Subject: [PATCH 06/16] x86/split_lock: Fix the delayed detection logic If the warning mode with disabled mitigation mode is used, then on each CPU where the split lock occurred detection will be disabled in order to make progress and delayed work will be scheduled, which then will enable detection back. Now it turns out that all CPUs use one global delayed work structure. This leads to the fact that if a split lock occurs on several CPUs at the same time (within 2 jiffies), only one CPU will schedule delayed work, but the rest will not. The return value of schedule_delayed_work_on() would have shown this, but it is not checked in the code. A diagram that can help to understand the bug reproduction: - sld_update_msr() enables/disables SLD on both CPUs on the same core - schedule_delayed_work_on() internally checks WORK_STRUCT_PENDING_BIT. If a work has the 'pending' status, then schedule_delayed_work_on() will return an error code and, most importantly, the work will not be placed in the workqueue. Let's say we have a multicore system on which split_lock_mitigate=0 and a multithreaded application is running that calls splitlock in multiple threads. Due to the fact that sld_update_msr() affects the entire core (both CPUs), we will consider 2 CPUs from different cores. Let the 2 threads of this application schedule to CPU0 (core 0) and to CPU 2 (core 1), then: | || | | CPU 0 (core 0) || CPU 2 (core 1) | |_________________________________||___________________________________| | || | | 1) SPLIT LOCK occured || | | || | | 2) split_lock_warn() || | | || | | 3) sysctl_sld_mitigate == 0 || | | (work = &sl_reenable) || | | || | | 4) schedule_delayed_work_on() || | | (reenable will be called || | | after 2 jiffies on CPU 0) || | | || | | 5) disable SLD for core 0 || | | || | | ------------------------- || | | || | | || 6) SPLIT LOCK occured | | || | | || 7) split_lock_warn() | | || | | || 8) sysctl_sld_mitigate == 0 | | || (work = &sl_reenable, | | || the same address as in 3) ) | | || | | 2 jiffies || 9) schedule_delayed_work_on() | | || fials because the work is in | | || the pending state since 4). | | || The work wasn't placed to the | | || workqueue. 
reenable won't be | | || called on CPU 2 | | || | | || 10) disable SLD for core 0 | | || | | || From now on SLD will | | || never be reenabled on core 1 | | || | | ------------------------- || | | || | | 11) enable SLD for core 0 by || | | __split_lock_reenable || | | || | If the application threads can be scheduled to all processor cores, then over time there will be only one core left, on which SLD will be enabled and split lock will be able to be detected; and on all other cores SLD will be disabled all the time. Most likely, this bug has not been noticed for so long because sysctl_sld_mitigate default value is 1, and in this case a semaphore is used that does not allow 2 different cores to have SLD disabled at the same time, that is, strictly only one work is placed in the workqueue. In order to fix the warning mode with disabled mitigation mode, delayed work has to be per-CPU. Implement it. Fixes: 727209376f49 ("x86/split_lock: Add sysctl to control the misery mode") Signed-off-by: Maksim Davydov Signed-off-by: Ingo Molnar Tested-by: Guilherme G. Piccoli Cc: Thomas Gleixner Cc: Ravi Bangoria Cc: Tom Lendacky Link: https://lore.kernel.org/r/20250115131704.132609-1-davydov-max@yandex-team.ru --- arch/x86/kernel/cpu/bus_lock.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c index 6cba85c79d42..97222efb4d2a 100644 --- a/arch/x86/kernel/cpu/bus_lock.c +++ b/arch/x86/kernel/cpu/bus_lock.c @@ -192,7 +192,13 @@ static void __split_lock_reenable(struct work_struct *work) { sld_update_msr(true); } -static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable); +/* + * In order for each CPU to schedule its delayed work independently of the + * others, delayed work struct must be per-CPU. This is not required when + * sysctl_sld_mitigate is enabled because of the semaphore that limits + * the number of simultaneously scheduled delayed works to 1. + */ +static DEFINE_PER_CPU(struct delayed_work, sl_reenable); /* * If a CPU goes offline with pending delayed work to re-enable split lock @@ -213,7 +219,7 @@ static int splitlock_cpu_offline(unsigned int cpu) static void split_lock_warn(unsigned long ip) { - struct delayed_work *work; + struct delayed_work *work = NULL; int cpu; if (!current->reported_split_lock) @@ -235,11 +241,17 @@ static void split_lock_warn(unsigned long ip) if (down_interruptible(&buslock_sem) == -EINTR) return; work = &sl_reenable_unlock; - } else { - work = &sl_reenable; } cpu = get_cpu(); + + if (!work) { + work = this_cpu_ptr(&sl_reenable); + /* Deferred initialization of per-CPU struct */ + if (!work->work.func) + INIT_DELAYED_WORK(work, __split_lock_reenable); + } + schedule_delayed_work_on(cpu, work, 2); /* Disable split lock detection on this CPU to make progress */ From b76b44fb656100fcd8482d9102d35d1b6a5129e9 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 7 Mar 2025 15:26:53 -0800 Subject: [PATCH 07/16] locking/lock_events: Add locking events for rtmutex slow paths Add locking events for rtlock_slowlock() and rt_mutex_slowlock() for profiling the slow path behavior of rt_spin_lock() and rt_mutex_lock(). 
Signed-off-by: Waiman Long Signed-off-by: Boqun Feng Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250307232717.1759087-4-boqun.feng@gmail.com --- kernel/locking/lock_events_list.h | 21 +++++++++++++++++++++ kernel/locking/rtmutex.c | 29 ++++++++++++++++++++++++----- 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index 97fb6f3f840a..80b11f194c9f 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -67,3 +67,24 @@ LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */ LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */ + +/* + * Locking events for rtlock_slowlock() + */ +LOCK_EVENT(rtlock_slowlock) /* # of rtlock_slowlock() calls */ +LOCK_EVENT(rtlock_slow_acq1) /* # of locks acquired after wait_lock */ +LOCK_EVENT(rtlock_slow_acq2) /* # of locks acquired in for loop */ +LOCK_EVENT(rtlock_slow_sleep) /* # of sleeps */ +LOCK_EVENT(rtlock_slow_wake) /* # of wakeup's */ + +/* + * Locking events for rt_mutex_slowlock() + */ +LOCK_EVENT(rtmutex_slowlock) /* # of rt_mutex_slowlock() calls */ +LOCK_EVENT(rtmutex_slow_block) /* # of rt_mutex_slowlock_block() calls */ +LOCK_EVENT(rtmutex_slow_acq1) /* # of locks acquired after wait_lock */ +LOCK_EVENT(rtmutex_slow_acq2) /* # of locks acquired at the end */ +LOCK_EVENT(rtmutex_slow_acq3) /* # of locks acquired in *block() */ +LOCK_EVENT(rtmutex_slow_sleep) /* # of sleeps */ +LOCK_EVENT(rtmutex_slow_wake) /* # of wakeup's */ +LOCK_EVENT(rtmutex_deadlock) /* # of rt_mutex_handle_deadlock()'s */ diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 4a8df1800cbb..c80902eacd79 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -27,6 +27,7 @@ #include #include "rtmutex_common.h" +#include "lock_events.h" #ifndef WW_RT # define build_ww_mutex() (false) @@ -1612,10 +1613,13 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, struct task_struct *owner; int ret = 0; + lockevent_inc(rtmutex_slow_block); for (;;) { /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock, current, waiter)) + if (try_to_take_rt_mutex(lock, current, waiter)) { + lockevent_inc(rtmutex_slow_acq3); break; + } if (timeout && !timeout->task) { ret = -ETIMEDOUT; @@ -1638,8 +1642,10 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, owner = NULL; raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); - if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) + if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) { + lockevent_inc(rtmutex_slow_sleep); rt_mutex_schedule(); + } raw_spin_lock_irq(&lock->wait_lock); set_current_state(state); @@ -1694,6 +1700,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, int ret; lockdep_assert_held(&lock->wait_lock); + lockevent_inc(rtmutex_slowlock); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { @@ -1701,6 +1708,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); ww_mutex_lock_acquired(ww, ww_ctx); } + lockevent_inc(rtmutex_slow_acq1); return 0; } @@ -1719,10 +1727,12 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); ww_mutex_lock_acquired(ww, ww_ctx); } + lockevent_inc(rtmutex_slow_acq2); } 
else { __set_current_state(TASK_RUNNING); remove_waiter(lock, waiter); rt_mutex_handle_deadlock(ret, chwalk, lock, waiter); + lockevent_inc(rtmutex_deadlock); } /* @@ -1751,6 +1761,7 @@ static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock, &waiter, wake_q); debug_rt_mutex_free_waiter(&waiter); + lockevent_cond_inc(rtmutex_slow_wake, !wake_q_empty(wake_q)); return ret; } @@ -1823,9 +1834,12 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, struct task_struct *owner; lockdep_assert_held(&lock->wait_lock); + lockevent_inc(rtlock_slowlock); - if (try_to_take_rt_mutex(lock, current, NULL)) + if (try_to_take_rt_mutex(lock, current, NULL)) { + lockevent_inc(rtlock_slow_acq1); return; + } rt_mutex_init_rtlock_waiter(&waiter); @@ -1838,8 +1852,10 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, for (;;) { /* Try to acquire the lock again */ - if (try_to_take_rt_mutex(lock, current, &waiter)) + if (try_to_take_rt_mutex(lock, current, &waiter)) { + lockevent_inc(rtlock_slow_acq2); break; + } if (&waiter == rt_mutex_top_waiter(lock)) owner = rt_mutex_owner(lock); @@ -1847,8 +1863,10 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, owner = NULL; raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); - if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) + if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) { + lockevent_inc(rtlock_slow_sleep); schedule_rtlock(); + } raw_spin_lock_irq(&lock->wait_lock); set_current_state(TASK_RTLOCK_WAIT); @@ -1865,6 +1883,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, debug_rt_mutex_free_waiter(&waiter); trace_contention_end(lock, 0); + lockevent_cond_inc(rtlock_slow_wake, !wake_q_empty(wake_q)); } static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock) From a94d32446ab5e85c7ba97e48a8c9bd85be4fe889 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 7 Mar 2025 15:26:54 -0800 Subject: [PATCH 08/16] locking/lock_events: Add locking events for lockdep Add some lock events to lockdep to profile its behavior. 
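For context, a minimal usage sketch (not part of the patch): a LOCK_EVENT(foo) entry in lock_events_list.h becomes a per-CPU counter when CONFIG_LOCK_EVENT_COUNTS=y, the helpers below bump it, and the summed counts are read back from debugfs (typically under /sys/kernel/debug/lock_event_counts/); with the option disabled the helpers compile away to nothing.

    #include "lock_events.h"

    static void profiled_path(bool checked)
    {
            /* Unconditionally count that this path was reached. */
            lockevent_inc(lockdep_acquire);

            /* Count only when the condition is true. */
            lockevent_cond_inc(lockdep_nocheck, !checked);
    }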
Signed-off-by: Waiman Long Signed-off-by: Boqun Feng Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250307232717.1759087-5-boqun.feng@gmail.com --- kernel/locking/lock_events_list.h | 7 +++++++ kernel/locking/lockdep.c | 8 +++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index 80b11f194c9f..9ef9850aeebe 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -88,3 +88,10 @@ LOCK_EVENT(rtmutex_slow_acq3) /* # of locks acquired in *block() */ LOCK_EVENT(rtmutex_slow_sleep) /* # of sleeps */ LOCK_EVENT(rtmutex_slow_wake) /* # of wakeup's */ LOCK_EVENT(rtmutex_deadlock) /* # of rt_mutex_handle_deadlock()'s */ + +/* + * Locking events for lockdep + */ +LOCK_EVENT(lockdep_acquire) +LOCK_EVENT(lockdep_lock) +LOCK_EVENT(lockdep_nocheck) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4470680f0226..8436f017c74d 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -61,6 +61,7 @@ #include #include "lockdep_internals.h" +#include "lock_events.h" #include @@ -170,6 +171,7 @@ static struct task_struct *lockdep_selftest_task_struct; static int graph_lock(void) { lockdep_lock(); + lockevent_inc(lockdep_lock); /* * Make sure that if another CPU detected a bug while * walking the graph we dont change it (while the other @@ -5091,8 +5093,12 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (unlikely(lock->key == &__lockdep_no_track__)) return 0; - if (!prove_locking || lock->key == &__lockdep_no_validate__) + lockevent_inc(lockdep_acquire); + + if (!prove_locking || lock->key == &__lockdep_no_validate__) { check = 0; + lockevent_inc(lockdep_nocheck); + } if (subclass < NR_LOCKDEP_CACHING_CLASSES) class = lock->class_cache[subclass]; From ee57ab5a32129f599ee1d358548dbebcb5e45953 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 7 Mar 2025 15:26:55 -0800 Subject: [PATCH 09/16] locking/lockdep: Disable KASAN instrumentation of lockdep.c Both KASAN and LOCKDEP are commonly enabled in building a debug kernel. Each of them can significantly slow down the speed of a debug kernel. Enabling KASAN instrumentation of the LOCKDEP code will further slow things down. Since LOCKDEP is a high overhead debugging tool, it will never get enabled in a production kernel. The LOCKDEP code is also pretty mature and is unlikely to get major changes. There is also a possibility of recursion similar to KCSAN. To evaluate the performance impact of disabling KASAN instrumentation of lockdep.c, the time to do a parallel build of the Linux defconfig kernel was used as the benchmark. Two x86-64 systems (Skylake & Zen 2) and an arm64 system were used as test beds. Two sets of non-RT and RT kernels with similar configurations except mainly CONFIG_PREEMPT_RT were used for evaluation. 
For the Skylake system: Kernel Run time Sys time ------ -------- -------- Non-debug kernel (baseline) 0m47.642s 4m19.811s [CONFIG_KASAN_INLINE=y] Debug kernel 2m11.108s (x2.8) 38m20.467s (x8.9) Debug kernel (patched) 1m49.602s (x2.3) 31m28.501s (x7.3) Debug kernel (patched + mitigations=off) 1m30.988s (x1.9) 26m41.993s (x6.2) RT kernel (baseline) 0m54.871s 7m15.340s [CONFIG_KASAN_INLINE=n] RT debug kernel 6m07.151s (x6.7) 135m47.428s (x18.7) RT debug kernel (patched) 3m42.434s (x4.1) 74m51.636s (x10.3) RT debug kernel (patched + mitigations=off) 2m40.383s (x2.9) 57m54.369s (x8.0) [CONFIG_KASAN_INLINE=y] RT debug kernel 3m22.155s (x3.7) 77m53.018s (x10.7) RT debug kernel (patched) 2m36.700s (x2.9) 54m31.195s (x7.5) RT debug kernel (patched + mitigations=off) 2m06.110s (x2.3) 45m49.493s (x6.3) For the Zen 2 system: Kernel Run time Sys time ------ -------- -------- Non-debug kernel (baseline) 1m42.806s 39m48.714s [CONFIG_KASAN_INLINE=y] Debug kernel 4m04.524s (x2.4) 125m35.904s (x3.2) Debug kernel (patched) 3m56.241s (x2.3) 127m22.378s (x3.2) Debug kernel (patched + mitigations=off) 2m38.157s (x1.5) 92m35.680s (x2.3) RT kernel (baseline) 1m51.500s 14m56.322s [CONFIG_KASAN_INLINE=n] RT debug kernel 16m04.962s (x8.7) 244m36.463s (x16.4) RT debug kernel (patched) 9m09.073s (x4.9) 129m28.439s (x8.7) RT debug kernel (patched + mitigations=off) 3m31.662s (x1.9) 51m01.391s (x3.4) For the arm64 system: Kernel Run time Sys time ------ -------- -------- Non-debug kernel (baseline) 1m56.844s 8m47.150s Debug kernel 3m54.774s (x2.0) 92m30.098s (x10.5) Debug kernel (patched) 3m32.429s (x1.8) 77m40.779s (x8.8) RT kernel (baseline) 4m01.641s 18m16.777s [CONFIG_KASAN_INLINE=n] RT debug kernel 19m32.977s (x4.9) 304m23.965s (x16.7) RT debug kernel (patched) 16m28.354s (x4.1) 234m18.149s (x12.8) Turning the mitigations off doesn't seems to have any noticeable impact on the performance of the arm64 system. So the mitigation=off entries aren't included. For the x86 CPUs, CPU mitigations has a much bigger impact on performance, especially the RT debug kernel with CONFIG_KASAN_INLINE=n. The SRSO mitigation in Zen 2 has an especially big impact on the debug kernel. It is also the majority of the slowdown with mitigations on. It is because the patched RET instruction slows down function returns. A lot of helper functions that are normally compiled out or inlined may become real function calls in the debug kernel. With !CONFIG_KASAN_INLINE, the KASAN instrumentation inserts a lot of __asan_loadX*() and __kasan_check_read() function calls to memory access portion of the code. The lockdep's __lock_acquire() function, for instance, has 66 __asan_loadX*() and 6 __kasan_check_read() calls added with KASAN instrumentation. Of course, the actual numbers may vary depending on the compiler used and the exact version of the lockdep code. With the Skylake test system, the parallel kernel build times reduction of the RT debug kernel with this patch are: CONFIG_KASAN_INLINE=n: -37% CONFIG_KASAN_INLINE=y: -22% The time reduction is less with CONFIG_KASAN_INLINE=y, but it is still significant. Setting CONFIG_KASAN_INLINE=y can result in a significant performance improvement. The major drawback is a significant increase in the size of kernel text. In the case of vmlinux, its text size increases from 45997948 to 67606807. That is a 47% size increase (about 21 Mbytes). The size increase of other kernel modules should be similar. 
With the newly added rtmutex and lockdep lock events, the relevant event counts for the test runs with the Skylake system were: Event type Debug kernel RT debug kernel ---------- ------------ --------------- lockdep_acquire 1,968,663,277 5,425,313,953 rtlock_slowlock - 401,701,156 rtmutex_slowlock - 139,672 The __lock_acquire() calls in the RT debug kernel are x2.8 times of the non-RT debug kernel with the same workload. Since the __lock_acquire() function is a big hitter in term of performance slowdown, this makes the RT debug kernel much slower than the non-RT one. The average lock nesting depth is likely to be higher in the RT debug kernel too leading to longer execution time in the __lock_acquire() function. As the small advantage of enabling KASAN instrumentation to catch potential memory access error in the lockdep debugging tool is probably not worth the drawback of further slowing down a debug kernel, disable KASAN instrumentation in the lockdep code to allow the debug kernels to regain some performance back, especially for the RT debug kernels. Signed-off-by: Waiman Long Signed-off-by: Boqun Feng Signed-off-by: Ingo Molnar Reviewed-by: Marco Elver Reviewed-by: Andrey Konovalov Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250307232717.1759087-6-boqun.feng@gmail.com --- kernel/locking/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 0db4093d17b8..a114949eeed5 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -5,7 +5,8 @@ KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o -# Avoid recursion lockdep -> sanitizer -> ... -> lockdep. +# Avoid recursion lockdep -> sanitizer -> ... -> lockdep & improve performance. +KASAN_SANITIZE_lockdep.o := n KCSAN_SANITIZE_lockdep.o := n ifdef CONFIG_FUNCTION_TRACER From de4b59d652646cf00cf632174348ca2266099edc Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 7 Mar 2025 15:26:56 -0800 Subject: [PATCH 10/16] locking/lockdep: Add kasan_check_byte() check in lock_acquire() KASAN instrumentation of lockdep has been disabled, as we don't need KASAN to check the validity of lockdep internal data structures and incur unnecessary performance overhead. However, the lockdep_map pointer passed in externally may not be valid (e.g. use-after-free) and we run the risk of using garbage data resulting in false lockdep reports. Add kasan_check_byte() call in lock_acquire() for non kernel core data object to catch invalid lockdep_map and print out a KASAN report before any lockdep splat, if any. 
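As an illustration of what the check catches (a contrived use-after-free, not taken from the patch): once the lock's memory is freed, the kasan_check_byte() in lock_acquire() reports the bad access before lockdep dereferences the stale lockdep_map.

    #include <linux/mutex.h>
    #include <linux/slab.h>

    struct foo {
            struct mutex lock;
    };

    static void demo_uaf_lock(void)
    {
            struct foo *f = kmalloc(sizeof(*f), GFP_KERNEL);

            if (!f)
                    return;
            mutex_init(&f->lock);
            kfree(f);
            /* KASAN report from lock_acquire(), before any lockdep splat. */
            mutex_lock(&f->lock);
            mutex_unlock(&f->lock);
    }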
Suggested-by: Marco Elver Signed-off-by: Waiman Long Signed-off-by: Boqun Feng Signed-off-by: Ingo Molnar Reviewed-by: Marco Elver Reviewed-by: Andrey Konovalov Link: https://lore.kernel.org/r/20250214195242.2480920-1-longman@redhat.com Link: https://lore.kernel.org/r/20250307232717.1759087-7-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 8436f017c74d..b15757e63626 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -57,6 +57,7 @@ #include #include #include +#include <linux/kasan.h> #include @@ -5830,6 +5831,14 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!debug_locks) return; + /* + * As KASAN instrumentation is disabled and lock_acquire() is usually + * the first lockdep call when a task tries to acquire a lock, add + * kasan_check_byte() here to check for use-after-free and other + * memory errors. + */ + kasan_check_byte(lock); + if (unlikely(!lockdep_enabled())) { /* XXX allow trylock from NMI ?!? */ if (lockdep_nmi() && !trylock) { From 8f65291dae0e75941cf76e427088e5612c4d692e Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Fri, 7 Mar 2025 15:26:57 -0800 Subject: [PATCH 11/16] rust: sync: Add accessor for the lock behind a given guard In order to assert a particular `Guard` is associated with a particular `Lock`, add an accessor to obtain a reference to the underlying `Lock` of a `Guard`. Binder needs this assertion to ensure unsafe list operations are done with the correct lock held. [Boqun: Capitalize the title and reword the commit log] Signed-off-by: Alice Ryhl Signed-off-by: Boqun Feng Signed-off-by: Ingo Molnar Reviewed-by: Fiona Behrens Link: https://lore.kernel.org/r/20250205-guard-get-lock-v2-1-ba32a8c1d5b7@google.com Link: https://lore.kernel.org/r/20250307232717.1759087-8-boqun.feng@gmail.com --- rust/kernel/sync/lock.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/rust/kernel/sync/lock.rs b/rust/kernel/sync/lock.rs index eb80048e0110..8e7e6d5f61e4 100644 --- a/rust/kernel/sync/lock.rs +++ b/rust/kernel/sync/lock.rs @@ -199,7 +199,12 @@ pub struct Guard<'a, T: ?Sized, B: Backend> { // SAFETY: `Guard` is sync when the data protected by the lock is also sync. unsafe impl Sync for Guard<'_, T, B> {} -impl Guard<'_, T, B> { +impl<'a, T: ?Sized, B: Backend> Guard<'a, T, B> { + /// Returns the lock that this guard originates from. + pub fn lock_ref(&self) -> &'a Lock { + self.lock + } + pub(crate) fn do_unlocked(&mut self, cb: impl FnOnce() -> U) -> U { // SAFETY: The caller owns the lock, so it is safe to unlock it. unsafe { B::unlock(self.lock.state.get(), &self.state) }; From c2849afafd08a1c3ad881129de48ebe1642f7675 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Mar 2025 15:26:58 -0800 Subject: [PATCH 12/16] rust: sync: lock: Add an example for Guard::lock_ref() To provide an example of the usage of `Guard::lock_ref()` along with a unit test, an "assert a lock is held by a guard" example is added. (Also apply feedback from Benno.)
Signed-off-by: Boqun Feng Signed-off-by: Ingo Molnar Reviewed-by: Benno Lossin Reviewed-by: Alice Ryhl Link: https://lore.kernel.org/r/20250223072114.3715-1-boqun.feng@gmail.com Link: https://lore.kernel.org/r/20250307232717.1759087-9-boqun.feng@gmail.com --- rust/kernel/sync/lock.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/rust/kernel/sync/lock.rs b/rust/kernel/sync/lock.rs index 8e7e6d5f61e4..f53e87d04cd2 100644 --- a/rust/kernel/sync/lock.rs +++ b/rust/kernel/sync/lock.rs @@ -201,6 +201,30 @@ unsafe impl Sync for Guard<'_, T, B> {} impl<'a, T: ?Sized, B: Backend> Guard<'a, T, B> { /// Returns the lock that this guard originates from. + /// + /// # Examples + /// + /// The following example shows how to use [`Guard::lock_ref()`] to assert the corresponding + /// lock is held. + /// + /// ``` + /// # use kernel::{new_spinlock, stack_pin_init, sync::lock::{Backend, Guard, Lock}}; + /// + /// fn assert_held(guard: &Guard<'_, T, B>, lock: &Lock) { + /// // Address-equal means the same lock. + /// assert!(core::ptr::eq(guard.lock_ref(), lock)); + /// } + /// + /// // Creates a new lock on the stack. + /// stack_pin_init!{ + /// let l = new_spinlock!(42) + /// } + /// + /// let g = l.lock(); + /// + /// // `g` originates from `l`. + /// assert_held(&g, &l); + /// ``` pub fn lock_ref(&self) -> &'a Lock { self.lock } From 70b9c8563c9c29102a785d4822f0d77d33fee808 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Fri, 7 Mar 2025 15:26:59 -0800 Subject: [PATCH 13/16] rust: sync: condvar: Add wait_interruptible_freezable() To support waiting for a `CondVar` as a freezable process, add a wait_interruptible_freezable() function. Binder needs this function in the appropriate places to freeze a process where some of its threads are blocked on the Binder driver. [ Boqun: Cleaned up the changelog and documentation. ] Signed-off-by: Alice Ryhl Signed-off-by: Boqun Feng Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250307232717.1759087-10-boqun.feng@gmail.com --- rust/kernel/sync/condvar.rs | 23 ++++++++++++++++++++++- rust/kernel/task.rs | 2 ++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/rust/kernel/sync/condvar.rs b/rust/kernel/sync/condvar.rs index 7df565038d7d..5c1e546a26c3 100644 --- a/rust/kernel/sync/condvar.rs +++ b/rust/kernel/sync/condvar.rs @@ -11,7 +11,9 @@ use crate::{ init::PinInit, pin_init, str::CStr, - task::{MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE, TASK_NORMAL, TASK_UNINTERRUPTIBLE}, + task::{ + MAX_SCHEDULE_TIMEOUT, TASK_FREEZABLE, TASK_INTERRUPTIBLE, TASK_NORMAL, TASK_UNINTERRUPTIBLE, + }, time::Jiffies, types::Opaque, }; @@ -159,6 +161,25 @@ impl CondVar { crate::current!().signal_pending() } + /// Releases the lock and waits for a notification in interruptible and freezable mode. + /// + /// The process is allowed to be frozen during this sleep. No lock should be held when calling + /// this function, and there is a lockdep assertion for this. Freezing a task that holds a lock + /// can trivially deadlock vs another task that needs that lock to complete before it too can + /// hit freezable. + #[must_use = "wait_interruptible_freezable returns if a signal is pending, so the caller must check the return value"] + pub fn wait_interruptible_freezable( + &self, + guard: &mut Guard<'_, T, B>, + ) -> bool { + self.wait_internal( + TASK_INTERRUPTIBLE | TASK_FREEZABLE, + guard, + MAX_SCHEDULE_TIMEOUT, + ); + crate::current!().signal_pending() + } + /// Releases the lock and waits for a notification in interruptible mode. 
/// /// Atomically releases the given lock (whose ownership is proven by the guard) and puts the diff --git a/rust/kernel/task.rs b/rust/kernel/task.rs index 07bc22a7645c..ea43a3b8d9c5 100644 --- a/rust/kernel/task.rs +++ b/rust/kernel/task.rs @@ -23,6 +23,8 @@ pub const MAX_SCHEDULE_TIMEOUT: c_long = c_long::MAX; pub const TASK_INTERRUPTIBLE: c_int = bindings::TASK_INTERRUPTIBLE as c_int; /// Bitmask for tasks that are sleeping in an uninterruptible state. pub const TASK_UNINTERRUPTIBLE: c_int = bindings::TASK_UNINTERRUPTIBLE as c_int; +/// Bitmask for tasks that are sleeping in a freezable state. +pub const TASK_FREEZABLE: c_int = bindings::TASK_FREEZABLE as c_int; /// Convenience constant for waking up tasks regardless of whether they are in interruptible or /// uninterruptible sleep. pub const TASK_NORMAL: c_uint = bindings::TASK_NORMAL as c_uint; From f73ca66f0d7f4371d172d6f5b1f9a00e367ba921 Mon Sep 17 00:00:00 2001 From: Mitchell Levy Date: Fri, 7 Mar 2025 15:27:01 -0800 Subject: [PATCH 14/16] rust: lockdep: Use Pin for all LockClassKey usages Reintroduce dynamically-allocated LockClassKeys such that they are automatically (de)registered. Require that all usages of LockClassKeys ensure that they are Pin'd. Currently, only `'static` LockClassKeys are supported, so Pin is redundant. However, it is intended that dynamically-allocated LockClassKeys will eventually be supported, so using Pin from the outset will make that change simpler. Closes: https://github.com/Rust-for-Linux/linux/issues/1102 Suggested-by: Benno Lossin Suggested-by: Boqun Feng Signed-off-by: Mitchell Levy Signed-off-by: Boqun Feng Signed-off-by: Ingo Molnar Reviewed-by: Benno Lossin Link: https://lore.kernel.org/r/20250307232717.1759087-12-boqun.feng@gmail.com --- rust/helpers/helpers.c | 1 + rust/helpers/sync.c | 13 ++++++++ rust/kernel/sync.rs | 57 +++++++++++++++++++++++++++++++-- rust/kernel/sync/condvar.rs | 5 ++- rust/kernel/sync/lock.rs | 4 +-- rust/kernel/sync/lock/global.rs | 5 +-- rust/kernel/sync/poll.rs | 2 +- rust/kernel/workqueue.rs | 2 +- 8 files changed, 77 insertions(+), 12 deletions(-) create mode 100644 rust/helpers/sync.c diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c index 0640b7e115be..4c1a10a419cf 100644 --- a/rust/helpers/helpers.c +++ b/rust/helpers/helpers.c @@ -30,6 +30,7 @@ #include "signal.c" #include "slab.c" #include "spinlock.c" +#include "sync.c" #include "task.c" #include "uaccess.c" #include "vmalloc.c" diff --git a/rust/helpers/sync.c b/rust/helpers/sync.c new file mode 100644 index 000000000000..ff7e68b48810 --- /dev/null +++ b/rust/helpers/sync.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +void rust_helper_lockdep_register_key(struct lock_class_key *k) +{ + lockdep_register_key(k); +} + +void rust_helper_lockdep_unregister_key(struct lock_class_key *k) +{ + lockdep_unregister_key(k); +} diff --git a/rust/kernel/sync.rs b/rust/kernel/sync.rs index 16eab9138b2b..4104bc26471a 100644 --- a/rust/kernel/sync.rs +++ b/rust/kernel/sync.rs @@ -5,6 +5,8 @@ //! This module contains the kernel APIs related to synchronisation that have been ported or //! wrapped for usage by Rust code in the kernel. +use crate::pin_init; +use crate::prelude::*; use crate::types::Opaque; mod arc; @@ -23,15 +25,64 @@ pub use locked_by::LockedBy; /// Represents a lockdep class. It's a wrapper around C's `lock_class_key`. 
#[repr(transparent)] -pub struct LockClassKey(Opaque); +#[pin_data(PinnedDrop)] +pub struct LockClassKey { + #[pin] + inner: Opaque, +} // SAFETY: `bindings::lock_class_key` is designed to be used concurrently from multiple threads and // provides its own synchronization. unsafe impl Sync for LockClassKey {} impl LockClassKey { + /// Initializes a dynamically allocated lock class key. In the common case of using a + /// statically allocated lock class key, the static_lock_class! macro should be used instead. + /// + /// # Example + /// ``` + /// # use kernel::{c_str, stack_pin_init}; + /// # use kernel::alloc::KBox; + /// # use kernel::types::ForeignOwnable; + /// # use kernel::sync::{LockClassKey, SpinLock}; + /// + /// let key = KBox::pin_init(LockClassKey::new_dynamic(), GFP_KERNEL)?; + /// let key_ptr = key.into_foreign(); + /// + /// { + /// stack_pin_init!(let num: SpinLock = SpinLock::new( + /// 0, + /// c_str!("my_spinlock"), + /// // SAFETY: `key_ptr` is returned by the above `into_foreign()`, whose + /// // `from_foreign()` has not yet been called. + /// unsafe { > as ForeignOwnable>::borrow(key_ptr) } + /// )); + /// } + /// + /// // SAFETY: We dropped `num`, the only use of the key, so the result of the previous + /// // `borrow` has also been dropped. Thus, it's safe to use from_foreign. + /// unsafe { drop(> as ForeignOwnable>::from_foreign(key_ptr)) }; + /// + /// # Ok::<(), Error>(()) + /// ``` + pub fn new_dynamic() -> impl PinInit { + pin_init!(Self { + // SAFETY: lockdep_register_key expects an uninitialized block of memory + inner <- Opaque::ffi_init(|slot| unsafe { bindings::lockdep_register_key(slot) }) + }) + } + pub(crate) fn as_ptr(&self) -> *mut bindings::lock_class_key { - self.0.get() + self.inner.get() + } +} + +#[pinned_drop] +impl PinnedDrop for LockClassKey { + fn drop(self: Pin<&mut Self>) { + // SAFETY: self.as_ptr was registered with lockdep and self is pinned, so the address + // hasn't changed. Thus, it's safe to pass to unregister. + unsafe { bindings::lockdep_unregister_key(self.as_ptr()) } } } @@ -44,7 +95,7 @@ macro_rules! static_lock_class { // SAFETY: lockdep expects uninitialized memory when it's handed a statically allocated // lock_class_key unsafe { ::core::mem::MaybeUninit::uninit().assume_init() }; - &CLASS + $crate::prelude::Pin::static_ref(&CLASS) }}; } diff --git a/rust/kernel/sync/condvar.rs b/rust/kernel/sync/condvar.rs index 5c1e546a26c3..fbf68ada582f 100644 --- a/rust/kernel/sync/condvar.rs +++ b/rust/kernel/sync/condvar.rs @@ -17,8 +17,7 @@ use crate::{ time::Jiffies, types::Opaque, }; -use core::marker::PhantomPinned; -use core::ptr; +use core::{marker::PhantomPinned, pin::Pin, ptr}; use macros::pin_data; /// Creates a [`CondVar`] initialiser with the given name and a newly-created lock class. @@ -103,7 +102,7 @@ unsafe impl Sync for CondVar {} impl CondVar { /// Constructs a new condvar initialiser. 
- pub fn new(name: &'static CStr, key: &'static LockClassKey) -> impl PinInit { + pub fn new(name: &'static CStr, key: Pin<&'static LockClassKey>) -> impl PinInit { pin_init!(Self { _pin: PhantomPinned, // SAFETY: `slot` is valid while the closure is called and both `name` and `key` have diff --git a/rust/kernel/sync/lock.rs b/rust/kernel/sync/lock.rs index f53e87d04cd2..360a10a9216d 100644 --- a/rust/kernel/sync/lock.rs +++ b/rust/kernel/sync/lock.rs @@ -12,7 +12,7 @@ use crate::{ str::CStr, types::{NotThreadSafe, Opaque, ScopeGuard}, }; -use core::{cell::UnsafeCell, marker::PhantomPinned}; +use core::{cell::UnsafeCell, marker::PhantomPinned, pin::Pin}; use macros::pin_data; pub mod mutex; @@ -129,7 +129,7 @@ unsafe impl Sync for Lock {} impl Lock { /// Constructs a new lock initialiser. - pub fn new(t: T, name: &'static CStr, key: &'static LockClassKey) -> impl PinInit { + pub fn new(t: T, name: &'static CStr, key: Pin<&'static LockClassKey>) -> impl PinInit { pin_init!(Self { data: UnsafeCell::new(t), _pin: PhantomPinned, diff --git a/rust/kernel/sync/lock/global.rs b/rust/kernel/sync/lock/global.rs index 480ee724e3cc..d65f94b5caf2 100644 --- a/rust/kernel/sync/lock/global.rs +++ b/rust/kernel/sync/lock/global.rs @@ -13,6 +13,7 @@ use crate::{ use core::{ cell::UnsafeCell, marker::{PhantomData, PhantomPinned}, + pin::Pin, }; /// Trait implemented for marker types for global locks. @@ -26,7 +27,7 @@ pub trait GlobalLockBackend { /// The backend used for this global lock. type Backend: Backend + 'static; /// The class for this global lock. - fn get_lock_class() -> &'static LockClassKey; + fn get_lock_class() -> Pin<&'static LockClassKey>; } /// Type used for global locks. @@ -270,7 +271,7 @@ macro_rules! global_lock { type Item = $valuety; type Backend = $crate::global_lock_inner!(backend $kind); - fn get_lock_class() -> &'static $crate::sync::LockClassKey { + fn get_lock_class() -> Pin<&'static $crate::sync::LockClassKey> { $crate::static_lock_class!() } } diff --git a/rust/kernel/sync/poll.rs b/rust/kernel/sync/poll.rs index d5f17153b424..c4934f82d68b 100644 --- a/rust/kernel/sync/poll.rs +++ b/rust/kernel/sync/poll.rs @@ -89,7 +89,7 @@ pub struct PollCondVar { impl PollCondVar { /// Constructs a new condvar initialiser. - pub fn new(name: &'static CStr, key: &'static LockClassKey) -> impl PinInit { + pub fn new(name: &'static CStr, key: Pin<&'static LockClassKey>) -> impl PinInit { pin_init!(Self { inner <- CondVar::new(name, key), }) diff --git a/rust/kernel/workqueue.rs b/rust/kernel/workqueue.rs index 0cd100d2aefb..6b6f3ad08951 100644 --- a/rust/kernel/workqueue.rs +++ b/rust/kernel/workqueue.rs @@ -369,7 +369,7 @@ unsafe impl Sync for Work {} impl Work { /// Creates a new instance of [`Work`]. #[inline] - pub fn new(name: &'static CStr, key: &'static LockClassKey) -> impl PinInit + pub fn new(name: &'static CStr, key: Pin<&'static LockClassKey>) -> impl PinInit where T: WorkItem, { From 87886b32d669abc11c7be95ef44099215e4f5788 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 12 Feb 2025 11:36:18 +0100 Subject: [PATCH 15/16] lockdep: Don't disable interrupts on RT in disable_irq_nosync_lockdep.*() disable_irq_nosync_lockdep() disables interrupts with lockdep enabled to avoid false positive reports by lockdep that a certain lock has not been acquired with disabled interrupts. The user of this macros expects that a lock can be acquried without disabling interrupts because the IRQ line triggering the interrupt is disabled. 
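A sketch of that usage pattern (hypothetical driver code; only the interrupt.h helpers are real): with the IRQ line masked the handler cannot run, so the lock may be taken without disabling interrupts, while the lockdep variants additionally disable interrupts just to keep lockdep's IRQ-state tracking consistent.

    #include <linux/interrupt.h>
    #include <linux/spinlock.h>

    struct my_dev {                 /* hypothetical */
            int irq;
            spinlock_t lock;
    };

    static void quiesce_device(struct my_dev *d)
    {
            unsigned long flags;

            /* Masks the IRQ line; with lockdep this also did local_irq_save(). */
            disable_irq_nosync_lockdep_irqsave(d->irq, &flags);

            spin_lock(&d->lock);    /* safe: the handler cannot fire */
            /* ... update state shared with the interrupt handler ... */
            spin_unlock(&d->lock);

            enable_irq_lockdep_irqrestore(d->irq, &flags);
    }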
This triggers a warning on PREEMPT_RT because after disable_irq_nosync_lockdep.*() the following spinlock_t is now acquired with disabled interrupts. On PREEMPT_RT there is no difference between spin_lock() and spin_lock_irq() so avoiding disabling interrupts in this case works for the two remaining callers as of today. Don't disable interrupts on PREEMPT_RT in disable_irq_nosync_lockdep.*(). Closes: https://lore.kernel.org/760e34f9-6034-40e0-82a5-ee9becd24438@roeck-us.net Fixes: e8106b941ceab ("[PATCH] lockdep: core, add enable/disable_irq_irqsave/irqrestore() APIs") Reported-by: Guenter Roeck Suggested-by: "Steven Rostedt (Google)" Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Tested-by: Guenter Roeck Link: https://lore.kernel.org/r/20250212103619.2560503-2-bigeasy@linutronix.de --- include/linux/interrupt.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 8cd9327e4e78..a1b1be9bf73b 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -448,7 +448,7 @@ irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec, static inline void disable_irq_nosync_lockdep(unsigned int irq) { disable_irq_nosync(irq); -#ifdef CONFIG_LOCKDEP +#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT) local_irq_disable(); #endif } @@ -456,7 +456,7 @@ static inline void disable_irq_nosync_lockdep(unsigned int irq) static inline void disable_irq_nosync_lockdep_irqsave(unsigned int irq, unsigned long *flags) { disable_irq_nosync(irq); -#ifdef CONFIG_LOCKDEP +#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT) local_irq_save(*flags); #endif } @@ -471,7 +471,7 @@ static inline void disable_irq_lockdep(unsigned int irq) static inline void enable_irq_lockdep(unsigned int irq) { -#ifdef CONFIG_LOCKDEP +#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT) local_irq_enable(); #endif enable_irq(irq); @@ -479,7 +479,7 @@ static inline void enable_irq_lockdep(unsigned int irq) static inline void enable_irq_lockdep_irqrestore(unsigned int irq, unsigned long *flags) { -#ifdef CONFIG_LOCKDEP +#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT) local_irq_restore(*flags); #endif enable_irq(irq); From 35e6b537af85d97e0aafd8f2829dfa884a22df20 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 12 Feb 2025 11:36:19 +0100 Subject: [PATCH 16/16] lockdep: Remove disable_irq_lockdep() disable_irq_lockdep() has no users; the last one was probably removed in 0b7c874348ea1 ("forcedeth: fix unilateral interrupt disabling in netpoll path") Remove disable_irq_lockdep(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20250212103619.2560503-3-bigeasy@linutronix.de --- include/linux/interrupt.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index a1b1be9bf73b..c782a74d2a30 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -461,14 +461,6 @@ static inline void disable_irq_nosync_lockdep_irqsave(unsigned int irq, unsigned #endif } -static inline void disable_irq_lockdep(unsigned int irq) -{ - disable_irq(irq); -#ifdef CONFIG_LOCKDEP - local_irq_disable(); -#endif -} - static inline void enable_irq_lockdep(unsigned int irq) { #if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT)