/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Resilient Queued Spin Lock
 *
 * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
 *
 * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com>
 */
#ifndef __ASM_GENERIC_RQSPINLOCK_H
#define __ASM_GENERIC_RQSPINLOCK_H

#include <linux/types.h>
#include <vdso/time64.h>
#include <linux/percpu.h>
#ifdef CONFIG_QUEUED_SPINLOCKS
#include <asm/qspinlock.h>
#endif

struct rqspinlock {
	union {
		atomic_t val;
		u32 locked;
	};
};

bpf: Introduce rqspinlock kfuncs

Introduce four new kfuncs, bpf_res_spin_lock and bpf_res_spin_unlock,
along with their irqsave/irqrestore variants, which wrap the rqspinlock
APIs. bpf_res_spin_lock returns a conditional result depending on
whether the lock was acquired: NULL is returned on success, and a
non-NULL pointer on failure. On failure, the memory pointed to by the
returned pointer can be dereferenced after the NULL check to obtain the
error code.

Instead of reusing the old bpf_spin_lock type, introduce a new type with
the same layout and alignment but a different name, to avoid type
confusion.

Preemption is disabled upon successful lock acquisition; however, IRQs
are not. Special kfuncs can be introduced later to allow disabling IRQs
when taking a spin lock. Resilient locks are safe against AA deadlocks,
hence not disabling IRQs currently does not allow violation of kernel
safety.

The __irq_flag annotation is used to accept IRQ flags for the IRQ
variants, with the same semantics as the existing
bpf_local_irq_{save, restore}.

These kfuncs will require additional verifier-side support in subsequent
commits, to allow programs to hold multiple locks at the same time.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20250316040541.108729-23-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
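
As a rough illustration of the calling convention described above, the
sketch below shows how a BPF program might take and release one of these
locks. The extern declarations, the global placement of the lock, and the
program section are assumptions made for this example, not definitions
taken from the patch.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* Illustrative kfunc declarations, following the description above. */
extern int *bpf_res_spin_lock(struct bpf_res_spin_lock *lock) __ksym;
extern void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) __ksym;

/* Hypothetical shared state protected by the resilient lock. */
struct shared_val {
	struct bpf_res_spin_lock lock;
	__u64 counter;
};

static struct shared_val shared;

SEC("tc")
int res_lock_example(struct __sk_buff *skb)
{
	int *err;

	/* NULL means the lock was acquired; non-NULL points at the error code. */
	err = bpf_res_spin_lock(&shared.lock);
	if (err) {
		/* *err is e.g. -EDEADLK (AA/ABBA) or -ETIMEDOUT (timeout). */
		bpf_printk("res_spin_lock failed: %d", *err);
		return 0;
	}

	shared.counter++;
	bpf_res_spin_unlock(&shared.lock);
	return 0;
}

char _license[] SEC("license") = "GPL";
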
/* Even though this is the same as struct rqspinlock, we need to emit a
 * distinct type in BTF for BPF programs.
 */
struct bpf_res_spin_lock {
	u32 val;
};

struct qspinlock;
#ifdef CONFIG_QUEUED_SPINLOCKS
typedef struct qspinlock rqspinlock_t;
#else
typedef struct rqspinlock rqspinlock_t;
#endif

extern int resilient_tas_spin_lock(rqspinlock_t *lock);
#ifdef CONFIG_QUEUED_SPINLOCKS
extern int resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val);
#endif

#ifndef resilient_virt_spin_lock_enabled
static __always_inline bool resilient_virt_spin_lock_enabled(void)
{
	return false;
}
#endif

#ifndef resilient_virt_spin_lock
static __always_inline int resilient_virt_spin_lock(rqspinlock_t *lock)
{
	return 0;
}
#endif

/*
 * Default timeout for waiting loops is 0.25 seconds
 */
#define RES_DEF_TIMEOUT (NSEC_PER_SEC / 4)

rqspinlock: Add deadlock detection and recovery

While the timeout logic guarantees the waiter's forward progress, the
time until a stalling waiter unblocks can still be long. The default
timeout of 0.25 seconds can be excessively long for some use cases, and
custom timeouts may make recovery take even longer.

Introduce logic to detect common deadlock cases and recover more
quickly. This is done by dividing the time from entry into the locking
slow path until the timeout into intervals of 1 ms. After each interval
elapses, deadlock detection is performed, while the lock word is also
polled so that we can quickly break out of the detection logic and
proceed with lock acquisition.

A 'held_locks' table is maintained per CPU, where the entry at the
bottom denotes a lock being waited for or already taken. Entries before
it denote locks that are already held. The current CPU's table can thus
be inspected to detect AA deadlocks, while the tables of other CPUs can
be inspected to discover ABBA situations. When a matching entry for the
lock being taken on the current CPU is found on some other CPU, a
deadlock is detected. This scan can take a long time, so the lock word
is polled in each loop iteration (the is_lock_released check) to allow
preempting detection and proceeding with lock acquisition.

We set the 'spin' member of the rqspinlock_timeout struct to 0 to
trigger deadlock checks immediately and recover faster.

Note: Extending the lock word by 4 bytes to record the owner CPU would
allow faster ABBA detection, since it is typically the owner that
participates in an ABBA situation. However, to keep compatibility with
existing lock words in the kernel (struct qspinlock), and given that
deadlocks are rare events triggered by bugs, we favor compatibility over
faster detection.

The release_held_lock_entry function requires an smp_wmb, while the
release store on unlock provides the necessary ordering for us. Add
comments to document the subtleties of why this is correct. Stores may
still be reordered, but in the context of the deadlock detection
algorithm a release barrier is sufficient, and needn't be stronger for
unlock's case.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20250316040541.108729-13-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
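
To make the table walk concrete, below is a minimal sketch of the AA
check implied by the description above. It relies on the rqspinlock_held
structures declared further down in this file; the helper name and its
exact placement in the slow path are illustrative assumptions, not the
in-tree implementation. The ABBA check follows the same pattern, except
that it scans the tables of other CPUs and cross-references them against
the locks held locally.

/*
 * Sketch only: report an AA deadlock if the lock being acquired already
 * appears among the entries this CPU recorded before the current attempt
 * (the bottom entry of the table is the lock being acquired right now).
 */
static bool sketch_check_aa_deadlock(void *lock)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
	int cnt = min(rqh->cnt, RES_NR_HELD);
	int i;

	for (i = 0; i < cnt - 1; i++) {
		if (rqh->locks[i] == lock)
			return true;	/* lock already held on this CPU: AA */
	}
	return false;
}
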
/*
 * Choose 31 as it makes rqspinlock_held cacheline-aligned.
 */
#define RES_NR_HELD 31

struct rqspinlock_held {
	int cnt;
	void *locks[RES_NR_HELD];
};

DECLARE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);

static __always_inline void grab_held_lock_entry(void *lock)
{
	int cnt = this_cpu_inc_return(rqspinlock_held_locks.cnt);

	if (unlikely(cnt > RES_NR_HELD)) {
		/* Still keep the inc so we decrement later. */
		return;
	}

	/*
	 * Implied compiler barrier in per-CPU operations; otherwise we can have
	 * the compiler reorder inc with write to table, allowing interrupts to
	 * overwrite and erase our write to the table (as on interrupt exit it
	 * will be reset to NULL).
	 *
	 * It is fine for cnt inc to be reordered wrt remote readers though,
	 * they won't observe our entry until the cnt update is visible, that's
	 * all.
	 */
	this_cpu_write(rqspinlock_held_locks.locks[cnt - 1], lock);
}

/*
 * We simply don't support out-of-order unlocks, and keep the logic simple here.
 * The verifier prevents BPF programs from unlocking out-of-order, and the same
 * holds for in-kernel users.
 *
 * It is possible to run into misdetection scenarios of AA deadlocks on the same
 * CPU, and missed ABBA deadlocks on remote CPUs if this function pops entries
 * out of order (due to lock A, lock B, unlock A, unlock B) pattern. The correct
 * logic to preserve right entries in the table would be to walk the array of
 * held locks and swap and clear out-of-order entries, but that's too
 * complicated and we don't have a compelling use case for out of order unlocking.
 */
static __always_inline void release_held_lock_entry(void)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

	if (unlikely(rqh->cnt > RES_NR_HELD))
		goto dec;
	WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
dec:
	/*
	 * Reordering of clearing above with inc and its write in
	 * grab_held_lock_entry that came before us (in same acquisition
	 * attempt) is ok, we either see a valid entry or NULL when it's
	 * visible.
	 *
	 * But this helper is invoked when we unwind upon failing to acquire the
	 * lock. Unlike the unlock path which constitutes a release store after
	 * we clear the entry, we need to emit a write barrier here. Otherwise,
	 * we may have a situation as follows:
	 *
	 * <error> for lock B
	 * release_held_lock_entry
	 *
	 * try_cmpxchg_acquire for lock A
	 * grab_held_lock_entry
	 *
	 * Lack of any ordering means reordering may occur such that dec, inc
	 * are done before entry is overwritten. This permits a remote lock
	 * holder of lock B (which this CPU failed to acquire) to now observe it
	 * as being attempted on this CPU, and may lead to misdetection (if this
	 * CPU holds a lock it is attempting to acquire, leading to false ABBA
	 * diagnosis).
	 *
	 * In case of unlock, we will always do a release on the lock word after
	 * releasing the entry, ensuring that other CPUs cannot hold the lock
	 * (and make conclusions about deadlocks) until the entry has been
	 * cleared on the local CPU, preventing any anomalies. Reordering is
	 * still possible there, but a remote CPU cannot observe a lock in our
	 * table which it is already holding, since visibility entails our
	 * release store for the said lock has not retired.
	 *
	 * In theory we don't have a problem if the dec and WRITE_ONCE above get
	 * reordered with each other, we either notice an empty NULL entry on
	 * top (if dec succeeds WRITE_ONCE), or a potentially stale entry which
	 * cannot be observed (if dec precedes WRITE_ONCE).
	 *
	 * Emit the write barrier _before_ the dec, this permits dec-inc
	 * reordering but that is harmless as we'd have new entry set to NULL
	 * already, i.e. they cannot precede the NULL store above.
	 */
	smp_wmb();
	this_cpu_dec(rqspinlock_held_locks.cnt);
}

#ifdef CONFIG_QUEUED_SPINLOCKS

/**
 * res_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 *
 * Return:
 * * 0		- Lock was acquired successfully.
 * * -EDEADLK	- Lock acquisition failed because of AA/ABBA deadlock.
 * * -ETIMEDOUT - Lock acquisition failed because of timeout.
 */
static __always_inline int res_spin_lock(rqspinlock_t *lock)
{
	int val = 0;

	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) {
		grab_held_lock_entry(lock);
		return 0;
	}
	return resilient_queued_spin_lock_slowpath(lock, val);
}

#else

#define res_spin_lock(lock) resilient_tas_spin_lock(lock)

#endif /* CONFIG_QUEUED_SPINLOCKS */

static __always_inline void res_spin_unlock(rqspinlock_t *lock)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

	if (unlikely(rqh->cnt > RES_NR_HELD))
		goto unlock;
	WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
unlock:
	/*
	 * Release barrier, ensures correct ordering. See release_held_lock_entry
	 * for details. Perform release store instead of queued_spin_unlock,
	 * since we use this function for test-and-set fallback as well. When we
	 * have CONFIG_QUEUED_SPINLOCKS=n, we clear the full 4-byte lockword.
	 *
	 * Like release_held_lock_entry, we can do the release before the dec.
	 * We simply care about not seeing the 'lock' in our table from a remote
	 * CPU once the lock has been released, which doesn't rely on the dec.
	 *
	 * Unlike smp_wmb(), release is not a two way fence, hence it is
	 * possible for an inc to move up and reorder with our clearing of the
	 * entry. This isn't a problem however, as for a misdiagnosis of ABBA,
	 * the remote CPU needs to hold this lock, which won't be released until
	 * the store below is done, which would ensure the entry is overwritten
	 * to NULL, etc.
	 */
	smp_store_release(&lock->locked, 0);
	this_cpu_dec(rqspinlock_held_locks.cnt);
}

#ifdef CONFIG_QUEUED_SPINLOCKS
#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; })
#else
#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t){0}; })
#endif

#define raw_res_spin_lock(lock)                    \
	({                                         \
		int __ret;                         \
		preempt_disable();                 \
		__ret = res_spin_lock(lock);       \
		if (__ret)                         \
			preempt_enable();          \
		__ret;                             \
	})

#define raw_res_spin_unlock(lock) ({ res_spin_unlock(lock); preempt_enable(); })

#define raw_res_spin_lock_irqsave(lock, flags)     \
	({                                         \
		int __ret;                         \
		local_irq_save(flags);             \
		__ret = raw_res_spin_lock(lock);   \
		if (__ret)                         \
			local_irq_restore(flags);  \
		__ret;                             \
	})

#define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); })
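
For in-kernel users, the following is a hedged usage sketch of the API
above; the structure and function names are invented for illustration.
Acquisition can fail, so the return value must be checked, and the
matching unlock/irqrestore must only run when the lock was taken.

/* Illustrative only: a hypothetical structure protected by an rqspinlock. */
struct demo_bucket {
	rqspinlock_t lock;	/* initialized once with raw_res_spin_lock_init() */
	u64 hits;
};

static int demo_update(struct demo_bucket *b)
{
	unsigned long flags;
	int ret;

	ret = raw_res_spin_lock_irqsave(&b->lock, flags);
	if (ret)
		return ret;	/* -EDEADLK or -ETIMEDOUT; do not touch b->hits */

	b->hits++;
	raw_res_spin_unlock_irqrestore(&b->lock, flags);
	return 0;
}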

#endif /* __ASM_GENERIC_RQSPINLOCK_H */