Merge tag 'bpf_res_spin_lock' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Pull bpf resilient spinlock support from Alexei Starovoitov:
 "This patch set introduces Resilient Queued Spin Lock (or rqspinlock
  with res_spin_lock() and res_spin_unlock() APIs).

  This is a qspinlock variant which recovers the kernel from a stalled
  state when the lock acquisition path cannot make forward progress.
  This can occur when a lock acquisition attempt enters a deadlock
  situation (e.g. AA, or ABBA), or more generally, when the owner of the
  lock (which we’re trying to acquire) isn’t making forward progress.
  Deadlock detection is the main mechanism used to provide instant
  recovery, with the timeout mechanism acting as a final line of
  defense. Detection is triggered immediately when beginning the waiting
  loop of a lock slow path.

  Additionally, BPF programs attached to different parts of the kernel
  can introduce new control flow into the kernel, which increases the
  likelihood of deadlocks in code not written to handle reentrancy.
  There have been multiple syzbot reports surfacing deadlocks in
  internal kernel code due to the diverse ways in which BPF programs can
  be attached to different parts of the kernel. By switching the BPF
  subsystem’s lock usage to rqspinlock, all of these issues are
  mitigated at runtime.

  This spin lock implementation allows BPF maps to become safer and
  remove mechanisms that have fallen short in assuring safety when
  nesting programs in arbitrary ways in the same context or across
  different contexts.

  We run benchmarks that stress locking scalability and perform
  comparison against the baseline (qspinlock). For the rqspinlock case,
  we replace the default qspinlock with it in the kernel, such that all
  spin locks in the kernel use the rqspinlock slow path. As such,
  benchmarks that stress kernel spin locks end up exercising rqspinlock.

  More details in the cover letter in commit 6ffb901 ("Merge branch
  'resilient-queued-spin-lock'")"
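Because acquisition of a resilient lock can fail (on a detected deadlock or on timeout), callers must check the return value, unlike with a plain spin lock. The following is a minimal usage sketch built around the res_spin_lock()/res_spin_unlock() APIs named above; the lock declaration, error codes (-EDEADLK/-ETIMEDOUT), and helper names here are illustrative assumptions, not lines taken from the series.

#include <asm-generic/rqspinlock.h>

static rqspinlock_t demo_lock;	/* hypothetical lock guarding some map state */

static int demo_update(void)
{
	int ret;

	/*
	 * res_spin_lock() returns 0 once the lock is held. If deadlock
	 * detection or the timeout fallback fires, it returns an error
	 * (assumed -EDEADLK or -ETIMEDOUT) instead of spinning forever,
	 * and the caller must bail out without touching protected state.
	 */
	ret = res_spin_lock(&demo_lock);
	if (ret)
		return ret;

	/* ... critical section ... */

	res_spin_unlock(&demo_lock);
	return 0;
}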

* tag 'bpf_res_spin_lock' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (24 commits)
  selftests/bpf: Add tests for rqspinlock
  bpf: Maintain FIFO property for rqspinlock unlock
  bpf: Implement verifier support for rqspinlock
  bpf: Introduce rqspinlock kfuncs
  bpf: Convert lpm_trie.c to rqspinlock
  bpf: Convert percpu_freelist.c to rqspinlock
  bpf: Convert hashtab.c to rqspinlock
  rqspinlock: Add locktorture support
  rqspinlock: Add entry to Makefile, MAINTAINERS
  rqspinlock: Add macros for rqspinlock usage
  rqspinlock: Add basic support for CONFIG_PARAVIRT
  rqspinlock: Add a test-and-set fallback
  rqspinlock: Add deadlock detection and recovery
  rqspinlock: Protect waiters in trylock fallback from stalls
  rqspinlock: Protect waiters in queue from stalls
  rqspinlock: Protect pending bit owners from stalls
  rqspinlock: Hardcode cond_acquire loops for arm64
  rqspinlock: Add support for timeouts
  rqspinlock: Drop PV and virtualization support
  rqspinlock: Add rqspinlock.h header
  ...
Linus Torvalds committed Mar 30, 2025
2 parents fa593d0 + 6ffb901 commit 494e7fe
Showing 27 changed files with 2,312 additions and 417 deletions.
2 changes: 2 additions & 0 deletions MAINTAINERS
@@ -4361,6 +4361,8 @@ F: include/uapi/linux/filter.h
F: kernel/bpf/
F: kernel/trace/bpf_trace.c
F: lib/buildid.c
F: arch/*/include/asm/rqspinlock.h
F: include/asm-generic/rqspinlock.h
F: lib/test_bpf.c
F: net/bpf/
F: net/core/filter.c
93 changes: 93 additions & 0 deletions arch/arm64/include/asm/rqspinlock.h
@@ -0,0 +1,93 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_RQSPINLOCK_H
#define _ASM_RQSPINLOCK_H

#include <asm/barrier.h>

/*
 * Hardcode res_smp_cond_load_acquire implementations for arm64 to a custom
 * version based on [0]. In rqspinlock code, our conditional expression involves
 * checking the value _and_ additionally a timeout. However, on arm64, the
 * WFE-based implementation may never spin again if no stores occur to the
 * locked byte in the lock word. As such, we may be stuck forever if
 * event-stream based unblocking is not available on the platform for WFE spin
 * loops (arch_timer_evtstrm_available).
 *
 * Once support for smp_cond_load_acquire_timewait [0] lands, we can drop this
 * copy-paste.
 *
 * While we rely on the implementation to amortize the cost of sampling
 * cond_expr for us, that does not happen when event stream support is
 * unavailable; in that case it is the time_expr check that is amortized. This
 * is not the common case, and it would be difficult to fit our logic in the
 * time_expr_ns >= time_limit_ns comparison, hence just let it be. In case of
 * event-stream, the loop is woken up at microsecond granularity.
 *
 * [0]: https://lore.kernel.org/lkml/20250203214911.898276-1-ankur.a.arora@oracle.com
 */

#ifndef smp_cond_load_acquire_timewait

#define smp_cond_time_check_count 200

#define __smp_cond_load_relaxed_spinwait(ptr, cond_expr, time_expr_ns,	\
					 time_limit_ns) ({		\
	typeof(ptr) __PTR = (ptr);					\
	__unqual_scalar_typeof(*ptr) VAL;				\
	unsigned int __count = 0;					\
	for (;;) {							\
		VAL = READ_ONCE(*__PTR);				\
		if (cond_expr)						\
			break;						\
		cpu_relax();						\
		if (__count++ < smp_cond_time_check_count)		\
			continue;					\
		if ((time_expr_ns) >= (time_limit_ns))			\
			break;						\
		__count = 0;						\
	}								\
	(typeof(*ptr))VAL;						\
})

#define __smp_cond_load_acquire_timewait(ptr, cond_expr,		\
					 time_expr_ns, time_limit_ns)	\
({									\
	typeof(ptr) __PTR = (ptr);					\
	__unqual_scalar_typeof(*ptr) VAL;				\
	for (;;) {							\
		VAL = smp_load_acquire(__PTR);				\
		if (cond_expr)						\
			break;						\
		__cmpwait_relaxed(__PTR, VAL);				\
		if ((time_expr_ns) >= (time_limit_ns))			\
			break;						\
	}								\
	(typeof(*ptr))VAL;						\
})

#define smp_cond_load_acquire_timewait(ptr, cond_expr,			\
				       time_expr_ns, time_limit_ns)	\
({									\
	__unqual_scalar_typeof(*ptr) _val;				\
	int __wfe = arch_timer_evtstrm_available();			\
									\
	if (likely(__wfe)) {						\
		_val = __smp_cond_load_acquire_timewait(ptr, cond_expr,	\
							time_expr_ns,	\
							time_limit_ns);	\
	} else {							\
		_val = __smp_cond_load_relaxed_spinwait(ptr, cond_expr,	\
							time_expr_ns,	\
							time_limit_ns);	\
		smp_acquire__after_ctrl_dep();				\
	}								\
	(typeof(*ptr))_val;						\
})

#endif

#define res_smp_cond_load_acquire_timewait(v, c) smp_cond_load_acquire_timewait(v, c, 0, 1)

#include <asm-generic/rqspinlock.h>

#endif /* _ASM_RQSPINLOCK_H */
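To make the helper's parameters concrete, here is a hypothetical caller (not part of this series) that waits for a lock byte to read zero but gives up after a deadline; VAL is the name the macro binds to the most recently loaded value, and ktime_get_mono_fast_ns() is used only as an example time source.

/* Illustrative only: returns 0 if *lockp became 0, -ETIMEDOUT otherwise. */
static int demo_wait_unlocked(u8 *lockp, u64 timeout_ns)
{
	u64 deadline = ktime_get_mono_fast_ns() + timeout_ns;
	u8 val;

	val = smp_cond_load_acquire_timewait(lockp, VAL == 0,
					     ktime_get_mono_fast_ns(),
					     deadline);
	return val == 0 ? 0 : -ETIMEDOUT;
}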
33 changes: 33 additions & 0 deletions arch/x86/include/asm/rqspinlock.h
@@ -0,0 +1,33 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_RQSPINLOCK_H
#define _ASM_X86_RQSPINLOCK_H

#include <asm/paravirt.h>

#ifdef CONFIG_PARAVIRT
DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key);

#define resilient_virt_spin_lock_enabled resilient_virt_spin_lock_enabled
static __always_inline bool resilient_virt_spin_lock_enabled(void)
{
	return static_branch_likely(&virt_spin_lock_key);
}

#ifdef CONFIG_QUEUED_SPINLOCKS
typedef struct qspinlock rqspinlock_t;
#else
typedef struct rqspinlock rqspinlock_t;
#endif
extern int resilient_tas_spin_lock(rqspinlock_t *lock);

#define resilient_virt_spin_lock resilient_virt_spin_lock
static inline int resilient_virt_spin_lock(rqspinlock_t *lock)
{
	return resilient_tas_spin_lock(lock);
}

#endif /* CONFIG_PARAVIRT */

#include <asm-generic/rqspinlock.h>

#endif /* _ASM_X86_RQSPINLOCK_H */
1 change: 1 addition & 0 deletions include/asm-generic/Kbuild
@@ -45,6 +45,7 @@ mandatory-y += pci.h
mandatory-y += percpu.h
mandatory-y += pgalloc.h
mandatory-y += preempt.h
mandatory-y += rqspinlock.h
mandatory-y += runtime-const.h
mandatory-y += rwonce.h
mandatory-y += sections.h
6 changes: 6 additions & 0 deletions include/asm-generic/mcs_spinlock.h
@@ -1,6 +1,12 @@
#ifndef __ASM_MCS_SPINLOCK_H
#define __ASM_MCS_SPINLOCK_H

struct mcs_spinlock {
	struct mcs_spinlock *next;
	int locked; /* 1 if lock acquired */
	int count;  /* nesting count, see qspinlock.c */
};

/*
* Architectures can define their own:
*
