Skip to content

Commit

Permalink
bpf: Convert ringbuf map to rqspinlock
Browse files Browse the repository at this point in the history
Convert the raw spinlock used by BPF ringbuf to rqspinlock. Currently,
we have an open syzbot report of a potential deadlock. In addition, the
ringbuf can fail to reserve spuriously under contention from NMI
context.

It is potentially attractive to enable unconstrained usage (incl. NMIs)
while ensuring no deadlocks manifest at runtime, perform the conversion
to rqspinlock to achieve this.

This change was benchmarked for BPF ringbuf's multi-producer contention
case on an Intel Sapphire Rapids server, with hyperthreading disabled
and performance governor turned on. 5 warm up runs were done for each
case before obtaining the results.

Before (raw_spinlock_t):

Ringbuf, multi-producer contention
==================================
rb-libbpf nr_prod 1  11.440 ± 0.019M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 2  2.706 ± 0.010M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 3  3.130 ± 0.004M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 4  2.472 ± 0.003M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 8  2.352 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 12 2.813 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 16 1.988 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 20 2.245 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 24 2.148 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 28 2.190 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 32 2.490 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 36 2.180 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 40 2.201 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 44 2.226 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 48 2.164 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 52 1.874 ± 0.001M/s (drops 0.000 ± 0.000M/s)

After (rqspinlock_t):

Ringbuf, multi-producer contention
==================================
rb-libbpf nr_prod 1  11.078 ± 0.019M/s (drops 0.000 ± 0.000M/s) (-3.16%)
rb-libbpf nr_prod 2  2.801 ± 0.014M/s (drops 0.000 ± 0.000M/s) (3.51%)
rb-libbpf nr_prod 3  3.454 ± 0.005M/s (drops 0.000 ± 0.000M/s) (10.35%)
rb-libbpf nr_prod 4  2.567 ± 0.002M/s (drops 0.000 ± 0.000M/s) (3.84%)
rb-libbpf nr_prod 8  2.468 ± 0.001M/s (drops 0.000 ± 0.000M/s) (4.93%)
rb-libbpf nr_prod 12 2.510 ± 0.001M/s (drops 0.000 ± 0.000M/s) (-10.77%)
rb-libbpf nr_prod 16 2.075 ± 0.001M/s (drops 0.000 ± 0.000M/s) (4.38%)
rb-libbpf nr_prod 20 2.640 ± 0.001M/s (drops 0.000 ± 0.000M/s) (17.59%)
rb-libbpf nr_prod 24 2.092 ± 0.001M/s (drops 0.000 ± 0.000M/s) (-2.61%)
rb-libbpf nr_prod 28 2.426 ± 0.005M/s (drops 0.000 ± 0.000M/s) (10.78%)
rb-libbpf nr_prod 32 2.331 ± 0.004M/s (drops 0.000 ± 0.000M/s) (-6.39%)
rb-libbpf nr_prod 36 2.306 ± 0.003M/s (drops 0.000 ± 0.000M/s) (5.78%)
rb-libbpf nr_prod 40 2.178 ± 0.002M/s (drops 0.000 ± 0.000M/s) (-1.04%)
rb-libbpf nr_prod 44 2.293 ± 0.001M/s (drops 0.000 ± 0.000M/s) (3.01%)
rb-libbpf nr_prod 48 2.022 ± 0.001M/s (drops 0.000 ± 0.000M/s) (-6.56%)
rb-libbpf nr_prod 52 1.809 ± 0.001M/s (drops 0.000 ± 0.000M/s) (-3.47%)

There's a fair amount of noise in the benchmark, with numbers on reruns
going up and down by 10%, so all changes are in the range of this
disturbance, and we see no major regressions.

Reported-by: syzbot+850aaf14624dc0c6d366@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/0000000000004aa700061379547e@google.com
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20250411101759.4061366-1-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
  • Loading branch information
Kumar Kartikeya Dwivedi authored and Alexei Starovoitov committed Apr 11, 2025
1 parent 2f41503 commit a650d38
Showing 1 changed file with 7 additions and 10 deletions.
17 changes: 7 additions & 10 deletions kernel/bpf/ringbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <linux/kmemleak.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
#include <asm/rqspinlock.h>

#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)

Expand All @@ -29,7 +30,7 @@ struct bpf_ringbuf {
u64 mask;
struct page **pages;
int nr_pages;
raw_spinlock_t spinlock ____cacheline_aligned_in_smp;
rqspinlock_t spinlock ____cacheline_aligned_in_smp;
/* For user-space producer ring buffers, an atomic_t busy bit is used
* to synchronize access to the ring buffers in the kernel, rather than
* the spinlock that is used for kernel-producer ring buffers. This is
Expand Down Expand Up @@ -173,7 +174,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
if (!rb)
return NULL;

raw_spin_lock_init(&rb->spinlock);
raw_res_spin_lock_init(&rb->spinlock);
atomic_set(&rb->busy, 0);
init_waitqueue_head(&rb->waitq);
init_irq_work(&rb->work, bpf_ringbuf_notify);
Expand Down Expand Up @@ -416,12 +417,8 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)

cons_pos = smp_load_acquire(&rb->consumer_pos);

if (in_nmi()) {
if (!raw_spin_trylock_irqsave(&rb->spinlock, flags))
return NULL;
} else {
raw_spin_lock_irqsave(&rb->spinlock, flags);
}
if (raw_res_spin_lock_irqsave(&rb->spinlock, flags))
return NULL;

pend_pos = rb->pending_pos;
prod_pos = rb->producer_pos;
Expand All @@ -446,7 +443,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
*/
if (new_prod_pos - cons_pos > rb->mask ||
new_prod_pos - pend_pos > rb->mask) {
raw_spin_unlock_irqrestore(&rb->spinlock, flags);
raw_res_spin_unlock_irqrestore(&rb->spinlock, flags);
return NULL;
}

Expand All @@ -458,7 +455,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
/* pairs with consumer's smp_load_acquire() */
smp_store_release(&rb->producer_pos, new_prod_pos);

raw_spin_unlock_irqrestore(&rb->spinlock, flags);
raw_res_spin_unlock_irqrestore(&rb->spinlock, flags);

return (void *)hdr + BPF_RINGBUF_HDR_SZ;
}
Expand Down

0 comments on commit a650d38

Please sign in to comment.