Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Daniel Borkmann says:

====================
pull-request: bpf-next 2024-01-05

We've added 40 non-merge commits during the last 2 days, which contain
a total of 73 files changed, 1526 insertions(+), 951 deletions(-).

The main changes are:

1) Fix a memory leak when streaming AF_UNIX sockets were inserted
   into multiple sockmap slots/maps, from John Fastabend.

2) Fix gotol in s390 BPF JIT with large offsets, from Ilya Leoshkevich.

3) Fix reattachment branch in bpf_tracing_prog_attach() and reject
   the request if there is no valid attach_btf, from Jiri Olsa.

4) Remove deprecated bpfilter kernel leftovers given the project
   is developed in user space (https://github.com/facebook/bpfilter),
   from Quentin Deslandes.

5) Relax tracing BPF program recursive attach rules given right now
   it is not possible to create tracing program call cycles,
   from Dmitrii Dolgov.

6) Fix excessive memory consumption for the bpf_global_percpu_ma
   for systems with a large number of CPUs, from Yonghong Song.

7) Small x86 BPF JIT cleanup to reuse emit_nops instead of open-coding
   memcpy of x86_nops, from Leon Hwang.

8) Follow-up for libbpf to support __arg_ctx global function argument tag
   semantics to complement the merged kernel side, from Andrii Nakryiko.

9) Introduce "volatile compare" macros for BPF selftests in order
   to make the latter more robust against compiler optimization,
   from Alexei Starovoitov.

10) Small simplification in verifier's size checking of helper accesses
    along with additional selftests, from Andrei Matei.

* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (40 commits)
  selftests/bpf: Test re-attachment fix for bpf_tracing_prog_attach
  bpf: Fix re-attachment branch in bpf_tracing_prog_attach
  selftests/bpf: Add test for recursive attachment of tracing progs
  bpf: Relax tracing prog recursive attach rules
  bpf, x86: Use emit_nops to replace memcpy x86_nops
  selftests/bpf: Test gotol with large offsets
  selftests/bpf: Double the size of test_loader log
  s390/bpf: Fix gotol with large offsets
  bpfilter: remove bpfilter
  bpf: Remove unnecessary cpu == 0 check in memalloc
  selftests/bpf: add __arg_ctx BTF rewrite test
  selftests/bpf: add arg:ctx cases to test_global_funcs tests
  libbpf: implement __arg_ctx fallback logic
  libbpf: move BTF loading step after relocation step
  libbpf: move exception callbacks assignment logic into relocation step
  libbpf: use stable map placeholder FDs
  libbpf: don't rely on map->fd as an indicator of map being created
  libbpf: use explicit map reuse flag to skip map creation steps
  libbpf: make uniform use of btf__fd() accessor inside libbpf
  selftests/bpf: Add a selftest with > 512-byte percpu allocation size
  ...
====================

Link: https://lore.kernel.org/r/20240105170105.21070-1-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
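
As a concrete illustration of item 8 above (the __arg_ctx tag), here is a minimal sketch of a BPF program using it; this is not code from the series, the handle_ctx/probe names and the attach point are made up, and __arg_ctx is the btf_decl_tag("arg:ctx") attribute provided by libbpf's helper headers. Tagging a global subprog argument with it lets that argument be treated as the program context.

/* Illustration only, not part of this pull request. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* Global (non-static) subprog whose argument is marked as the context. */
__noinline int handle_ctx(void *ctx __arg_ctx)
{
        return ctx ? 0 : 1;
}

SEC("kprobe/do_nanosleep")
int probe(struct pt_regs *ctx)
{
        return handle_ctx(ctx);
}

char _license[] SEC("license") = "GPL";

Item 9's "volatile compare" helpers keep a comparison visible to the verifier. Below is a rough, simplified sketch of the idea, assuming clang targeting BPF; the real selftest macros (bpf_cmp_likely()/bpf_cmp_unlikely() in bpf_experimental.h) are more elaborate and handle signedness and operand kinds. The asm goto emits a real conditional jump on the given operands, so the compiler cannot constant-fold or reorder the check away.

/* Simplified sketch of a "volatile compare" macro; illustration only. */
#define bpf_cmp_example(LHS, OP, RHS)                                   \
        ({                                                              \
                __label__ l_true;                                       \
                bool ret = true;        /* value if branch is taken */  \
                asm volatile goto("if %[lhs] " #OP " %[rhs] goto %l[l_true]" \
                                  : : [lhs] "r"(LHS), [rhs] "ri"(RHS)   \
                                  : : l_true);                          \
                ret = false;            /* fall-through: condition false */ \
l_true:                                                                 \
                ret;                                                    \
        })

/* Usage: if (bpf_cmp_example(len, <, 100)) { ... } */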
Jakub Kicinski committed on Jan 6, 2024
commit 8158a50, 2 parents: 795fd93 + 5fe4ee6
Showing 73 changed files with 1,526 additions and 951 deletions.
1 change: 0 additions & 1 deletion arch/loongarch/configs/loongson3_defconfig
@@ -276,7 +276,6 @@ CONFIG_BRIDGE_EBT_T_NAT=m
CONFIG_BRIDGE_EBT_ARP=m
CONFIG_BRIDGE_EBT_IP=m
CONFIG_BRIDGE_EBT_IP6=m
CONFIG_BPFILTER=y
CONFIG_IP_SCTP=m
CONFIG_RDS=y
CONFIG_L2TP=m
2 changes: 1 addition & 1 deletion arch/s390/net/bpf_jit_comp.c
@@ -779,7 +779,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
int i, bool extra_pass, u32 stack_depth)
{
struct bpf_insn *insn = &fp->insnsi[i];
s16 branch_oc_off = insn->off;
s32 branch_oc_off = insn->off;
u32 dst_reg = insn->dst_reg;
u32 src_reg = insn->src_reg;
int last, insn_count = 1;
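A gloss on the one-line s390 change above (my explanation, not part of the diff): gotol (BPF_JMP32 | BPF_JA) encodes its jump displacement in the 32-bit imm field rather than the 16-bit off field, so the variable tracking the branch target must be wider than s16. The snippet below only illustrates the truncation an s16 would cause:

        /* Illustration only: s16/s32 are the kernel's 16/32-bit signed types. */
        int imm = 0x18000;      /* gotol displacement of 98304 instructions */
        s16 off16 = imm;        /* truncates to -32768: wrong branch target */
        s32 off32 = imm;        /* keeps 98304 */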
47 changes: 22 additions & 25 deletions arch/x86/net/bpf_jit_comp.c
@@ -307,6 +307,25 @@ static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
*pprog = prog;
}

static void emit_nops(u8 **pprog, int len)
{
u8 *prog = *pprog;
int i, noplen;

while (len > 0) {
noplen = len;

if (noplen > ASM_NOP_MAX)
noplen = ASM_NOP_MAX;

for (i = 0; i < noplen; i++)
EMIT1(x86_nops[noplen][i]);
len -= noplen;
}

*pprog = prog;
}

/*
* Emit the various CFI preambles, see asm/cfi.h and the comments about FineIBT
* in arch/x86/kernel/alternative.c
@@ -385,8 +404,7 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
/* BPF trampoline can be made to work without these nops,
* but let's waste 5 bytes for now and optimize later
*/
memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
prog += X86_PATCH_SIZE;
emit_nops(&prog, X86_PATCH_SIZE);
if (!ebpf_from_cbpf) {
if (tail_call_reachable && !is_subprog)
/* When it's the entry of the whole tailcall context,
@@ -692,8 +710,7 @@ static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
if (stack_depth)
EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8));

memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
prog += X86_PATCH_SIZE;
emit_nops(&prog, X86_PATCH_SIZE);

/* out: */
ctx->tail_call_direct_label = prog - start;
@@ -1055,25 +1072,6 @@ static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt,
}
}

static void emit_nops(u8 **pprog, int len)
{
u8 *prog = *pprog;
int i, noplen;

while (len > 0) {
noplen = len;

if (noplen > ASM_NOP_MAX)
noplen = ASM_NOP_MAX;

for (i = 0; i < noplen; i++)
EMIT1(x86_nops[noplen][i]);
len -= noplen;
}

*pprog = prog;
}

/* emit the 3-byte VEX prefix
*
* r: same as rex.r, extra bit for ModRM reg field
@@ -2700,8 +2698,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
/* remember return value in a stack for bpf prog to access */
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
im->ip_after_call = image + (prog - (u8 *)rw_image);
memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
prog += X86_PATCH_SIZE;
emit_nops(&prog, X86_PATCH_SIZE);
}

if (fmod_ret->nr_links) {
1 change: 1 addition & 0 deletions include/linux/bpf.h
@@ -1449,6 +1449,7 @@ struct bpf_prog_aux {
bool dev_bound; /* Program is bound to the netdev. */
bool offload_requested; /* Program is bound and offloaded to the netdev. */
bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
bool attach_tracing_prog; /* true if tracing another tracing program */
bool func_proto_unreliable;
bool sleepable;
bool tail_call_reachable;
8 changes: 8 additions & 0 deletions include/linux/bpf_mem_alloc.h
@@ -11,6 +11,7 @@ struct bpf_mem_caches;
struct bpf_mem_alloc {
struct bpf_mem_caches __percpu *caches;
struct bpf_mem_cache __percpu *cache;
struct obj_cgroup *objcg;
bool percpu;
struct work_struct work;
};
@@ -21,8 +22,15 @@ struct bpf_mem_alloc {
* 'size = 0' is for bpf_mem_alloc which manages many fixed-size objects.
* Alloc and free are done with bpf_mem_{alloc,free}() and the size of
* the returned object is given by the size argument of bpf_mem_alloc().
* If percpu equals true, error will be returned in order to avoid
* large memory consumption and the below bpf_mem_alloc_percpu_unit_init()
* should be used to do on-demand per-cpu allocation for each size.
*/
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu);
/* Initialize a non-fix-size percpu memory allocator */
int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg);
/* The percpu allocation with a specific unit size. */
int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size);
void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);

/* kmalloc/kfree equivalent: */
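A hedged sketch of how the new two-step API declared above is meant to be used; this is illustrative only, example_percpu_ma/example_setup are made-up names, and passing a NULL objcg is an assumption made for brevity:

#include <linux/bpf_mem_alloc.h>

static struct bpf_mem_alloc example_percpu_ma;  /* hypothetical instance */

static int example_setup(void)
{
        int err;

        /* Step 1: set up the allocator shell; no per-size caches are
         * prefilled yet, so the memory cost stays near zero even on
         * machines with many CPUs.
         */
        err = bpf_mem_alloc_percpu_init(&example_percpu_ma, NULL);
        if (err)
                return err;

        /* Step 2: initialize only the unit size actually needed (say 64
         * bytes) on demand, instead of every size on every CPU up front.
         */
        return bpf_mem_alloc_percpu_unit_init(&example_percpu_ma, 64);
}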
24 changes: 0 additions & 24 deletions include/linux/bpfilter.h

This file was deleted.

5 changes: 5 additions & 0 deletions include/linux/skmsg.h
@@ -100,6 +100,11 @@ struct sk_psock {
void (*saved_close)(struct sock *sk, long timeout);
void (*saved_write_space)(struct sock *sk);
void (*saved_data_ready)(struct sock *sk);
/* psock_update_sk_prot may be called with restore=false many times
* so the handler must be safe for this case. It will be called
* exactly once with restore=true when the psock is being destroyed
* and psock refcnt is zero, but before an RCU grace period.
*/
int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock,
bool restore);
struct proto *sk_proto;
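The comment added above documents a contract rather than code, so here is a schematic sketch (not the actual af_unix or TCP handler) of a psock_update_sk_prot implementation that tolerates repeated restore=false calls, the situation the AF_UNIX sockmap leak fix in item 1 had to handle; example_psock_proto and the function name are hypothetical:

#include <linux/skmsg.h>

static struct proto example_psock_proto;        /* hypothetical proto */

static int example_update_sk_prot(struct sock *sk, struct sk_psock *psock,
                                  bool restore)
{
        if (restore) {
                /* called exactly once, when the psock is being destroyed */
                sk->sk_prot = psock->sk_proto;
                return 0;
        }

        /* may be called many times, e.g. when the socket sits in several
         * sockmap slots/maps, so save the original proto only once
         */
        if (!psock->sk_proto)
                psock->sk_proto = sk->sk_prot;
        sk->sk_prot = &example_psock_proto;
        return 0;
}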
21 changes: 0 additions & 21 deletions include/uapi/linux/bpfilter.h

This file was deleted.

93 changes: 81 additions & 12 deletions kernel/bpf/memalloc.c
@@ -121,6 +121,8 @@ struct bpf_mem_caches {
struct bpf_mem_cache cache[NUM_CACHES];
};

static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};

static struct llist_node notrace *__llist_del_first(struct llist_head *head)
{
struct llist_node *entry, *next;
@@ -462,11 +464,17 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c)
* consume ~ 11 Kbyte per cpu.
* Typical case will be between 11K and 116K closer to 11K.
* bpf progs can and should share bpf_mem_cache when possible.
*
* Percpu allocation is typically rare. To avoid potential unnecessary large
* memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1.
*/
static void init_refill_work(struct bpf_mem_cache *c)
{
init_irq_work(&c->refill_work, bpf_mem_refill);
if (c->unit_size <= 256) {
if (c->percpu_size) {
c->low_watermark = 1;
c->high_watermark = 3;
} else if (c->unit_size <= 256) {
c->low_watermark = 32;
c->high_watermark = 96;
} else {
@@ -483,11 +491,16 @@ static void init_refill_work(struct bpf_mem_cache *c)

static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
{
/* To avoid consuming memory assume that 1st run of bpf
* prog won't be doing more than 4 map_update_elem from
* irq disabled region
int cnt = 1;

/* To avoid consuming memory, for non-percpu allocation, assume that
* 1st run of bpf prog won't be doing more than 4 map_update_elem from
* irq disabled region if unit size is less than or equal to 256.
* For all other cases, let us just do one allocation.
*/
alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false);
if (!c->percpu_size && c->unit_size <= 256)
cnt = 4;
alloc_bulk(c, cnt, cpu_to_node(cpu), false);
}

/* When size != 0 bpf_mem_cache for each cpu.
@@ -499,12 +512,14 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
*/
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
{
static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
struct bpf_mem_caches *cc, __percpu *pcc;
struct bpf_mem_cache *c, __percpu *pc;
struct obj_cgroup *objcg = NULL;
int cpu, i, unit_size, percpu_size = 0;

if (percpu && size == 0)
return -EINVAL;

/* room for llist_node and per-cpu pointer */
if (percpu)
percpu_size = LLIST_NODE_SZ + sizeof(void *);
@@ -523,6 +538,8 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
if (memcg_bpf_enabled())
objcg = get_obj_cgroup_from_current();
#endif
ma->objcg = objcg;

for_each_possible_cpu(cpu) {
c = per_cpu_ptr(pc, cpu);
c->unit_size = unit_size;
@@ -542,6 +559,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
#ifdef CONFIG_MEMCG_KMEM
objcg = get_obj_cgroup_from_current();
#endif
ma->objcg = objcg;
for_each_possible_cpu(cpu) {
cc = per_cpu_ptr(pcc, cpu);
for (i = 0; i < NUM_CACHES; i++) {
@@ -560,6 +578,56 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
return 0;
}

int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg)
{
struct bpf_mem_caches __percpu *pcc;

pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL);
if (!pcc)
return -ENOMEM;

ma->caches = pcc;
ma->objcg = objcg;
ma->percpu = true;
return 0;
}

int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
{
struct bpf_mem_caches *cc, __percpu *pcc;
int cpu, i, unit_size, percpu_size;
struct obj_cgroup *objcg;
struct bpf_mem_cache *c;

i = bpf_mem_cache_idx(size);
if (i < 0)
return -EINVAL;

/* room for llist_node and per-cpu pointer */
percpu_size = LLIST_NODE_SZ + sizeof(void *);

unit_size = sizes[i];
objcg = ma->objcg;
pcc = ma->caches;

for_each_possible_cpu(cpu) {
cc = per_cpu_ptr(pcc, cpu);
c = &cc->cache[i];
if (c->unit_size)
break;

c->unit_size = unit_size;
c->objcg = objcg;
c->percpu_size = percpu_size;
c->tgt = c;

init_refill_work(c);
prefill_mem_cache(c, cpu);
}

return 0;
}

static void drain_mem_cache(struct bpf_mem_cache *c)
{
bool percpu = !!c->percpu_size;
@@ -691,9 +759,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
}
/* objcg is the same across cpus */
if (c->objcg)
obj_cgroup_put(c->objcg);
if (ma->objcg)
obj_cgroup_put(ma->objcg);
destroy_mem_alloc(ma, rcu_in_progress);
}
if (ma->caches) {
@@ -709,8 +776,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
}
}
if (c->objcg)
obj_cgroup_put(c->objcg);
if (ma->objcg)
obj_cgroup_put(ma->objcg);
destroy_mem_alloc(ma, rcu_in_progress);
}
}
@@ -833,7 +900,9 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
if (!size)
return NULL;

idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ);
if (!ma->percpu)
size += LLIST_NODE_SZ;
idx = bpf_mem_cache_idx(size);
if (idx < 0)
return NULL;

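A small worked example of what the final bpf_mem_alloc() hunk above changes (my arithmetic, not code from the commit). LLIST_NODE_SZ is sizeof(struct llist_node), 8 bytes on 64-bit; non-percpu objects embed that llist_node, so their bucket is picked from size + 8, while percpu objects keep the llist_node in a separate small wrapper (percpu_size = LLIST_NODE_SZ + sizeof(void *)), so the requested size alone can pick the bucket:

        /* Conceptual fragment using the sizes[] table shown above:
         * {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}
         */
        size_t req = 512;
        int idx_nonpercpu, idx_percpu;

        /* non-percpu: 512 + 8 = 520 bytes, rounds up to the 1024-byte bucket */
        idx_nonpercpu = bpf_mem_cache_idx(req + LLIST_NODE_SZ);

        /* percpu, after this change: 512 bytes stays in the 512-byte bucket */
        idx_percpu = bpf_mem_cache_idx(req);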
(The remaining file diffs are not shown.)
