Skip to content

Commit

Permalink
Merge branch 'bpf, mm: introduce cgroup.memory=nobpf'
Browse files Browse the repository at this point in the history
Yafang Shao says:

====================

The bpf memory accouting has some known problems in contianer
environment,

- The container memory usage is not consistent if there's pinned bpf
  program
  After the container restart, the leftover bpf programs won't account
  to the new generation, so the memory usage of the container is not
  consistent. This issue can be resolved by introducing selectable
  memcg, but we don't have an agreement on the solution yet. See also
  the discussions at https://lwn.net/Articles/905150/ .

- The leftover non-preallocated bpf map can't be limited
  The leftover bpf map will be reparented, and thus it will be limited by
  the parent, rather than the container itself. Furthermore, if the
  parent is destroyed, it be will limited by its parent's parent, and so
  on. It can also be resolved by introducing selectable memcg.

- The memory dynamically allocated in bpf prog is charged into root memcg
  only
  Nowdays the bpf prog can dynamically allocate memory, for example via
  bpf_obj_new(), but it only allocate from the global bpf_mem_alloc
  pool, so it will charge into root memcg only. That needs to be
  addressed by a new proposal.

So let's give the container user an option to disable bpf memory accouting.

The idea of "cgroup.memory=nobpf" is originally by Tejun[1].

[1]. https://lwn.net/ml/linux-mm/YxjOawzlgE458ezL@slm.duckdns.org/

Changes,
v1->v2:
- squash patches (Roman)
- commit log improvement in patch #2. (Johannes)
====================

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
  • Loading branch information
Alexei Starovoitov committed Feb 11, 2023
2 parents 7e2a9eb + bf39650 commit ab86cf3
Show file tree
Hide file tree
Showing 8 changed files with 75 additions and 11 deletions.
1 change: 1 addition & 0 deletions Documentation/admin-guide/kernel-parameters.txt
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,7 @@
Format: <string>
nosocket -- Disable socket memory accounting.
nokmem -- Disable kernel memory accounting.
nobpf -- Disable BPF memory accounting.

checkreqprot= [SELINUX] Set initial checkreqprot flag value.
Format: { "0" | "1" }
Expand Down
16 changes: 16 additions & 0 deletions include/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include <linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/static_call.h>
#include <linux/memcontrol.h>

struct bpf_verifier_env;
struct bpf_verifier_log;
Expand Down Expand Up @@ -1886,6 +1887,8 @@ struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
int node);
void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags);
void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
gfp_t flags);
void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
size_t align, gfp_t flags);
#else
Expand All @@ -1902,6 +1905,12 @@ bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
return kzalloc(size, flags);
}

static inline void *
bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, gfp_t flags)
{
return kvcalloc(n, size, flags);
}

static inline void __percpu *
bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align,
gfp_t flags)
Expand Down Expand Up @@ -2925,4 +2934,11 @@ static inline bool type_is_alloc(u32 type)
return type & MEM_ALLOC;
}

static inline gfp_t bpf_memcg_flags(gfp_t flags)
{
if (memcg_bpf_enabled())
return flags | __GFP_ACCOUNT;
return flags;
}

#endif /* _LINUX_BPF_H */
11 changes: 11 additions & 0 deletions include/linux/memcontrol.h
Original file line number Diff line number Diff line change
Expand Up @@ -1754,6 +1754,12 @@ struct obj_cgroup *get_obj_cgroup_from_page(struct page *page);
int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);

extern struct static_key_false memcg_bpf_enabled_key;
static inline bool memcg_bpf_enabled(void)
{
return static_branch_likely(&memcg_bpf_enabled_key);
}

extern struct static_key_false memcg_kmem_enabled_key;

static inline bool memcg_kmem_enabled(void)
Expand Down Expand Up @@ -1832,6 +1838,11 @@ static inline struct obj_cgroup *get_obj_cgroup_from_page(struct page *page)
return NULL;
}

static inline bool memcg_bpf_enabled(void)
{
return false;
}

static inline bool memcg_kmem_enabled(void)
{
return false;
Expand Down
4 changes: 2 additions & 2 deletions kernel/bpf/bpf_local_storage.c
Original file line number Diff line number Diff line change
Expand Up @@ -568,8 +568,8 @@ static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_att
nbuckets = max_t(u32, 2, nbuckets);
smap->bucket_log = ilog2(nbuckets);

smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
nbuckets, GFP_USER | __GFP_NOWARN);
if (!smap->buckets) {
bpf_map_area_free(smap);
return ERR_PTR(-ENOMEM);
Expand Down
13 changes: 7 additions & 6 deletions kernel/bpf/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include <linux/bpf_verifier.h>
#include <linux/nodemask.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>

#include <asm/barrier.h>
#include <asm/unaligned.h>
Expand Down Expand Up @@ -87,7 +88,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns

struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog_aux *aux;
struct bpf_prog *fp;

Expand All @@ -96,12 +97,12 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
if (fp == NULL)
return NULL;

aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT | gfp_extra_flags);
aux = kzalloc(sizeof(*aux), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
if (aux == NULL) {
vfree(fp);
return NULL;
}
fp->active = alloc_percpu_gfp(int, GFP_KERNEL_ACCOUNT | gfp_extra_flags);
fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
if (!fp->active) {
vfree(fp);
kfree(aux);
Expand All @@ -126,7 +127,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag

struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
{
gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog *prog;
int cpu;

Expand Down Expand Up @@ -159,7 +160,7 @@ int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)

prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo,
sizeof(*prog->aux->jited_linfo),
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
bpf_memcg_flags(GFP_KERNEL | __GFP_NOWARN));
if (!prog->aux->jited_linfo)
return -ENOMEM;

Expand Down Expand Up @@ -234,7 +235,7 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
gfp_t gfp_extra_flags)
{
gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog *fp;
u32 pages;

Expand Down
3 changes: 2 additions & 1 deletion kernel/bpf/memalloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,8 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
unit_size = size;

#ifdef CONFIG_MEMCG_KMEM
objcg = get_obj_cgroup_from_current();
if (memcg_bpf_enabled())
objcg = get_obj_cgroup_from_current();
#endif
for_each_possible_cpu(cpu) {
c = per_cpu_ptr(pc, cpu);
Expand Down
20 changes: 18 additions & 2 deletions kernel/bpf/syscall.c
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
* __GFP_RETRY_MAYFAIL to avoid such situations.
*/

const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT;
gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO);
unsigned int flags = 0;
unsigned long align = 1;
void *area;
Expand Down Expand Up @@ -418,7 +418,8 @@ static void bpf_map_save_memcg(struct bpf_map *map)
* So we have to check map->objcg for being NULL each time it's
* being used.
*/
map->objcg = get_obj_cgroup_from_current();
if (memcg_bpf_enabled())
map->objcg = get_obj_cgroup_from_current();
}

static void bpf_map_release_memcg(struct bpf_map *map)
Expand Down Expand Up @@ -464,6 +465,21 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
return ptr;
}

void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
gfp_t flags)
{
struct mem_cgroup *memcg, *old_memcg;
void *ptr;

memcg = bpf_map_get_memcg(map);
old_memcg = set_active_memcg(memcg);
ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT);
set_active_memcg(old_memcg);
mem_cgroup_put(memcg);

return ptr;
}

void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
size_t align, gfp_t flags)
{
Expand Down
18 changes: 18 additions & 0 deletions mm/memcontrol.c
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@ static bool cgroup_memory_nosocket __ro_after_init;
/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem __ro_after_init;

/* BPF memory accounting disabled? */
static bool cgroup_memory_nobpf __ro_after_init;

#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
Expand Down Expand Up @@ -347,6 +350,9 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
*/
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);

DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key);
EXPORT_SYMBOL(memcg_bpf_enabled_key);
#endif

/**
Expand Down Expand Up @@ -5357,6 +5363,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_inc(&memcg_sockets_enabled_key);

#if defined(CONFIG_MEMCG_KMEM)
if (!cgroup_memory_nobpf)
static_branch_inc(&memcg_bpf_enabled_key);
#endif

return &memcg->css;
}

Expand Down Expand Up @@ -5441,6 +5452,11 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
static_branch_dec(&memcg_sockets_enabled_key);

#if defined(CONFIG_MEMCG_KMEM)
if (!cgroup_memory_nobpf)
static_branch_dec(&memcg_bpf_enabled_key);
#endif

vmpressure_cleanup(&memcg->vmpressure);
cancel_work_sync(&memcg->high_work);
mem_cgroup_remove_from_trees(memcg);
Expand Down Expand Up @@ -7269,6 +7285,8 @@ static int __init cgroup_memory(char *s)
cgroup_memory_nosocket = true;
if (!strcmp(token, "nokmem"))
cgroup_memory_nokmem = true;
if (!strcmp(token, "nobpf"))
cgroup_memory_nobpf = true;
}
return 1;
}
Expand Down

0 comments on commit ab86cf3

Please sign in to comment.