Skip to content

Commit

Permalink
bpf: Make cgroup storages shared between programs on the same cgroup
Browse files Browse the repository at this point in the history
This change comes in several parts:

First, the restriction that the CGROUP_STORAGE map can only be used
by one program is removed. This results in the removal of the field
'aux' in struct bpf_cgroup_storage_map, and removal of relevant
code associated with the field, and removal of now-noop functions
bpf_free_cgroup_storage and bpf_cgroup_storage_release.

Second, we permit a key of type u64 as the key to the map.
Providing such a key type indicates that the map should ignore
attach type when comparing map keys. However, for simplicity newly
linked storage will still have the attach type at link time in
its key struct. cgroup_storage_check_btf is adapted to accept
u64 as the type of the key.

Third, because the storages are now shared, the storages cannot
be unconditionally freed on program detach. There could be two
ways to solve this issue:
* A. Reference count the usage of the storages, and free when the
     last program is detached.
* B. Free only when the storage can no longer be referred to,
     i.e. when either the cgroup_bpf it is attached to, or
     the map itself, is freed.
Option A has the side effect that, when the user detaches and
reattaches a program, whether the program gets a fresh storage
depends on whether there is another program attached using that
storage. This could trigger races if the user is multi-threaded,
and since nondeterminism in data races is evil, go with option B.

Both the map and the cgroup_bpf now track their associated
storages, and the storage unlink and free are removed from
cgroup_bpf_detach and added to cgroup_bpf_release and
cgroup_storage_map_free. The latter also now holds the cgroup_mutex
to prevent any races with the former.

Fourth, on attach, we reuse the old storage if the key already
exists in the map, via cgroup_storage_lookup. If the storage
does not exist yet, we create a new one, and publish it at the
last step in the attach process. This does not create a race
condition because for the whole attach the cgroup_mutex is held.
We keep track of an array of the new storages that were allocated,
and if the process fails only those new storages are freed.

Signed-off-by: YiFei Zhu <zhuyifei@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/d5401c6106728a00890401190db40020a1f84ff1.1595565795.git.zhuyifei@google.com
  • Loading branch information
YiFei Zhu authored and Alexei Starovoitov committed Jul 26, 2020
1 parent 9e5bd1f commit 7d9c342
Show file tree
Hide file tree
Showing 4 changed files with 164 additions and 143 deletions.
12 changes: 8 additions & 4 deletions include/linux/bpf-cgroup.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ struct bpf_cgroup_storage {
};
struct bpf_cgroup_storage_map *map;
struct bpf_cgroup_storage_key key;
struct list_head list;
struct list_head list_map;
struct list_head list_cg;
struct rb_node node;
struct rcu_head rcu;
};
Expand Down Expand Up @@ -78,6 +79,9 @@ struct cgroup_bpf {
struct list_head progs[MAX_BPF_ATTACH_TYPE];
u32 flags[MAX_BPF_ATTACH_TYPE];

/* list of cgroup shared storages */
struct list_head storages;

/* temp storage for effective prog array used by prog_attach/detach */
struct bpf_prog_array *inactive;

Expand Down Expand Up @@ -161,6 +165,9 @@ static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage
this_cpu_write(bpf_cgroup_storage[stype], storage[stype]);
}

struct bpf_cgroup_storage *
cgroup_storage_lookup(struct bpf_cgroup_storage_map *map,
void *key, bool locked);
struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
enum bpf_cgroup_storage_type stype);
void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage);
Expand All @@ -169,7 +176,6 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
enum bpf_attach_type type);
void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage);
int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map);
void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *map);

int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
Expand Down Expand Up @@ -383,8 +389,6 @@ static inline void bpf_cgroup_storage_set(
struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) {}
static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux,
struct bpf_map *map) { return 0; }
static inline void bpf_cgroup_storage_release(struct bpf_prog_aux *aux,
struct bpf_map *map) {}
static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return NULL; }
static inline void bpf_cgroup_storage_free(
Expand Down
67 changes: 39 additions & 28 deletions kernel/bpf/cgroup.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,17 +37,34 @@ static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[])
}

static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
struct bpf_prog *prog)
struct bpf_cgroup_storage *new_storages[],
enum bpf_attach_type type,
struct bpf_prog *prog,
struct cgroup *cgrp)
{
enum bpf_cgroup_storage_type stype;
struct bpf_cgroup_storage_key key;
struct bpf_map *map;

key.cgroup_inode_id = cgroup_id(cgrp);
key.attach_type = type;

for_each_cgroup_storage_type(stype) {
map = prog->aux->cgroup_storage[stype];
if (!map)
continue;

storages[stype] = cgroup_storage_lookup((void *)map, &key, false);
if (storages[stype])
continue;

storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
if (IS_ERR(storages[stype])) {
storages[stype] = NULL;
bpf_cgroup_storages_free(storages);
bpf_cgroup_storages_free(new_storages);
return -ENOMEM;
}

new_storages[stype] = storages[stype];
}

return 0;
Expand All @@ -63,7 +80,7 @@ static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[],
}

static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
struct cgroup* cgrp,
struct cgroup *cgrp,
enum bpf_attach_type attach_type)
{
enum bpf_cgroup_storage_type stype;
Expand All @@ -72,14 +89,6 @@ static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
}

/* Unlink a whole per-type array of cgroup storages: for every cgroup
 * storage type, pass storages[stype] to bpf_cgroup_storage_unlink().
 */
static void bpf_cgroup_storages_unlink(struct bpf_cgroup_storage *storages[])
{
enum bpf_cgroup_storage_type stype;

for_each_cgroup_storage_type(stype)
bpf_cgroup_storage_unlink(storages[stype]);
}

/* Called when bpf_cgroup_link is auto-detached from dying cgroup.
* It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It
* doesn't free link memory, which will eventually be done by bpf_link's
Expand All @@ -101,22 +110,23 @@ static void cgroup_bpf_release(struct work_struct *work)
struct cgroup *p, *cgrp = container_of(work, struct cgroup,
bpf.release_work);
struct bpf_prog_array *old_array;
struct list_head *storages = &cgrp->bpf.storages;
struct bpf_cgroup_storage *storage, *stmp;

unsigned int type;

mutex_lock(&cgroup_mutex);

for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
struct list_head *progs = &cgrp->bpf.progs[type];
struct bpf_prog_list *pl, *tmp;
struct bpf_prog_list *pl, *pltmp;

list_for_each_entry_safe(pl, tmp, progs, node) {
list_for_each_entry_safe(pl, pltmp, progs, node) {
list_del(&pl->node);
if (pl->prog)
bpf_prog_put(pl->prog);
if (pl->link)
bpf_cgroup_link_auto_detach(pl->link);
bpf_cgroup_storages_unlink(pl->storage);
bpf_cgroup_storages_free(pl->storage);
kfree(pl);
static_branch_dec(&cgroup_bpf_enabled_key);
}
Expand All @@ -126,6 +136,11 @@ static void cgroup_bpf_release(struct work_struct *work)
bpf_prog_array_free(old_array);
}

list_for_each_entry_safe(storage, stmp, storages, list_cg) {
bpf_cgroup_storage_unlink(storage);
bpf_cgroup_storage_free(storage);
}

mutex_unlock(&cgroup_mutex);

for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
Expand Down Expand Up @@ -290,6 +305,8 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
for (i = 0; i < NR; i++)
INIT_LIST_HEAD(&cgrp->bpf.progs[i]);

INIT_LIST_HEAD(&cgrp->bpf.storages);

for (i = 0; i < NR; i++)
if (compute_effective_progs(cgrp, i, &arrays[i]))
goto cleanup;
Expand Down Expand Up @@ -422,7 +439,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
struct list_head *progs = &cgrp->bpf.progs[type];
struct bpf_prog *old_prog = NULL;
struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
struct bpf_prog_list *pl;
int err;

Expand Down Expand Up @@ -455,17 +472,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
if (IS_ERR(pl))
return PTR_ERR(pl);

if (bpf_cgroup_storages_alloc(storage, prog ? : link->link.prog))
if (bpf_cgroup_storages_alloc(storage, new_storage, type,
prog ? : link->link.prog, cgrp))
return -ENOMEM;

if (pl) {
old_prog = pl->prog;
bpf_cgroup_storages_unlink(pl->storage);
bpf_cgroup_storages_assign(old_storage, pl->storage);
} else {
pl = kmalloc(sizeof(*pl), GFP_KERNEL);
if (!pl) {
bpf_cgroup_storages_free(storage);
bpf_cgroup_storages_free(new_storage);
return -ENOMEM;
}
list_add_tail(&pl->node, progs);
Expand All @@ -480,22 +496,19 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
if (err)
goto cleanup;

bpf_cgroup_storages_free(old_storage);
if (old_prog)
bpf_prog_put(old_prog);
else
static_branch_inc(&cgroup_bpf_enabled_key);
bpf_cgroup_storages_link(pl->storage, cgrp, type);
bpf_cgroup_storages_link(new_storage, cgrp, type);
return 0;

cleanup:
if (old_prog) {
pl->prog = old_prog;
pl->link = NULL;
}
bpf_cgroup_storages_free(pl->storage);
bpf_cgroup_storages_assign(pl->storage, old_storage);
bpf_cgroup_storages_link(pl->storage, cgrp, type);
bpf_cgroup_storages_free(new_storage);
if (!old_prog) {
list_del(&pl->node);
kfree(pl);
Expand Down Expand Up @@ -679,8 +692,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,

/* now can actually delete it from this cgroup list */
list_del(&pl->node);
bpf_cgroup_storages_unlink(pl->storage);
bpf_cgroup_storages_free(pl->storage);
kfree(pl);
if (list_empty(progs))
/* last program was detached, reset flags to zero */
Expand Down
12 changes: 0 additions & 12 deletions kernel/bpf/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -2097,24 +2097,12 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array,
: 0;
}

/* Walk every cgroup storage type and, for each slot of
 * aux->cgroup_storage[] that holds a map, call
 * bpf_cgroup_storage_release() on that map. Empty slots are skipped.
 */
static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux)
{
enum bpf_cgroup_storage_type stype;

for_each_cgroup_storage_type(stype) {
/* no map of this storage type is set in aux */
if (!aux->cgroup_storage[stype])
continue;
bpf_cgroup_storage_release(aux, aux->cgroup_storage[stype]);
}
}

void __bpf_free_used_maps(struct bpf_prog_aux *aux,
struct bpf_map **used_maps, u32 len)
{
struct bpf_map *map;
u32 i;

bpf_free_cgroup_storage(aux);
for (i = 0; i < len; i++) {
map = used_maps[i];
if (map->ops->map_poke_untrack)
Expand Down
Loading

0 comments on commit 7d9c342

Please sign in to comment.