From 0a1f7bfe35a3e1302529fa900bf0574a5dfc8ea6 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky
Date: Tue, 18 Jul 2023 01:38:09 -0700
Subject: [PATCH 1/4] bpf: Introduce internal definitions for UAPI-opaque
 bpf_{rb,list}_node

Structs bpf_rb_node and bpf_list_node are opaquely defined in
uapi/linux/bpf.h, as BPF program writers are not expected to touch their
fields - nor does the verifier allow them to do so.

Currently these structs are simple wrappers around structs rb_node and
list_head, and the linked_list / rbtree implementations just cast them
and pass the result to library functions for those data structures.
Later patches in this series, though, will add an "owner" field to
bpf_{rb,list}_node, such that they're not just wrapping an underlying
node type. Moreover, the bpf linked_list and rbtree implementations will
deal with these owner pointers directly in a few different places.

To avoid having to do

  void *owner = (void *)bpf_list_node + sizeof(struct list_head)

with opaque UAPI node types, add bpf_{list,rb}_node_kern struct
definitions to internal headers and modify linked_list and rbtree to use
the internal types where appropriate.

Signed-off-by: Dave Marchevsky
Link: https://lore.kernel.org/r/20230718083813.3416104-3-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h  | 10 ++++++++++
 kernel/bpf/helpers.c | 23 +++++++++++++----------
 2 files changed, 23 insertions(+), 10 deletions(-)
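To make the motivation concrete, here is a minimal illustrative sketch
(not part of this patch) contrasting the two access styles, assuming the
'owner' field that the later patches in this series add:

  /* With only the opaque UAPI type: manual offset arithmetic */
  void *owner = *(void **)((void *)node + sizeof(struct list_head));

  /* With the internal mirror type: an ordinary member access */
  struct bpf_list_node_kern *n = (struct bpf_list_node_kern *)node;
  void *owner = n->owner;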
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 360433f14496a..511ed49c3fe9c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -228,6 +228,16 @@ struct btf_record {
 	struct btf_field fields[];
 };
 
+/* Non-opaque version of bpf_rb_node in uapi/linux/bpf.h */
+struct bpf_rb_node_kern {
+	struct rb_node rb_node;
+} __attribute__((aligned(8)));
+
+/* Non-opaque version of bpf_list_node in uapi/linux/bpf.h */
+struct bpf_list_node_kern {
+	struct list_head list_head;
+} __attribute__((aligned(8)));
+
 struct bpf_map {
 	/* The first two cachelines with read-mostly members of which some
 	 * are also accessed in fast-path (e.g. ops, max_entries).
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 9e80efa59a5d6..d564ff97de0b5 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1942,10 +1942,11 @@ __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta
 	return (void *)p__refcounted_kptr;
 }
 
-static int __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head,
+static int __bpf_list_add(struct bpf_list_node_kern *node,
+			  struct bpf_list_head *head,
 			  bool tail, struct btf_record *rec, u64 off)
 {
-	struct list_head *n = (void *)node, *h = (void *)head;
+	struct list_head *n = &node->list_head, *h = (void *)head;
 
 	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
 	 * called on its fields, so init here
@@ -1967,20 +1968,20 @@ __bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head,
 					 struct bpf_list_node *node,
 					 void *meta__ign, u64 off)
 {
+	struct bpf_list_node_kern *n = (void *)node;
 	struct btf_struct_meta *meta = meta__ign;
 
-	return __bpf_list_add(node, head, false,
-			      meta ? meta->record : NULL, off);
+	return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off);
 }
 
 __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
 					struct bpf_list_node *node,
 					void *meta__ign, u64 off)
 {
+	struct bpf_list_node_kern *n = (void *)node;
 	struct btf_struct_meta *meta = meta__ign;
 
-	return __bpf_list_add(node, head, true,
-			      meta ? meta->record : NULL, off);
+	return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off);
 }
 
 static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
@@ -2013,7 +2014,7 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
 						  struct bpf_rb_node *node)
 {
 	struct rb_root_cached *r = (struct rb_root_cached *)root;
-	struct rb_node *n = (struct rb_node *)node;
+	struct rb_node *n = &((struct bpf_rb_node_kern *)node)->rb_node;
 
 	if (RB_EMPTY_NODE(n))
 		return NULL;
@@ -2026,11 +2027,12 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
 /* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF
  * program
  */
-static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
+static int __bpf_rbtree_add(struct bpf_rb_root *root,
+			    struct bpf_rb_node_kern *node,
 			    void *less, struct btf_record *rec, u64 off)
 {
 	struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node;
-	struct rb_node *parent = NULL, *n = (struct rb_node *)node;
+	struct rb_node *parent = NULL, *n = &node->rb_node;
 	bpf_callback_t cb = (bpf_callback_t)less;
 	bool leftmost = true;
 
@@ -2060,8 +2062,9 @@ __bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node
 				    void *meta__ign, u64 off)
 {
 	struct btf_struct_meta *meta = meta__ign;
+	struct bpf_rb_node_kern *n = (void *)node;
 
-	return __bpf_rbtree_add(root, node, (void *)less, meta ? meta->record : NULL, off);
+	return __bpf_rbtree_add(root, n, (void *)less, meta ? meta->record : NULL, off);
 }
 
 __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)

From c3c510ce431cd99fa10dcd50d995c8e89330ee5b Mon Sep 17 00:00:00 2001
From: Dave Marchevsky
Date: Tue, 18 Jul 2023 01:38:10 -0700
Subject: [PATCH 2/4] bpf: Add 'owner' field to bpf_{list,rb}_node

As described by Kumar in [0], in shared ownership scenarios it is
necessary to do runtime tracking of {rb,list} node ownership - and
synchronize updates using this ownership information - in order to
prevent races. This patch adds an 'owner' field to struct bpf_list_node
and bpf_rb_node to implement such runtime tracking.

The owner field is a void * that describes the ownership state of a
node. It can have the following values:

  NULL           - the node is not owned by any data structure
  BPF_PTR_POISON - the node is in the process of being added to a data
                   structure
  ptr_to_root    - the pointee is a data structure 'root'
                   (bpf_rb_root / bpf_list_head) which owns this node

The field is initially NULL (set by bpf_obj_init_field default behavior)
and transitions between states in the following sequence:

  Insertion: NULL -> BPF_PTR_POISON -> ptr_to_root
  Removal:   ptr_to_root -> NULL

Before a node has been successfully inserted, it is not protected by any
root's lock, and therefore two programs can attempt to add the same node
to different roots simultaneously. For this reason the intermediate
BPF_PTR_POISON state is necessary. For removal, the node is protected by
some root's lock so this intermediate hop isn't necessary.

Note that bpf_list_pop_{front,back} helpers don't need to check owner
before removing as the node-to-be-removed is not passed in as input and
is instead taken directly from the list. Do the check anyway and
WARN_ON_ONCE in this unexpected scenario.

Selftest changes in this patch are entirely mechanical: some BTF tests
have hardcoded struct sizes for structs that contain bpf_{list,rb}_node
fields, those were adjusted to account for the new sizes. Selftest
additions to validate the owner field are added in a further patch in
the series.

[0]: https://lore.kernel.org/bpf/d7hyspcow5wtjcmw4fugdgyp3fwhljwuscp3xyut5qnwivyeru@ysdq543otzv2

Signed-off-by: Dave Marchevsky
Suggested-by: Kumar Kartikeya Dwivedi
Link: https://lore.kernel.org/r/20230718083813.3416104-4-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h                                |  2 +
 include/uapi/linux/bpf.h                           |  2 +
 kernel/bpf/helpers.c                               | 29 ++++++-
 .../selftests/bpf/prog_tests/linked_list.c         | 78 +++++++++----------
 4 files changed, 68 insertions(+), 43 deletions(-)
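Distilled, the insertion-side state machine looks like the following
minimal sketch (illustrative, not the literal patch code; n/h are the
list_head views of node/head):

  /* Claim the node: NULL -> BPF_PTR_POISON. cmpxchg returns the old
   * value, so a non-NULL result means another program already claimed
   * or inserted this node, and this add must fail.
   */
  if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON))
          return -EINVAL;

  /* Link under the root's lock, then publish ownership:
   * BPF_PTR_POISON -> ptr_to_root
   */
  list_add_tail(n, h);
  WRITE_ONCE(node->owner, head);

The atomic claim is what closes the window in which two programs, each
holding a different root's lock, could link the same node twice.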
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 511ed49c3fe9c..ceaa8c23287fc 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -231,11 +231,13 @@ struct btf_record {
 /* Non-opaque version of bpf_rb_node in uapi/linux/bpf.h */
 struct bpf_rb_node_kern {
 	struct rb_node rb_node;
+	void *owner;
 } __attribute__((aligned(8)));
 
 /* Non-opaque version of bpf_list_node in uapi/linux/bpf.h */
 struct bpf_list_node_kern {
 	struct list_head list_head;
+	void *owner;
 } __attribute__((aligned(8)));
 
 struct bpf_map {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 600d0caebbd8a..9ed59896ebc5a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7052,6 +7052,7 @@ struct bpf_list_head {
 struct bpf_list_node {
 	__u64 :64;
 	__u64 :64;
+	__u64 :64;
 } __attribute__((aligned(8)));
 
 struct bpf_rb_root {
@@ -7063,6 +7064,7 @@ struct bpf_rb_node {
 	__u64 :64;
 	__u64 :64;
 	__u64 :64;
+	__u64 :64;
 } __attribute__((aligned(8)));
 
 struct bpf_refcount {
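The size arithmetic behind the mechanical selftest adjustments further
below, as a worked example (struct foo is the layout the BTF tests build
synthetically): bpf_list_node grows from 16 to 24 bytes and bpf_rb_node
from 24 to 32, so for instance:

  struct foo {
          struct bpf_list_head a;  /* 16 bytes, offset 0            */
          struct bpf_list_node b;  /* 16 -> 24 bytes, offset 16     */
          struct bpf_spin_lock c;  /*  4 bytes, offset 32 -> 40     */
  };                               /* total size: 36 -> 44 bytes    */

This is why, e.g., btf__add_struct(btf, "foo", 36) becomes 44 and the
expected "bpf_spin_lock at off=32" messages become "off=40".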
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index d564ff97de0b5..bcff584985e7d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1953,13 +1953,18 @@ static int __bpf_list_add(struct bpf_list_node_kern *node,
 	 */
 	if (unlikely(!h->next))
 		INIT_LIST_HEAD(h);
-	if (!list_empty(n)) {
+
+	/* node->owner != NULL implies !list_empty(n), no need to separately
+	 * check the latter
+	 */
+	if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
 		/* Only called from BPF prog, no need to migrate_disable */
 		__bpf_obj_drop_impl((void *)n - off, rec);
 		return -EINVAL;
 	}
 
 	tail ? list_add_tail(n, h) : list_add(n, h);
+	WRITE_ONCE(node->owner, head);
 
 	return 0;
 }
@@ -1987,6 +1992,7 @@ __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
 static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
 {
 	struct list_head *n, *h = (void *)head;
+	struct bpf_list_node_kern *node;
 
 	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
 	 * called on its fields, so init here
@@ -1995,8 +2001,14 @@ static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tai
 		INIT_LIST_HEAD(h);
 	if (list_empty(h))
 		return NULL;
+
 	n = tail ? h->prev : h->next;
+	node = container_of(n, struct bpf_list_node_kern, list_head);
+	if (WARN_ON_ONCE(READ_ONCE(node->owner) != head))
+		return NULL;
+
 	list_del_init(n);
+	WRITE_ONCE(node->owner, NULL);
 	return (struct bpf_list_node *)n;
 }
 
@@ -2013,14 +2025,19 @@ __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
 __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
 						  struct bpf_rb_node *node)
 {
+	struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
 	struct rb_root_cached *r = (struct rb_root_cached *)root;
-	struct rb_node *n = &((struct bpf_rb_node_kern *)node)->rb_node;
+	struct rb_node *n = &node_internal->rb_node;
 
-	if (RB_EMPTY_NODE(n))
+	/* node_internal->owner != root implies either RB_EMPTY_NODE(n) or
+	 * n is owned by some other tree. No need to check RB_EMPTY_NODE(n)
+	 */
+	if (READ_ONCE(node_internal->owner) != root)
 		return NULL;
 
 	rb_erase_cached(n, r);
 	RB_CLEAR_NODE(n);
+	WRITE_ONCE(node_internal->owner, NULL);
 	return (struct bpf_rb_node *)n;
 }
 
@@ -2036,7 +2053,10 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root,
 	bpf_callback_t cb = (bpf_callback_t)less;
 	bool leftmost = true;
 
-	if (!RB_EMPTY_NODE(n)) {
+	/* node->owner != NULL implies !RB_EMPTY_NODE(n), no need to separately
+	 * check the latter
+	 */
+	if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
 		/* Only called from BPF prog, no need to migrate_disable */
 		__bpf_obj_drop_impl((void *)n - off, rec);
 		return -EINVAL;
@@ -2054,6 +2074,7 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root,
 	rb_link_node(n, parent, link);
 	rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost);
+	WRITE_ONCE(node->owner, root);
 
 	return 0;
 }
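The removal side, as a matching minimal sketch (illustrative, not the
literal patch code): the check runs under the root's lock, so no poison
hop is needed; any owner other than this root refuses the removal:

  /* Only the recorded owner may remove the node. owner == NULL (never
   * inserted, or already removed) and owner == some_other_root both
   * fail this check.
   */
  if (READ_ONCE(node->owner) != root)
          return NULL;                    /* removal refused */

  rb_erase_cached(&node->rb_node, r);
  WRITE_ONCE(node->owner, NULL);          /* node may be inserted again */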
"pop_back_off", - "15: (bf) r1 = r6 ; R1_w=ptr_or_null_foo(id=4,ref_obj_id=4,off=40,imm=0) " - "R6_w=ptr_or_null_foo(id=4,ref_obj_id=4,off=40,imm=0) refs=2,4\n" + "15: (bf) r1 = r6 ; R1_w=ptr_or_null_foo(id=4,ref_obj_id=4,off=48,imm=0) " + "R6_w=ptr_or_null_foo(id=4,ref_obj_id=4,off=48,imm=0) refs=2,4\n" "16: (85) call bpf_this_cpu_ptr#154\nR1 type=ptr_or_null_ expected=percpu_ptr_" }, }; @@ -257,7 +257,7 @@ static struct btf *init_btf(void) hid = btf__add_struct(btf, "bpf_list_head", 16); if (!ASSERT_EQ(hid, LIST_HEAD, "btf__add_struct bpf_list_head")) goto end; - nid = btf__add_struct(btf, "bpf_list_node", 16); + nid = btf__add_struct(btf, "bpf_list_node", 24); if (!ASSERT_EQ(nid, LIST_NODE, "btf__add_struct bpf_list_node")) goto end; return btf; @@ -276,7 +276,7 @@ static void list_and_rb_node_same_struct(bool refcount_field) if (!ASSERT_OK_PTR(btf, "init_btf")) return; - bpf_rb_node_btf_id = btf__add_struct(btf, "bpf_rb_node", 24); + bpf_rb_node_btf_id = btf__add_struct(btf, "bpf_rb_node", 32); if (!ASSERT_GT(bpf_rb_node_btf_id, 0, "btf__add_struct bpf_rb_node")) return; @@ -286,17 +286,17 @@ static void list_and_rb_node_same_struct(bool refcount_field) return; } - id = btf__add_struct(btf, "bar", refcount_field ? 44 : 40); + id = btf__add_struct(btf, "bar", refcount_field ? 60 : 56); if (!ASSERT_GT(id, 0, "btf__add_struct bar")) return; err = btf__add_field(btf, "a", LIST_NODE, 0, 0); if (!ASSERT_OK(err, "btf__add_field bar::a")) return; - err = btf__add_field(btf, "c", bpf_rb_node_btf_id, 128, 0); + err = btf__add_field(btf, "c", bpf_rb_node_btf_id, 192, 0); if (!ASSERT_OK(err, "btf__add_field bar::c")) return; if (refcount_field) { - err = btf__add_field(btf, "ref", bpf_refcount_btf_id, 320, 0); + err = btf__add_field(btf, "ref", bpf_refcount_btf_id, 448, 0); if (!ASSERT_OK(err, "btf__add_field bar::ref")) return; } @@ -527,7 +527,7 @@ static void test_btf(void) btf = init_btf(); if (!ASSERT_OK_PTR(btf, "init_btf")) break; - id = btf__add_struct(btf, "foo", 36); + id = btf__add_struct(btf, "foo", 44); if (!ASSERT_EQ(id, 5, "btf__add_struct foo")) break; err = btf__add_field(btf, "a", LIST_HEAD, 0, 0); @@ -536,7 +536,7 @@ static void test_btf(void) err = btf__add_field(btf, "b", LIST_NODE, 128, 0); if (!ASSERT_OK(err, "btf__add_field foo::b")) break; - err = btf__add_field(btf, "c", SPIN_LOCK, 256, 0); + err = btf__add_field(btf, "c", SPIN_LOCK, 320, 0); if (!ASSERT_OK(err, "btf__add_field foo::c")) break; id = btf__add_decl_tag(btf, "contains:foo:b", 5, 0); @@ -553,7 +553,7 @@ static void test_btf(void) btf = init_btf(); if (!ASSERT_OK_PTR(btf, "init_btf")) break; - id = btf__add_struct(btf, "foo", 36); + id = btf__add_struct(btf, "foo", 44); if (!ASSERT_EQ(id, 5, "btf__add_struct foo")) break; err = btf__add_field(btf, "a", LIST_HEAD, 0, 0); @@ -562,13 +562,13 @@ static void test_btf(void) err = btf__add_field(btf, "b", LIST_NODE, 128, 0); if (!ASSERT_OK(err, "btf__add_field foo::b")) break; - err = btf__add_field(btf, "c", SPIN_LOCK, 256, 0); + err = btf__add_field(btf, "c", SPIN_LOCK, 320, 0); if (!ASSERT_OK(err, "btf__add_field foo::c")) break; id = btf__add_decl_tag(btf, "contains:bar:b", 5, 0); if (!ASSERT_EQ(id, 6, "btf__add_decl_tag contains:bar:b")) break; - id = btf__add_struct(btf, "bar", 36); + id = btf__add_struct(btf, "bar", 44); if (!ASSERT_EQ(id, 7, "btf__add_struct bar")) break; err = btf__add_field(btf, "a", LIST_HEAD, 0, 0); @@ -577,7 +577,7 @@ static void test_btf(void) err = btf__add_field(btf, "b", LIST_NODE, 128, 0); if (!ASSERT_OK(err, "btf__add_field 
bar::b")) break; - err = btf__add_field(btf, "c", SPIN_LOCK, 256, 0); + err = btf__add_field(btf, "c", SPIN_LOCK, 320, 0); if (!ASSERT_OK(err, "btf__add_field bar::c")) break; id = btf__add_decl_tag(btf, "contains:foo:b", 7, 0); @@ -594,19 +594,19 @@ static void test_btf(void) btf = init_btf(); if (!ASSERT_OK_PTR(btf, "init_btf")) break; - id = btf__add_struct(btf, "foo", 20); + id = btf__add_struct(btf, "foo", 28); if (!ASSERT_EQ(id, 5, "btf__add_struct foo")) break; err = btf__add_field(btf, "a", LIST_HEAD, 0, 0); if (!ASSERT_OK(err, "btf__add_field foo::a")) break; - err = btf__add_field(btf, "b", SPIN_LOCK, 128, 0); + err = btf__add_field(btf, "b", SPIN_LOCK, 192, 0); if (!ASSERT_OK(err, "btf__add_field foo::b")) break; id = btf__add_decl_tag(btf, "contains:bar:a", 5, 0); if (!ASSERT_EQ(id, 6, "btf__add_decl_tag contains:bar:a")) break; - id = btf__add_struct(btf, "bar", 16); + id = btf__add_struct(btf, "bar", 24); if (!ASSERT_EQ(id, 7, "btf__add_struct bar")) break; err = btf__add_field(btf, "a", LIST_NODE, 0, 0); @@ -623,19 +623,19 @@ static void test_btf(void) btf = init_btf(); if (!ASSERT_OK_PTR(btf, "init_btf")) break; - id = btf__add_struct(btf, "foo", 20); + id = btf__add_struct(btf, "foo", 28); if (!ASSERT_EQ(id, 5, "btf__add_struct foo")) break; err = btf__add_field(btf, "a", LIST_HEAD, 0, 0); if (!ASSERT_OK(err, "btf__add_field foo::a")) break; - err = btf__add_field(btf, "b", SPIN_LOCK, 128, 0); + err = btf__add_field(btf, "b", SPIN_LOCK, 192, 0); if (!ASSERT_OK(err, "btf__add_field foo::b")) break; id = btf__add_decl_tag(btf, "contains:bar:b", 5, 0); if (!ASSERT_EQ(id, 6, "btf__add_decl_tag contains:bar:b")) break; - id = btf__add_struct(btf, "bar", 36); + id = btf__add_struct(btf, "bar", 44); if (!ASSERT_EQ(id, 7, "btf__add_struct bar")) break; err = btf__add_field(btf, "a", LIST_HEAD, 0, 0); @@ -644,13 +644,13 @@ static void test_btf(void) err = btf__add_field(btf, "b", LIST_NODE, 128, 0); if (!ASSERT_OK(err, "btf__add_field bar::b")) break; - err = btf__add_field(btf, "c", SPIN_LOCK, 256, 0); + err = btf__add_field(btf, "c", SPIN_LOCK, 320, 0); if (!ASSERT_OK(err, "btf__add_field bar::c")) break; id = btf__add_decl_tag(btf, "contains:baz:a", 7, 0); if (!ASSERT_EQ(id, 8, "btf__add_decl_tag contains:baz:a")) break; - id = btf__add_struct(btf, "baz", 16); + id = btf__add_struct(btf, "baz", 24); if (!ASSERT_EQ(id, 9, "btf__add_struct baz")) break; err = btf__add_field(btf, "a", LIST_NODE, 0, 0); @@ -667,7 +667,7 @@ static void test_btf(void) btf = init_btf(); if (!ASSERT_OK_PTR(btf, "init_btf")) break; - id = btf__add_struct(btf, "foo", 36); + id = btf__add_struct(btf, "foo", 44); if (!ASSERT_EQ(id, 5, "btf__add_struct foo")) break; err = btf__add_field(btf, "a", LIST_HEAD, 0, 0); @@ -676,13 +676,13 @@ static void test_btf(void) err = btf__add_field(btf, "b", LIST_NODE, 128, 0); if (!ASSERT_OK(err, "btf__add_field foo::b")) break; - err = btf__add_field(btf, "c", SPIN_LOCK, 256, 0); + err = btf__add_field(btf, "c", SPIN_LOCK, 320, 0); if (!ASSERT_OK(err, "btf__add_field foo::c")) break; id = btf__add_decl_tag(btf, "contains:bar:b", 5, 0); if (!ASSERT_EQ(id, 6, "btf__add_decl_tag contains:bar:b")) break; - id = btf__add_struct(btf, "bar", 36); + id = btf__add_struct(btf, "bar", 44); if (!ASSERT_EQ(id, 7, "btf__add_struct bar")) break; err = btf__add_field(btf, "a", LIST_HEAD, 0, 0); @@ -691,13 +691,13 @@ static void test_btf(void) err = btf__add_field(btf, "b", LIST_NODE, 128, 0); if (!ASSERT_OK(err, "btf__add_field bar:b")) break; - err = btf__add_field(btf, "c", SPIN_LOCK, 
256, 0); + err = btf__add_field(btf, "c", SPIN_LOCK, 320, 0); if (!ASSERT_OK(err, "btf__add_field bar:c")) break; id = btf__add_decl_tag(btf, "contains:baz:a", 7, 0); if (!ASSERT_EQ(id, 8, "btf__add_decl_tag contains:baz:a")) break; - id = btf__add_struct(btf, "baz", 16); + id = btf__add_struct(btf, "baz", 24); if (!ASSERT_EQ(id, 9, "btf__add_struct baz")) break; err = btf__add_field(btf, "a", LIST_NODE, 0, 0); @@ -726,7 +726,7 @@ static void test_btf(void) id = btf__add_decl_tag(btf, "contains:bar:b", 5, 0); if (!ASSERT_EQ(id, 6, "btf__add_decl_tag contains:bar:b")) break; - id = btf__add_struct(btf, "bar", 36); + id = btf__add_struct(btf, "bar", 44); if (!ASSERT_EQ(id, 7, "btf__add_struct bar")) break; err = btf__add_field(btf, "a", LIST_HEAD, 0, 0); @@ -735,13 +735,13 @@ static void test_btf(void) err = btf__add_field(btf, "b", LIST_NODE, 128, 0); if (!ASSERT_OK(err, "btf__add_field bar::b")) break; - err = btf__add_field(btf, "c", SPIN_LOCK, 256, 0); + err = btf__add_field(btf, "c", SPIN_LOCK, 320, 0); if (!ASSERT_OK(err, "btf__add_field bar::c")) break; id = btf__add_decl_tag(btf, "contains:baz:b", 7, 0); if (!ASSERT_EQ(id, 8, "btf__add_decl_tag")) break; - id = btf__add_struct(btf, "baz", 36); + id = btf__add_struct(btf, "baz", 44); if (!ASSERT_EQ(id, 9, "btf__add_struct baz")) break; err = btf__add_field(btf, "a", LIST_HEAD, 0, 0); @@ -750,13 +750,13 @@ static void test_btf(void) err = btf__add_field(btf, "b", LIST_NODE, 128, 0); if (!ASSERT_OK(err, "btf__add_field bar::b")) break; - err = btf__add_field(btf, "c", SPIN_LOCK, 256, 0); + err = btf__add_field(btf, "c", SPIN_LOCK, 320, 0); if (!ASSERT_OK(err, "btf__add_field bar::c")) break; id = btf__add_decl_tag(btf, "contains:bam:a", 9, 0); if (!ASSERT_EQ(id, 10, "btf__add_decl_tag contains:bam:a")) break; - id = btf__add_struct(btf, "bam", 16); + id = btf__add_struct(btf, "bam", 24); if (!ASSERT_EQ(id, 11, "btf__add_struct bam")) break; err = btf__add_field(btf, "a", LIST_NODE, 0, 0); From fdf48dc2d054b73453f6e11ae39e96a559fc7144 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 18 Jul 2023 01:38:11 -0700 Subject: [PATCH 3/4] selftests/bpf: Add rbtree test exercising race which 'owner' field prevents This patch adds a runnable version of one of the races described by Kumar in [0]. Specifically, this interleaving: (rbtree1 and list head protected by lock1, rbtree2 protected by lock2) Prog A Prog B ====================================== n = bpf_obj_new(...) m = bpf_refcount_acquire(n) kptr_xchg(map, m) m = kptr_xchg(map, NULL) lock(lock2) bpf_rbtree_add(rbtree2, m->r, less) unlock(lock2) lock(lock1) bpf_list_push_back(head, n->l) /* make n non-owning ref */ bpf_rbtree_remove(rbtree1, n->r) unlock(lock1) The above interleaving, the node's struct bpf_rb_node *r can be used to add it to either rbtree1 or rbtree2, which are protected by different locks. If the node has been added to rbtree2, we should not be allowed to remove it while holding rbtree1's lock. Before changes in the previous patch in this series, the rbtree_remove in the second part of Prog A would succeed as the verifier has no way of knowing which tree owns a particular node at verification time. The addition of 'owner' field results in bpf_rbtree_remove correctly failing. The test added in this patch splits "Prog A" above into two separate BPF programs - A1 and A2 - and uses a second mapval + kptr_xchg to pass n from A1 to A2 similarly to the pass from A1 to B. If the test is run without the fix applied, the remove will succeed. 
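To spell out how the owner field defeats this interleaving at runtime,
here is an annotated sketch of the sequence above (using the diagram's
names, not the test's code):

  /* Prog B, under lock2 */
  bpf_rbtree_add(rbtree2, m->r, less);
          /* r->owner: NULL -> BPF_PTR_POISON -> rbtree2 */

  /* Prog A (A2 in the test), under lock1 */
  bpf_rbtree_remove(rbtree1, n->r);
          /* READ_ONCE(r->owner) == rbtree2 != rbtree1,
           * so the kfunc returns NULL instead of corrupting rbtree2
           */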
diff --git a/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c b/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c
index 595cbf92bff5f..1cb5c657dc38e 100644
--- a/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c
+++ b/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c
@@ -14,3 +14,31 @@ void test_refcounted_kptr(void)
 void test_refcounted_kptr_fail(void)
 {
 }
+
+void test_refcounted_kptr_wrong_owner(void)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, opts,
+		    .data_in = &pkt_v4,
+		    .data_size_in = sizeof(pkt_v4),
+		    .repeat = 1,
+	);
+	struct refcounted_kptr *skel;
+	int ret;
+
+	skel = refcounted_kptr__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "refcounted_kptr__open_and_load"))
+		return;
+
+	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.rbtree_wrong_owner_remove_fail_a1), &opts);
+	ASSERT_OK(ret, "rbtree_wrong_owner_remove_fail_a1");
+	ASSERT_OK(opts.retval, "rbtree_wrong_owner_remove_fail_a1 retval");
+
+	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.rbtree_wrong_owner_remove_fail_b), &opts);
+	ASSERT_OK(ret, "rbtree_wrong_owner_remove_fail_b");
+	ASSERT_OK(opts.retval, "rbtree_wrong_owner_remove_fail_b retval");
+
+	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.rbtree_wrong_owner_remove_fail_a2), &opts);
+	ASSERT_OK(ret, "rbtree_wrong_owner_remove_fail_a2");
+	ASSERT_OK(opts.retval, "rbtree_wrong_owner_remove_fail_a2 retval");
+	refcounted_kptr__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/refcounted_kptr.c b/tools/testing/selftests/bpf/progs/refcounted_kptr.c
index a3da610b1e6b0..c55652fdc63ab 100644
--- a/tools/testing/selftests/bpf/progs/refcounted_kptr.c
+++ b/tools/testing/selftests/bpf/progs/refcounted_kptr.c
@@ -24,7 +24,7 @@ struct {
 	__uint(type, BPF_MAP_TYPE_ARRAY);
 	__type(key, int);
 	__type(value, struct map_value);
-	__uint(max_entries, 1);
+	__uint(max_entries, 2);
 } stashed_nodes SEC(".maps");
 
 struct node_acquire {
@@ -42,6 +42,9 @@ private(A) struct bpf_list_head head __contains(node_data, l);
 private(B) struct bpf_spin_lock alock;
 private(B) struct bpf_rb_root aroot __contains(node_acquire, node);
 
+private(C) struct bpf_spin_lock block;
+private(C) struct bpf_rb_root broot __contains(node_data, r);
+
 static bool less(struct bpf_rb_node *node_a, const struct bpf_rb_node *node_b)
 {
 	struct node_data *a;
@@ -405,4 +408,93 @@ long rbtree_refcounted_node_ref_escapes_owning_input(void *ctx)
 	return 0;
 }
 
+static long __stash_map_empty_xchg(struct node_data *n, int idx)
+{
+	struct map_value *mapval = bpf_map_lookup_elem(&stashed_nodes, &idx);
+
+	if (!mapval) {
+		bpf_obj_drop(n);
+		return 1;
+	}
+	n = bpf_kptr_xchg(&mapval->node, n);
+	if (n) {
+		bpf_obj_drop(n);
+		return 2;
+	}
+	return 0;
+}
+
+SEC("tc")
+long rbtree_wrong_owner_remove_fail_a1(void *ctx)
+{
+	struct node_data *n, *m;
+
+	n = bpf_obj_new(typeof(*n));
+	if (!n)
+		return 1;
+	m = bpf_refcount_acquire(n);
+
+	if (__stash_map_empty_xchg(n, 0)) {
+		bpf_obj_drop(m);
+		return 2;
+	}
+
+	if (__stash_map_empty_xchg(m, 1))
+		return 3;
+
+	return 0;
+}
+
+SEC("tc")
+long rbtree_wrong_owner_remove_fail_b(void *ctx)
+{
+	struct map_value *mapval;
+	struct node_data *n;
+	int idx = 0;
+
+	mapval = bpf_map_lookup_elem(&stashed_nodes, &idx);
+	if (!mapval)
+		return 1;
+
+	n = bpf_kptr_xchg(&mapval->node, NULL);
+	if (!n)
+		return 2;
+
+	bpf_spin_lock(&block);
+
+	bpf_rbtree_add(&broot, &n->r, less);
+
+	bpf_spin_unlock(&block);
+	return 0;
+}
+
+SEC("tc")
+long rbtree_wrong_owner_remove_fail_a2(void *ctx)
+{
+	struct map_value *mapval;
+	struct bpf_rb_node *res;
+	struct node_data *m;
+	int idx = 1;
+
+	mapval = bpf_map_lookup_elem(&stashed_nodes, &idx);
+	if (!mapval)
+		return 1;
+
+	m = bpf_kptr_xchg(&mapval->node, NULL);
+	if (!m)
+		return 2;
+	bpf_spin_lock(&lock);
+
+	/* make m non-owning ref */
+	bpf_list_push_back(&head, &m->l);
+	res = bpf_rbtree_remove(&root, &m->r);
+
+	bpf_spin_unlock(&lock);
+	if (res) {
+		bpf_obj_drop(container_of(res, struct node_data, r));
+		return 3;
+	}
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";

From f3514a5d67403f803eadd39cf61986638101e755 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky
Date: Tue, 18 Jul 2023 01:38:12 -0700
Subject: [PATCH 4/4] selftests/bpf: Disable newly-added 'owner' field test
 until refcount re-enabled

The test added in the previous patch will fail with bpf_refcount_acquire
disabled. Until all races are fixed and bpf_refcount_acquire is
re-enabled on bpf-next, disable the test so CI doesn't complain.

Signed-off-by: Dave Marchevsky
Link: https://lore.kernel.org/r/20230718083813.3416104-6-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov
---
 .../bpf/prog_tests/refcounted_kptr.c          | 24 -------------------
 1 file changed, 24 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c b/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c
index 1cb5c657dc38e..7423983472c7c 100644
--- a/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c
+++ b/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c
@@ -17,28 +17,4 @@ void test_refcounted_kptr_fail(void)
 
 void test_refcounted_kptr_wrong_owner(void)
 {
-	LIBBPF_OPTS(bpf_test_run_opts, opts,
-		    .data_in = &pkt_v4,
-		    .data_size_in = sizeof(pkt_v4),
-		    .repeat = 1,
-	);
-	struct refcounted_kptr *skel;
-	int ret;
-
-	skel = refcounted_kptr__open_and_load();
-	if (!ASSERT_OK_PTR(skel, "refcounted_kptr__open_and_load"))
-		return;
-
-	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.rbtree_wrong_owner_remove_fail_a1), &opts);
-	ASSERT_OK(ret, "rbtree_wrong_owner_remove_fail_a1");
-	ASSERT_OK(opts.retval, "rbtree_wrong_owner_remove_fail_a1 retval");
-
-	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.rbtree_wrong_owner_remove_fail_b), &opts);
-	ASSERT_OK(ret, "rbtree_wrong_owner_remove_fail_b");
-	ASSERT_OK(opts.retval, "rbtree_wrong_owner_remove_fail_b retval");
-
-	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.rbtree_wrong_owner_remove_fail_a2), &opts);
-	ASSERT_OK(ret, "rbtree_wrong_owner_remove_fail_a2");
-	ASSERT_OK(opts.retval, "rbtree_wrong_owner_remove_fail_a2 retval");
-	refcounted_kptr__destroy(skel);
 }
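For reference: once re-enabled, the wrong-owner test can be run on its
own with the usual selftests runner (assuming a standard
tools/testing/selftests/bpf build):

  ./test_progs -t refcounted_kptr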