From ba2b31b0f39fca12abbd21c53a92838bbc026023 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 1 Apr 2025 14:22:45 +0800 Subject: [PATCH 1/6] bpf: Factor out htab_elem_value helper() All hash maps store map key and map value together. The relative offset of the map value compared to the map key is round_up(key_size, 8). Therefore, factor out a common helper htab_elem_value() to calculate the address of the map value instead of duplicating the logic. Acked-by: Andrii Nakryiko Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20250401062250.543403-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 64 +++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 5a5adc66b8e2..0bebc919bbf7 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -175,20 +175,25 @@ static bool htab_is_percpu(const struct bpf_htab *htab) htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } +static inline void *htab_elem_value(struct htab_elem *l, u32 key_size) +{ + return l->key + round_up(key_size, 8); +} + static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { - *(void __percpu **)(l->key + roundup(key_size, 8)) = pptr; + *(void __percpu **)htab_elem_value(l, key_size) = pptr; } static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) { - return *(void __percpu **)(l->key + roundup(key_size, 8)); + return *(void __percpu **)htab_elem_value(l, key_size); } static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) { - return *(void **)(l->key + roundup(map->key_size, 8)); + return *(void **)htab_elem_value(l, map->key_size); } static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) @@ -215,10 +220,10 @@ static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) elem = get_htab_elem(htab, i); if (btf_record_has_field(htab->map.record, BPF_TIMER)) bpf_obj_free_timer(htab->map.record, - elem->key + round_up(htab->map.key_size, 8)); + htab_elem_value(elem, htab->map.key_size)); if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, - elem->key + round_up(htab->map.key_size, 8)); + htab_elem_value(elem, htab->map.key_size)); cond_resched(); } } @@ -245,7 +250,8 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab) cond_resched(); } } else { - bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); + bpf_obj_free_fields(htab->map.record, + htab_elem_value(elem, htab->map.key_size)); cond_resched(); } cond_resched(); @@ -670,7 +676,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) struct htab_elem *l = __htab_map_lookup_elem(map, key); if (l) - return l->key + round_up(map->key_size, 8); + return htab_elem_value(l, map->key_size); return NULL; } @@ -709,7 +715,7 @@ static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, if (l) { if (mark) bpf_lru_node_set_ref(&l->lru_node); - return l->key + round_up(map->key_size, 8); + return htab_elem_value(l, map->key_size); } return NULL; @@ -763,7 +769,7 @@ static void check_and_free_fields(struct bpf_htab *htab, for_each_possible_cpu(cpu) bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); } else { - void *map_value = elem->key + round_up(htab->map.key_size, 8); + void *map_value = htab_elem_value(elem, htab->map.key_size); bpf_obj_free_fields(htab->map.record, map_value); } @@ -1039,11 
+1045,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, htab_elem_set_ptr(l_new, key_size, pptr); } else if (fd_htab_map_needs_adjust(htab)) { size = round_up(size, 8); - memcpy(l_new->key + round_up(key_size, 8), value, size); + memcpy(htab_elem_value(l_new, key_size), value, size); } else { - copy_map_value(&htab->map, - l_new->key + round_up(key_size, 8), - value); + copy_map_value(&htab->map, htab_elem_value(l_new, key_size), value); } l_new->hash = hash; @@ -1106,7 +1110,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (l_old) { /* grab the element lock and update value in place */ copy_map_value_locked(map, - l_old->key + round_up(key_size, 8), + htab_elem_value(l_old, key_size), value, false); return 0; } @@ -1134,7 +1138,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, * and update element in place */ copy_map_value_locked(map, - l_old->key + round_up(key_size, 8), + htab_elem_value(l_old, key_size), value, false); ret = 0; goto err; @@ -1220,8 +1224,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value l_new = prealloc_lru_pop(htab, key, hash); if (!l_new) return -ENOMEM; - copy_map_value(&htab->map, - l_new->key + round_up(map->key_size, 8), value); + copy_map_value(&htab->map, htab_elem_value(l_new, map->key_size), value); ret = htab_lock_bucket(b, &flags); if (ret) @@ -1500,10 +1503,10 @@ static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) /* We only free timer on uref dropping to zero */ if (btf_record_has_field(htab->map.record, BPF_TIMER)) bpf_obj_free_timer(htab->map.record, - l->key + round_up(htab->map.key_size, 8)); + htab_elem_value(l, htab->map.key_size)); if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, - l->key + round_up(htab->map.key_size, 8)); + htab_elem_value(l, htab->map.key_size)); } cond_resched_rcu(); } @@ -1615,15 +1618,12 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, off += roundup_value_size; } } else { - u32 roundup_key_size = round_up(map->key_size, 8); + void *src = htab_elem_value(l, map->key_size); if (flags & BPF_F_LOCK) - copy_map_value_locked(map, value, l->key + - roundup_key_size, - true); + copy_map_value_locked(map, value, src, true); else - copy_map_value(map, value, l->key + - roundup_key_size); + copy_map_value(map, value, src); /* Zeroing special fields in the temp buffer */ check_and_init_map_value(map, value); } @@ -1680,12 +1680,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, bool is_percpu) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - u32 bucket_cnt, total, key_size, value_size, roundup_key_size; void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; void __user *uvalues = u64_to_user_ptr(attr->batch.values); void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); u32 batch, max_count, size, bucket_size, map_id; + u32 bucket_cnt, total, key_size, value_size; struct htab_elem *node_to_free = NULL; u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; @@ -1720,7 +1720,6 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, return -ENOENT; key_size = htab->map.key_size; - roundup_key_size = round_up(htab->map.key_size, 8); value_size = htab->map.value_size; size = round_up(value_size, 8); if (is_percpu) @@ -1812,7 +1811,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, off += 
size; } } else { - value = l->key + roundup_key_size; + value = htab_elem_value(l, key_size); if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { struct bpf_map **inner_map = value; @@ -2063,11 +2062,11 @@ static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) { struct bpf_iter_seq_hash_map_info *info = seq->private; - u32 roundup_key_size, roundup_value_size; struct bpf_iter__bpf_map_elem ctx = {}; struct bpf_map *map = info->map; struct bpf_iter_meta meta; int ret = 0, off = 0, cpu; + u32 roundup_value_size; struct bpf_prog *prog; void __percpu *pptr; @@ -2077,10 +2076,9 @@ static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) ctx.meta = &meta; ctx.map = info->map; if (elem) { - roundup_key_size = round_up(map->key_size, 8); ctx.key = elem->key; if (!info->percpu_value_buf) { - ctx.value = elem->key + roundup_key_size; + ctx.value = htab_elem_value(elem, map->key_size); } else { roundup_value_size = round_up(map->value_size, 8); pptr = htab_elem_get_ptr(elem, map->key_size); @@ -2165,7 +2163,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ struct hlist_nulls_head *head; struct hlist_nulls_node *n; struct htab_elem *elem; - u32 roundup_key_size; int i, num_elems = 0; void __percpu *pptr; struct bucket *b; @@ -2180,7 +2177,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ is_percpu = htab_is_percpu(htab); - roundup_key_size = round_up(map->key_size, 8); /* migration has been disabled, so percpu value prepared here will be * the same as the one seen by the bpf program with * bpf_map_lookup_elem(). @@ -2196,7 +2192,7 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ pptr = htab_elem_get_ptr(elem, map->key_size); val = this_cpu_ptr(pptr); } else { - val = elem->key + roundup_key_size; + val = htab_elem_value(elem, map->key_size); } num_elems++; ret = callback_fn((u64)(long)map, (u64)(long)key, From 5771e306b6cd8ce5b9935d006765f887f145e6d5 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 1 Apr 2025 14:22:46 +0800 Subject: [PATCH 2/6] bpf: Rename __htab_percpu_map_update_elem to htab_map_update_elem_in_place Rename __htab_percpu_map_update_elem to htab_map_update_elem_in_place, and add a new percpu argument for the helper to support in-place update for both per-cpu htab and htab of maps. 
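As a rough sketch (illustrative only; the flag values mirror the call sites in this patch and in the follow-up patch for htab of maps), the new percpu argument is used like this:

    /* per-cpu htab: copy the value in place on the current CPU */
    htab_map_update_elem_in_place(map, key, value, map_flags, true, false);

    /* htab of maps (next patch): replace the stored inner map pointer in place */
    htab_map_update_elem_in_place(map, key, &ptr, map_flags, false, false);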
Acked-by: Andrii Nakryiko Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20250401062250.543403-3-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 0bebc919bbf7..9778e9871d86 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1258,12 +1258,12 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value return ret; } -static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, +static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, void *value, u64 map_flags, - bool onallcpus) + bool percpu, bool onallcpus) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l_new = NULL, *l_old; + struct htab_elem *l_new, *l_old; struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; @@ -1295,19 +1295,18 @@ static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, goto err; if (l_old) { - /* per-cpu hash map can update value in-place */ + /* Update value in-place */ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), value, onallcpus); } else { l_new = alloc_htab_elem(htab, key, value, key_size, - hash, true, onallcpus, NULL); + hash, percpu, onallcpus, NULL); if (IS_ERR(l_new)) { ret = PTR_ERR(l_new); goto err; } hlist_nulls_add_head_rcu(&l_new->hash_node, head); } - ret = 0; err: htab_unlock_bucket(b, flags); return ret; @@ -1386,7 +1385,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, static long htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { - return __htab_percpu_map_update_elem(map, key, value, map_flags, false); + return htab_map_update_elem_in_place(map, key, value, map_flags, true, false); } static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, @@ -2407,8 +2406,8 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, ret = __htab_lru_percpu_map_update_elem(map, key, value, map_flags, true); else - ret = __htab_percpu_map_update_elem(map, key, value, map_flags, - true); + ret = htab_map_update_elem_in_place(map, key, value, map_flags, + true, true); rcu_read_unlock(); return ret; From 2c304172e03193bd02363ee8969444261f7b7a57 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 1 Apr 2025 14:22:47 +0800 Subject: [PATCH 3/6] bpf: Support atomic update for htab of maps As reported by Cody Haas [1], when there are concurrent map lookup and map update operations on an existing element of an htab of maps, the map lookup procedure may return -ENOENT unexpectedly. The root cause is twofold: 1) the update of an existing element involves two separate list operations In htab_map_update_elem(), it first inserts the new element at the head of the list, then it deletes the old element. Therefore, it is possible that a lookup operation has already iterated to the middle of the list when a concurrent update operation begins, and the lookup operation will fail to find the target element. 2) the immediate reuse of the htab element. It is more subtle. Even though the lookup operation finds the old element, it is possible that the target element has been removed by a concurrent update operation, and the element has been reused immediately by another update operation which runs on the same CPU as the previous update operation, and the element is inserted into the same bucket list.
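An illustrative interleaving of the second cause (a sketch for exposition only; which CPU runs which step is not essential):

    lookup (CPU 0)                        update (CPU 1)
    walks the bucket list and
    reaches the target element l
                                          removes l from the bucket and
                                          releases it for reuse
                                          a second update on the same CPU
                                          reuses l right away, overwrites
                                          l->key with a different key and
                                          re-inserts l into the same bucket
    compares l->key with the expected
    key; the comparison fails and the
    lookup may return -ENOENT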
After the steps above, when the lookup operation tries to compare the key in the old element with the expected key, the match will fail because the key in the old element has been overwritten by another update operation. The two-step update process is relatively straightforward to address. The more challenging aspect is the immediate reuse. As Alexei pointed out: So since 2022 both prealloc and no_prealloc reuse elements. We can consider a new flag for the hash map like F_REUSE_AFTER_RCU_GP that will use _rcu() flavor of freeing into bpf_ma, but it has to have a strong reason. Given that htab of maps doesn't support special fields in the value and directly stores the inner map pointer in htab_elem, just do in-place update for htab of maps instead of attempting to address the immediate reuse issue. [1]: https://lore.kernel.org/xdp-newbies/CAH7f-ULFTwKdoH_t2SFc5rWCVYLEg-14d1fBYWH2eekudsnTRg@mail.gmail.com/ Acked-by: Andrii Nakryiko Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20250401062250.543403-4-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 9778e9871d86..4879c79dd677 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1076,10 +1076,9 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l_new = NULL, *l_old; + struct htab_elem *l_new, *l_old; struct hlist_nulls_head *head; unsigned long flags; - void *old_map_ptr; struct bucket *b; u32 key_size, hash; int ret; @@ -1160,24 +1159,14 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, hlist_nulls_del_rcu(&l_old->hash_node); /* l_old has already been stashed in htab->extra_elems, free - * its special fields before it is available for reuse. Also - * save the old map pointer in htab of maps before unlock - * and release it after unlock.
*/ - old_map_ptr = NULL; - if (htab_is_prealloc(htab)) { - if (map->ops->map_fd_put_ptr) - old_map_ptr = fd_htab_map_get_ptr(map, l_old); + if (htab_is_prealloc(htab)) check_and_free_fields(htab, l_old); - } } htab_unlock_bucket(b, flags); - if (l_old) { - if (old_map_ptr) - map->ops->map_fd_put_ptr(map, old_map_ptr, true); - if (!htab_is_prealloc(htab)) - free_htab_elem(htab, l_old); - } + if (l_old && !htab_is_prealloc(htab)) + free_htab_elem(htab, l_old); return 0; err: htab_unlock_bucket(b, flags); @@ -1265,6 +1254,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new, *l_old; struct hlist_nulls_head *head; + void *old_map_ptr = NULL; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -1296,8 +1286,15 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, if (l_old) { /* Update value in-place */ - pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), - value, onallcpus); + if (percpu) { + pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), + value, onallcpus); + } else { + void **inner_map_pptr = htab_elem_value(l_old, key_size); + + old_map_ptr = *inner_map_pptr; + WRITE_ONCE(*inner_map_pptr, *(void **)value); + } } else { l_new = alloc_htab_elem(htab, key, value, key_size, hash, percpu, onallcpus, NULL); @@ -1309,6 +1306,8 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, } err: htab_unlock_bucket(b, flags); + if (old_map_ptr) + map->ops->map_fd_put_ptr(map, old_map_ptr, true); return ret; } @@ -2531,24 +2530,23 @@ int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) return ret; } -/* only called from syscall */ +/* Only called from syscall */ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags) { void *ptr; int ret; - u32 ufd = *(u32 *)value; - ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); + ptr = map->ops->map_fd_get_ptr(map, map_file, *(int *)value); if (IS_ERR(ptr)) return PTR_ERR(ptr); /* The htab bucket lock is always held during update operations in fd * htab map, and the following rcu_read_lock() is only used to avoid - * the WARN_ON_ONCE in htab_map_update_elem(). + * the WARN_ON_ONCE in htab_map_update_elem_in_place(). */ rcu_read_lock(); - ret = htab_map_update_elem(map, key, &ptr, map_flags); + ret = htab_map_update_elem_in_place(map, key, &ptr, map_flags, false, false); rcu_read_unlock(); if (ret) map->ops->map_fd_put_ptr(map, ptr, false); From e8a65856c75d518d0bb15f38c90a4fd264ba1d3a Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 1 Apr 2025 14:22:48 +0800 Subject: [PATCH 4/6] bpf: Add is_fd_htab() helper Add is_fd_htab() helper to check whether the map is htab of maps. 
Acked-by: Andrii Nakryiko Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20250401062250.543403-5-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4879c79dd677..097992efef05 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -175,6 +175,11 @@ static bool htab_is_percpu(const struct bpf_htab *htab) htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } +static inline bool is_fd_htab(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS; +} + static inline void *htab_elem_value(struct htab_elem *l, u32 key_size) { return l->key + round_up(key_size, 8); @@ -974,8 +979,7 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) { - return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && - BITS_PER_LONG == 64; + return is_fd_htab(htab) && BITS_PER_LONG == 64; } static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, @@ -1810,7 +1814,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, } } else { value = htab_elem_value(l, key_size); - if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + if (is_fd_htab(htab)) { struct bpf_map **inner_map = value; /* Actual value is the id of the inner map */ From 6704b1e8cfc5eed264065735fe00a1dd8a0bffef Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 1 Apr 2025 14:22:49 +0800 Subject: [PATCH 5/6] bpf: Don't allocate per-cpu extra_elems for fd htab The update of element in fd htab is in-place now, therefore, there is no need to allocate per-cpu extra_elems, just remove it. Acked-by: Andrii Nakryiko Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20250401062250.543403-6-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 097992efef05..2e18d7e50d9b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -206,9 +206,13 @@ static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size); } +/* Both percpu and fd htab support in-place update, so no need for + * extra elem. LRU itself can remove the least used element, so + * there is no need for an extra elem during map_update. + */ static bool htab_has_extra_elems(struct bpf_htab *htab) { - return !htab_is_percpu(htab) && !htab_is_lru(htab); + return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab); } static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) @@ -464,8 +468,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); - bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || - attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); /* percpu_lru means each cpu has its own LRU list. * it is different from BPF_MAP_TYPE_PERCPU_HASH where * the map's value itself is percpu. percpu_lru has @@ -560,10 +562,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (err) goto free_map_locked; - if (!percpu && !lru) { - /* lru itself can remove the least used element, so - * there is no need for an extra elem during map_update. 
- */ + if (htab_has_extra_elems(htab)) { err = alloc_extra_elems(htab); if (err) goto free_prealloc; From 7c6fb1cf33fb9a7b89a0d9feada957d0fe56de6f Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 1 Apr 2025 14:22:50 +0800 Subject: [PATCH 6/6] selftests/bpf: Add test case for atomic update of fd htab Add a test case to verify the atomic update of existing elements in the htab of maps. The test proceeds in three steps: 1) fill the outer map with keys in the range [0, 8) For each inner array map, the value of its first element is set to the key used to look up the inner map. 2) create 16 threads to look up these keys concurrently Each lookup thread first looks up the inner map, then checks whether the first value of the inner array map is the same as the key used to look up the inner map. 3) create 8 threads to overwrite these keys concurrently Each update thread first creates an inner array map, sets the first value of the array to the key used to update the outer map, and then uses the key and the inner map to update the outer map. Without atomic update support, the lookup operation may return -ENOENT during the lookup of the outer map, or return -EINVAL during the comparison of the first value in the inner map with the key used for the inner map, and the test will fail. After the atomic update change, both the lookup and the comparison will succeed. Given that updating the outer map is slow, the test case sets the loop count for each thread to 5 to reduce the total running time. However, the loop count can also be adjusted through the FD_HTAB_LOOP_NR environment variable. Acked-by: Andrii Nakryiko Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20250401062250.543403-7-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/fd_htab_lookup.c | 192 ++++++++++++++++++ .../selftests/bpf/progs/fd_htab_lookup.c | 25 +++ 2 files changed, 217 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/fd_htab_lookup.c create mode 100644 tools/testing/selftests/bpf/progs/fd_htab_lookup.c diff --git a/tools/testing/selftests/bpf/prog_tests/fd_htab_lookup.c b/tools/testing/selftests/bpf/prog_tests/fd_htab_lookup.c new file mode 100644 index 000000000000..ca46fdd6e1ae --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/fd_htab_lookup.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025.
Huawei Technologies Co., Ltd */ +#define _GNU_SOURCE +#include <stdbool.h> +#include <test_progs.h> +#include "fd_htab_lookup.skel.h" + +struct htab_op_ctx { + int fd; + int loop; + unsigned int entries; + bool stop; +}; + +#define ERR_TO_RETVAL(where, err) ((void *)(long)(((where) << 12) | (-err))) + +static void *htab_lookup_fn(void *arg) +{ + struct htab_op_ctx *ctx = arg; + int i = 0; + + while (i++ < ctx->loop && !ctx->stop) { + unsigned int j; + + for (j = 0; j < ctx->entries; j++) { + unsigned int key = j, zero = 0, value; + int inner_fd, err; + + err = bpf_map_lookup_elem(ctx->fd, &key, &value); + if (err) { + ctx->stop = true; + return ERR_TO_RETVAL(1, err); + } + + inner_fd = bpf_map_get_fd_by_id(value); + if (inner_fd < 0) { + /* The old map has been freed */ + if (inner_fd == -ENOENT) + continue; + ctx->stop = true; + return ERR_TO_RETVAL(2, inner_fd); + } + + err = bpf_map_lookup_elem(inner_fd, &zero, &value); + if (err) { + close(inner_fd); + ctx->stop = true; + return ERR_TO_RETVAL(3, err); + } + close(inner_fd); + + if (value != key) { + ctx->stop = true; + return ERR_TO_RETVAL(4, -EINVAL); + } + } + } + + return NULL; +} + +static void *htab_update_fn(void *arg) +{ + struct htab_op_ctx *ctx = arg; + int i = 0; + + while (i++ < ctx->loop && !ctx->stop) { + unsigned int j; + + for (j = 0; j < ctx->entries; j++) { + unsigned int key = j, zero = 0; + int inner_fd, err; + + inner_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, 4, 4, 1, NULL); + if (inner_fd < 0) { + ctx->stop = true; + return ERR_TO_RETVAL(1, inner_fd); + } + + err = bpf_map_update_elem(inner_fd, &zero, &key, 0); + if (err) { + close(inner_fd); + ctx->stop = true; + return ERR_TO_RETVAL(2, err); + } + + err = bpf_map_update_elem(ctx->fd, &key, &inner_fd, BPF_EXIST); + if (err) { + close(inner_fd); + ctx->stop = true; + return ERR_TO_RETVAL(3, err); + } + close(inner_fd); + } + } + + return NULL; +} + +static int setup_htab(int fd, unsigned int entries) +{ + unsigned int i; + + for (i = 0; i < entries; i++) { + unsigned int key = i, zero = 0; + int inner_fd, err; + + inner_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, 4, 4, 1, NULL); + if (!ASSERT_OK_FD(inner_fd, "new array")) + return -1; + + err = bpf_map_update_elem(inner_fd, &zero, &key, 0); + if (!ASSERT_OK(err, "init array")) { + close(inner_fd); + return -1; + } + + err = bpf_map_update_elem(fd, &key, &inner_fd, 0); + if (!ASSERT_OK(err, "init outer")) { + close(inner_fd); + return -1; + } + close(inner_fd); + } + + return 0; +} + +static int get_int_from_env(const char *name, int dft) +{ + const char *value; + + value = getenv(name); + if (!value) + return dft; + + return atoi(value); +} + +void test_fd_htab_lookup(void) +{ + unsigned int i, wr_nr = 8, rd_nr = 16; + pthread_t tids[wr_nr + rd_nr]; + struct fd_htab_lookup *skel; + struct htab_op_ctx ctx; + int err; + + skel = fd_htab_lookup__open_and_load(); + if (!ASSERT_OK_PTR(skel, "fd_htab_lookup__open_and_load")) + return; + + ctx.fd = bpf_map__fd(skel->maps.outer_map); + ctx.loop = get_int_from_env("FD_HTAB_LOOP_NR", 5); + ctx.stop = false; + ctx.entries = 8; + + err = setup_htab(ctx.fd, ctx.entries); + if (err) + goto destroy; + + memset(tids, 0, sizeof(tids)); + for (i = 0; i < wr_nr; i++) { + err = pthread_create(&tids[i], NULL, htab_update_fn, &ctx); + if (!ASSERT_OK(err, "pthread_create")) { + ctx.stop = true; + goto reap; + } + } + for (i = 0; i < rd_nr; i++) { + err = pthread_create(&tids[i + wr_nr], NULL, htab_lookup_fn, &ctx); + if (!ASSERT_OK(err, "pthread_create")) { + ctx.stop = true; + goto reap; + } + } + +reap: +
for (i = 0; i < wr_nr + rd_nr; i++) { + void *ret = NULL; + char desc[32]; + + if (!tids[i]) + continue; + + snprintf(desc, sizeof(desc), "thread %u", i + 1); + err = pthread_join(tids[i], &ret); + ASSERT_OK(err, desc); + ASSERT_EQ(ret, NULL, desc); + } +destroy: + fd_htab_lookup__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/fd_htab_lookup.c b/tools/testing/selftests/bpf/progs/fd_htab_lookup.c new file mode 100644 index 000000000000..a4a9e1db626f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/fd_htab_lookup.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025. Huawei Technologies Co., Ltd */ +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +struct inner_map_type { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(key_size, 4); + __uint(value_size, 4); + __uint(max_entries, 1); +} inner_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(max_entries, 64); + __type(key, int); + __type(value, int); + __array(values, struct inner_map_type); +} outer_map SEC(".maps") = { + .values = { + [0] = &inner_map, + }, +};
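For reference, one way to run the test with a larger loop count (assuming the standard test_progs runner; 50 is just an example value, and without the environment variable the default loop count of 5 is used):

    FD_HTAB_LOOP_NR=50 ./test_progs -t fd_htab_lookup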