From dd2ce6a5373c6f5c830be54be10775458a8bd312 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:01 +0200 Subject: [PATCH 1/6] bpf: Improve bpf_redirect_neigh helper description Follow-up to address David's feedback that we should better describe internals of the bpf_redirect_neigh() helper. Suggested-by: David Ahern Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: David Ahern Link: https://lore.kernel.org/bpf/20201010234006.7075-2-daniel@iogearbox.net --- include/uapi/linux/bpf.h | 10 +++++++--- tools/include/uapi/linux/bpf.h | 10 +++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 42d2df799397c..4272cc53d4786 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3679,10 +3679,14 @@ union bpf_attr { * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it - * fills in e.g. MAC addresses based on the L3 information from - * the packet. This helper is supported for IPv4 and IPv6 protocols. + * populates L2 addresses as well, meaning, internally, the helper + * performs a FIB lookup based on the skb's networking header to + * get the address of the next hop and then relies on the neighbor + * lookup for the L2 address of the nexthop. + * * The *flags* argument is reserved and must be 0. The helper is - * currently only supported for tc BPF program types. + * currently only supported for tc BPF program types, and enabled + * for IPv4 and IPv6 protocols. * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 42d2df799397c..4272cc53d4786 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3679,10 +3679,14 @@ union bpf_attr { * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it - * fills in e.g. MAC addresses based on the L3 information from - * the packet. This helper is supported for IPv4 and IPv6 protocols. + * populates L2 addresses as well, meaning, internally, the helper + * performs a FIB lookup based on the skb's networking header to + * get the address of the next hop and then relies on the neighbor + * lookup for the L2 address of the nexthop. + * * The *flags* argument is reserved and must be 0. The helper is - * currently only supported for tc BPF program types. + * currently only supported for tc BPF program types, and enabled + * for IPv4 and IPv6 protocols. * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. From 9aa1206e8f48222f35a0c809f33b2f4aaa1e2661 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:02 +0200 Subject: [PATCH 2/6] bpf: Add redirect_peer helper Add an efficient ingress to ingress netns switch that can be used out of tc BPF programs in order to redirect traffic from host ns ingress into a container veth device ingress without having to go via CPU backlog queue [0]. For local containers this can also be utilized and path via CPU backlog queue only needs to be taken once, not twice. On a high level this borrows from ipvlan which does similar switch in __netif_receive_skb_core() and then iterates via another_round. This helps to reduce latency for mentioned use cases. Pod to remote pod with redirect(), TCP_RR [1]: # percpu_netperf 10.217.1.33 RT_LATENCY: 122.450 (per CPU: 122.666 122.401 122.333 122.401 ) MEAN_LATENCY: 121.210 (per CPU: 121.100 121.260 121.320 121.160 ) STDDEV_LATENCY: 120.040 (per CPU: 119.420 119.910 125.460 115.370 ) MIN_LATENCY: 46.500 (per CPU: 47.000 47.000 47.000 45.000 ) P50_LATENCY: 118.500 (per CPU: 118.000 119.000 118.000 119.000 ) P90_LATENCY: 127.500 (per CPU: 127.000 128.000 127.000 128.000 ) P99_LATENCY: 130.750 (per CPU: 131.000 131.000 129.000 132.000 ) TRANSACTION_RATE: 32666.400 (per CPU: 8152.200 8169.842 8174.439 8169.897 ) Pod to remote pod with redirect_peer(), TCP_RR: # percpu_netperf 10.217.1.33 RT_LATENCY: 44.449 (per CPU: 43.767 43.127 45.279 45.622 ) MEAN_LATENCY: 45.065 (per CPU: 44.030 45.530 45.190 45.510 ) STDDEV_LATENCY: 84.823 (per CPU: 66.770 97.290 84.380 90.850 ) MIN_LATENCY: 33.500 (per CPU: 33.000 33.000 34.000 34.000 ) P50_LATENCY: 43.250 (per CPU: 43.000 43.000 43.000 44.000 ) P90_LATENCY: 46.750 (per CPU: 46.000 47.000 47.000 47.000 ) P99_LATENCY: 52.750 (per CPU: 51.000 54.000 53.000 53.000 ) TRANSACTION_RATE: 90039.500 (per CPU: 22848.186 23187.089 22085.077 21919.130 ) [0] https://linuxplumbersconf.org/event/7/contributions/674/attachments/568/1002/plumbers_2020_cilium_load_balancer.pdf [1] https://github.com/borkmann/netperf_scripts/blob/master/percpu_netperf Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201010234006.7075-3-daniel@iogearbox.net --- drivers/net/veth.c | 9 ++++++ include/linux/netdevice.h | 4 +++ include/uapi/linux/bpf.h | 17 +++++++++++ net/core/dev.c | 15 ++++++++-- net/core/filter.c | 54 +++++++++++++++++++++++++++++----- tools/include/uapi/linux/bpf.h | 17 +++++++++++ 6 files changed, 106 insertions(+), 10 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 091e5b4ba0423..8c737668008a0 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -420,6 +420,14 @@ static int veth_select_rxq(struct net_device *dev) return smp_processor_id() % dev->real_num_rx_queues; } +static struct net_device *veth_peer_dev(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + + /* Callers must be under RCU read side. */ + return rcu_dereference(priv->peer); +} + static int veth_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags, bool ndo_xmit) @@ -1224,6 +1232,7 @@ static const struct net_device_ops veth_netdev_ops = { .ndo_set_rx_headroom = veth_set_rx_headroom, .ndo_bpf = veth_xdp, .ndo_xdp_xmit = veth_ndo_xdp_xmit, + .ndo_get_peer_dev = veth_peer_dev, }; #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 28cfa53daf72e..0533f86018dd9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1277,6 +1277,9 @@ struct netdev_net_notifier { * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, * int cmd); * Add, change, delete or get information on an IPv4 tunnel. + * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev); + * If a device is paired with a peer device, return the peer instance. + * The caller must be under RCU read context. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1484,6 +1487,7 @@ struct net_device_ops { struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev); int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); + struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); }; /** diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4272cc53d4786..b97bc5abb3b89 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3719,6 +3719,22 @@ union bpf_attr { * never return NULL. * Return * A pointer pointing to the kernel percpu variable on this cpu. + * + * long bpf_redirect_peer(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_redirect**\ (), except + * that the redirection happens to the *ifindex*' peer device and + * the netns switch takes place from ingress to ingress without + * going through the CPU's backlog queue. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types at the ingress + * hook and for veth device types. The peer device must reside in a + * different network namespace. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3876,6 +3892,7 @@ union bpf_attr { FN(redirect_neigh), \ FN(bpf_per_cpu_ptr), \ FN(bpf_this_cpu_ptr), \ + FN(redirect_peer), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/net/core/dev.c b/net/core/dev.c index 9d55bf5d1a658..7dd015823593d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4930,7 +4930,7 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); static inline struct sk_buff * sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, - struct net_device *orig_dev) + struct net_device *orig_dev, bool *another) { #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); @@ -4974,7 +4974,11 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, * redirecting to another netdev */ __skb_push(skb, skb->mac_len); - skb_do_redirect(skb); + if (skb_do_redirect(skb) == -EAGAIN) { + __skb_pull(skb, skb->mac_len); + *another = true; + break; + } return NULL; case TC_ACT_CONSUMED: return NULL; @@ -5163,7 +5167,12 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, skip_taps: #ifdef CONFIG_NET_INGRESS if (static_branch_unlikely(&ingress_needed_key)) { - skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev); + bool another = false; + + skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, + &another); + if (another) + goto another_round; if (!skb) goto out; diff --git a/net/core/filter.c b/net/core/filter.c index 5da44b11e1ecb..fab951c6be573 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2380,8 +2380,9 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) /* Internal, non-exposed redirect flags. */ enum { - BPF_F_NEIGH = (1ULL << 1), -#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH) + BPF_F_NEIGH = (1ULL << 1), + BPF_F_PEER = (1ULL << 2), +#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER) }; BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) @@ -2430,19 +2431,35 @@ EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); int skb_do_redirect(struct sk_buff *skb) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct net *net = dev_net(skb->dev); struct net_device *dev; u32 flags = ri->flags; - dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index); + dev = dev_get_by_index_rcu(net, ri->tgt_index); ri->tgt_index = 0; - if (unlikely(!dev)) { - kfree_skb(skb); - return -EINVAL; + ri->flags = 0; + if (unlikely(!dev)) + goto out_drop; + if (flags & BPF_F_PEER) { + const struct net_device_ops *ops = dev->netdev_ops; + + if (unlikely(!ops->ndo_get_peer_dev || + !skb_at_tc_ingress(skb))) + goto out_drop; + dev = ops->ndo_get_peer_dev(dev); + if (unlikely(!dev || + !is_skb_forwardable(dev, skb) || + net_eq(net, dev_net(dev)))) + goto out_drop; + skb->dev = dev; + return -EAGAIN; } - return flags & BPF_F_NEIGH ? __bpf_redirect_neigh(skb, dev) : __bpf_redirect(skb, dev, flags); +out_drop: + kfree_skb(skb); + return -EINVAL; } BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) @@ -2466,6 +2483,27 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + if (unlikely(flags)) + return TC_ACT_SHOT; + + ri->flags = BPF_F_PEER; + ri->tgt_index = ifindex; + + return TC_ACT_REDIRECT; +} + +static const struct bpf_func_proto bpf_redirect_peer_proto = { + .func = bpf_redirect_peer, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); @@ -7053,6 +7091,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_redirect_proto; case BPF_FUNC_redirect_neigh: return &bpf_redirect_neigh_proto; + case BPF_FUNC_redirect_peer: + return &bpf_redirect_peer_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4272cc53d4786..b97bc5abb3b89 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3719,6 +3719,22 @@ union bpf_attr { * never return NULL. * Return * A pointer pointing to the kernel percpu variable on this cpu. + * + * long bpf_redirect_peer(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_redirect**\ (), except + * that the redirection happens to the *ifindex*' peer device and + * the netns switch takes place from ingress to ingress without + * going through the CPU's backlog queue. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types at the ingress + * hook and for veth device types. The peer device must reside in a + * different network namespace. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3876,6 +3892,7 @@ union bpf_attr { FN(redirect_neigh), \ FN(bpf_per_cpu_ptr), \ FN(bpf_this_cpu_ptr), \ + FN(redirect_peer), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper From 4a8f87e60f6db40e640f1db555d063b2c4dea5f1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:03 +0200 Subject: [PATCH 3/6] bpf: Allow for map-in-map with dynamic inner array map entries Recent work in f4d05259213f ("bpf: Add map_meta_equal map ops") and 134fede4eecf ("bpf: Relax max_entries check for most of the inner map types") added support for dynamic inner max elements for most map-in-map types. Exceptions were maps like array or prog array where the map_gen_lookup() callback uses the maps' max_entries field as a constant when emitting instructions. We recently implemented Maglev consistent hashing into Cilium's load balancer which uses map-in-map with an outer map being hash and inner being array holding the Maglev backend table for each service. This has been designed this way in order to reduce overall memory consumption given the outer hash map allows to avoid preallocating a large, flat memory area for all services. Also, the number of service mappings is not always known a-priori. The use case for dynamic inner array map entries is to further reduce memory overhead, for example, some services might just have a small number of back ends while others could have a large number. Right now the Maglev backend table for small and large number of backends would need to have the same inner array map entries which adds a lot of unneeded overhead. Dynamic inner array map entries can be realized by avoiding the inlined code generation for their lookup. The lookup will still be efficient since it will be calling into array_map_lookup_elem() directly and thus avoiding retpoline. The patch adds a BPF_F_INNER_MAP flag to map creation which therefore skips inline code generation and relaxes array_map_meta_equal() check to ignore both maps' max_entries. This also still allows to have faster lookups for map-in-map when BPF_F_INNER_MAP is not specified and hence dynamic max_entries not needed. Example code generation where inner map is dynamic sized array: # bpftool p d x i 125 int handle__sys_enter(void * ctx): ; int handle__sys_enter(void *ctx) 0: (b4) w1 = 0 ; int key = 0; 1: (63) *(u32 *)(r10 -4) = r1 2: (bf) r2 = r10 ; 3: (07) r2 += -4 ; inner_map = bpf_map_lookup_elem(&outer_arr_dyn, &key); 4: (18) r1 = map[id:468] 6: (07) r1 += 272 7: (61) r0 = *(u32 *)(r2 +0) 8: (35) if r0 >= 0x3 goto pc+5 9: (67) r0 <<= 3 10: (0f) r0 += r1 11: (79) r0 = *(u64 *)(r0 +0) 12: (15) if r0 == 0x0 goto pc+1 13: (05) goto pc+1 14: (b7) r0 = 0 15: (b4) w6 = -1 ; if (!inner_map) 16: (15) if r0 == 0x0 goto pc+6 17: (bf) r2 = r10 ; 18: (07) r2 += -4 ; val = bpf_map_lookup_elem(inner_map, &key); 19: (bf) r1 = r0 | No inlining but instead 20: (85) call array_map_lookup_elem#149280 | call to array_map_lookup_elem() ; return val ? *val : -1; | for inner array lookup. 21: (15) if r0 == 0x0 goto pc+1 ; return val ? *val : -1; 22: (61) r6 = *(u32 *)(r0 +0) ; } 23: (bc) w0 = w6 24: (95) exit Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201010234006.7075-4-daniel@iogearbox.net --- include/linux/bpf.h | 2 +- include/uapi/linux/bpf.h | 3 +++ kernel/bpf/arraymap.c | 17 +++++++++++------ kernel/bpf/hashtab.c | 6 +++--- kernel/bpf/verifier.c | 6 ++++-- net/xdp/xskmap.c | 2 +- tools/include/uapi/linux/bpf.h | 3 +++ 7 files changed, 26 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dc63eeed4fd95..2b16bf48aab61 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -82,7 +82,7 @@ struct bpf_map_ops { void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); void (*map_fd_put_ptr)(void *ptr); - u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); + int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b97bc5abb3b89..bf5a99d803e48 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -435,6 +435,9 @@ enum { /* Share perf_event among processes */ BPF_F_PRESERVE_ELEMS = (1U << 11), + +/* Create a map that is suitable to be an inner map with dynamic max entries */ + BPF_F_INNER_MAP = (1U << 12), }; /* Flags for BPF_PROG_QUERY. */ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index bd777dd6f9677..c6c81eceb68fb 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -16,7 +16,7 @@ #define ARRAY_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \ - BPF_F_PRESERVE_ELEMS) + BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -62,7 +62,7 @@ int array_map_alloc_check(union bpf_attr *attr) return -EINVAL; if (attr->map_type != BPF_MAP_TYPE_ARRAY && - attr->map_flags & BPF_F_MMAPABLE) + attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP)) return -EINVAL; if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY && @@ -214,7 +214,7 @@ static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm, } /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ -static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; @@ -223,6 +223,9 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; + if (map->map_flags & BPF_F_INNER_MAP) + return -EOPNOTSUPP; + *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); if (!map->bypass_spec_v1) { @@ -496,8 +499,10 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) static bool array_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1) { - return meta0->max_entries == meta1->max_entries && - bpf_map_meta_equal(meta0, meta1); + if (!bpf_map_meta_equal(meta0, meta1)) + return false; + return meta0->map_flags & BPF_F_INNER_MAP ? true : + meta0->max_entries == meta1->max_entries; } struct bpf_iter_seq_array_map_info { @@ -1251,7 +1256,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) return READ_ONCE(*inner_map); } -static u32 array_of_map_gen_lookup(struct bpf_map *map, +static int array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3395cf140d22b..1815e97d4c9c1 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -612,7 +612,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) * bpf_prog * __htab_map_lookup_elem */ -static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; @@ -651,7 +651,7 @@ static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) return __htab_lru_map_lookup_elem(map, key, false); } -static u32 htab_lru_map_gen_lookup(struct bpf_map *map, +static int htab_lru_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; @@ -2070,7 +2070,7 @@ static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) return READ_ONCE(*inner_map); } -static u32 htab_of_map_gen_lookup(struct bpf_map *map, +static int htab_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f3e36eade3d42..fa5badc9279a2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11049,7 +11049,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) if (insn->imm == BPF_FUNC_map_lookup_elem && ops->map_gen_lookup) { cnt = ops->map_gen_lookup(map_ptr, insn_buf); - if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + if (cnt == -EOPNOTSUPP) + goto patch_map_ops_generic; + if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -11079,7 +11081,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) (int (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_peek_elem, (int (*)(struct bpf_map *map, void *value))NULL)); - +patch_map_ops_generic: switch (insn->imm) { case BPF_FUNC_map_lookup_elem: insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) - diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 0c5df593bc567..49da2b8ace8b7 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -132,7 +132,7 @@ static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; struct bpf_insn *insn = insn_buf; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b97bc5abb3b89..bf5a99d803e48 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -435,6 +435,9 @@ enum { /* Share perf_event among processes */ BPF_F_PRESERVE_ELEMS = (1U << 11), + +/* Create a map that is suitable to be an inner map with dynamic max entries */ + BPF_F_INNER_MAP = (1U << 12), }; /* Flags for BPF_PROG_QUERY. */ From 6775dab73bdc17df0f19d9f9d7470dc54536a909 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:04 +0200 Subject: [PATCH 4/6] bpf, selftests: Add test for different array inner map size Extend the "diff_size" subtest to also include a non-inlined array map variant where dynamic inner #elems are possible. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201010234006.7075-5-daniel@iogearbox.net --- .../selftests/bpf/prog_tests/btf_map_in_map.c | 39 ++++++++++++----- .../selftests/bpf/progs/test_btf_map_in_map.c | 43 +++++++++++++++++++ 2 files changed, 72 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c index 540fea4c91a5c..76ebe4c250f11 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c @@ -55,10 +55,10 @@ static int kern_sync_rcu(void) static void test_lookup_update(void) { - int err, key = 0, val, i; + int map1_fd, map2_fd, map3_fd, map4_fd, map5_fd, map1_id, map2_id; + int outer_arr_fd, outer_hash_fd, outer_arr_dyn_fd; struct test_btf_map_in_map *skel; - int outer_arr_fd, outer_hash_fd; - int fd, map1_fd, map2_fd, map1_id, map2_id; + int err, key = 0, val, i, fd; skel = test_btf_map_in_map__open_and_load(); if (CHECK(!skel, "skel_open", "failed to open&load skeleton\n")) @@ -70,32 +70,45 @@ static void test_lookup_update(void) map1_fd = bpf_map__fd(skel->maps.inner_map1); map2_fd = bpf_map__fd(skel->maps.inner_map2); + map3_fd = bpf_map__fd(skel->maps.inner_map3); + map4_fd = bpf_map__fd(skel->maps.inner_map4); + map5_fd = bpf_map__fd(skel->maps.inner_map5); + outer_arr_dyn_fd = bpf_map__fd(skel->maps.outer_arr_dyn); outer_arr_fd = bpf_map__fd(skel->maps.outer_arr); outer_hash_fd = bpf_map__fd(skel->maps.outer_hash); - /* inner1 = input, inner2 = input + 1 */ - map1_fd = bpf_map__fd(skel->maps.inner_map1); + /* inner1 = input, inner2 = input + 1, inner3 = input + 2 */ bpf_map_update_elem(outer_arr_fd, &key, &map1_fd, 0); - map2_fd = bpf_map__fd(skel->maps.inner_map2); bpf_map_update_elem(outer_hash_fd, &key, &map2_fd, 0); + bpf_map_update_elem(outer_arr_dyn_fd, &key, &map3_fd, 0); skel->bss->input = 1; usleep(1); - bpf_map_lookup_elem(map1_fd, &key, &val); CHECK(val != 1, "inner1", "got %d != exp %d\n", val, 1); bpf_map_lookup_elem(map2_fd, &key, &val); CHECK(val != 2, "inner2", "got %d != exp %d\n", val, 2); + bpf_map_lookup_elem(map3_fd, &key, &val); + CHECK(val != 3, "inner3", "got %d != exp %d\n", val, 3); - /* inner1 = input + 1, inner2 = input */ + /* inner2 = input, inner1 = input + 1, inner4 = input + 2 */ bpf_map_update_elem(outer_arr_fd, &key, &map2_fd, 0); bpf_map_update_elem(outer_hash_fd, &key, &map1_fd, 0); + bpf_map_update_elem(outer_arr_dyn_fd, &key, &map4_fd, 0); skel->bss->input = 3; usleep(1); - bpf_map_lookup_elem(map1_fd, &key, &val); CHECK(val != 4, "inner1", "got %d != exp %d\n", val, 4); bpf_map_lookup_elem(map2_fd, &key, &val); CHECK(val != 3, "inner2", "got %d != exp %d\n", val, 3); + bpf_map_lookup_elem(map4_fd, &key, &val); + CHECK(val != 5, "inner4", "got %d != exp %d\n", val, 5); + + /* inner5 = input + 2 */ + bpf_map_update_elem(outer_arr_dyn_fd, &key, &map5_fd, 0); + skel->bss->input = 5; + usleep(1); + bpf_map_lookup_elem(map5_fd, &key, &val); + CHECK(val != 7, "inner5", "got %d != exp %d\n", val, 7); for (i = 0; i < 5; i++) { val = i % 2 ? map1_fd : map2_fd; @@ -106,7 +119,13 @@ static void test_lookup_update(void) } err = bpf_map_update_elem(outer_arr_fd, &key, &val, 0); if (CHECK_FAIL(err)) { - printf("failed to update hash_of_maps on iter #%d\n", i); + printf("failed to update array_of_maps on iter #%d\n", i); + goto cleanup; + } + val = i % 2 ? map4_fd : map5_fd; + err = bpf_map_update_elem(outer_arr_dyn_fd, &key, &val, 0); + if (CHECK_FAIL(err)) { + printf("failed to update array_of_maps (dyn) on iter #%d\n", i); goto cleanup; } } diff --git a/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c b/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c index 193fe0198b21a..c1e0c8c7c55ff 100644 --- a/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c +++ b/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c @@ -41,6 +41,43 @@ struct outer_arr { .values = { (void *)&inner_map1, 0, (void *)&inner_map2 }, }; +struct inner_map_sz3 { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(map_flags, BPF_F_INNER_MAP); + __uint(max_entries, 3); + __type(key, int); + __type(value, int); +} inner_map3 SEC(".maps"), + inner_map4 SEC(".maps"); + +struct inner_map_sz4 { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(map_flags, BPF_F_INNER_MAP); + __uint(max_entries, 5); + __type(key, int); + __type(value, int); +} inner_map5 SEC(".maps"); + +struct outer_arr_dyn { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 3); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __array(values, struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(map_flags, BPF_F_INNER_MAP); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); + }); +} outer_arr_dyn SEC(".maps") = { + .values = { + [0] = (void *)&inner_map3, + [1] = (void *)&inner_map4, + [2] = (void *)&inner_map5, + }, +}; + struct outer_hash { __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); __uint(max_entries, 5); @@ -101,6 +138,12 @@ int handle__sys_enter(void *ctx) val = input + 1; bpf_map_update_elem(inner_map, &key, &val, 0); + inner_map = bpf_map_lookup_elem(&outer_arr_dyn, &key); + if (!inner_map) + return 1; + val = input + 2; + bpf_map_update_elem(inner_map, &key, &val, 0); + return 0; } From 57a73fe7c1988ae726de23ee7c3c49ca82221751 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:05 +0200 Subject: [PATCH 5/6] bpf, selftests: Make redirect_neigh test more extensible Rename into test_tc_redirect.sh and move setup and test code into separate functions so they can be reused for newly added tests in here. Also remove the crude hack to override ifindex inside the object file via xxd and sed and just use a simple map instead. Map given iproute2 does not support BTF fully and therefore neither global data at this point. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201010234006.7075-6-daniel@iogearbox.net --- .../selftests/bpf/progs/test_tc_neigh.c | 40 ++-- tools/testing/selftests/bpf/test_tc_neigh.sh | 168 --------------- .../testing/selftests/bpf/test_tc_redirect.sh | 197 ++++++++++++++++++ 3 files changed, 219 insertions(+), 186 deletions(-) delete mode 100755 tools/testing/selftests/bpf/test_tc_neigh.sh create mode 100755 tools/testing/selftests/bpf/test_tc_redirect.sh diff --git a/tools/testing/selftests/bpf/progs/test_tc_neigh.c b/tools/testing/selftests/bpf/progs/test_tc_neigh.c index 889a72c3024fd..fe182616b112a 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_neigh.c +++ b/tools/testing/selftests/bpf/progs/test_tc_neigh.c @@ -13,17 +13,10 @@ #include #include -#ifndef barrier_data -# define barrier_data(ptr) asm volatile("": :"r"(ptr) :"memory") -#endif - #ifndef ctx_ptr # define ctx_ptr(field) (void *)(long)(field) #endif -#define dst_to_src_tmp 0xeeddddeeU -#define src_to_dst_tmp 0xeeffffeeU - #define ip4_src 0xac100164 /* 172.16.1.100 */ #define ip4_dst 0xac100264 /* 172.16.2.100 */ @@ -39,6 +32,18 @@ a.s6_addr32[3] == b.s6_addr32[3]) #endif +enum { + dev_src, + dev_dst, +}; + +struct bpf_map_def SEC("maps") ifindex_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 2, +}; + static __always_inline bool is_remote_ep_v4(struct __sk_buff *skb, __be32 addr) { @@ -73,7 +78,14 @@ static __always_inline bool is_remote_ep_v6(struct __sk_buff *skb, return v6_equal(ip6h->daddr, addr); } -SEC("chk_neigh") int tc_chk(struct __sk_buff *skb) +static __always_inline int get_dev_ifindex(int which) +{ + int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which); + + return ifindex ? *ifindex : 0; +} + +SEC("chk_egress") int tc_chk(struct __sk_buff *skb) { void *data_end = ctx_ptr(skb->data_end); void *data = ctx_ptr(skb->data); @@ -87,7 +99,6 @@ SEC("chk_neigh") int tc_chk(struct __sk_buff *skb) SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) { - int idx = dst_to_src_tmp; __u8 zero[ETH_ALEN * 2]; bool redirect = false; @@ -103,19 +114,15 @@ SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) if (!redirect) return TC_ACT_OK; - barrier_data(&idx); - idx = bpf_ntohl(idx); - __builtin_memset(&zero, 0, sizeof(zero)); if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0) return TC_ACT_SHOT; - return bpf_redirect_neigh(idx, 0); + return bpf_redirect_neigh(get_dev_ifindex(dev_src), 0); } SEC("src_ingress") int tc_src(struct __sk_buff *skb) { - int idx = src_to_dst_tmp; __u8 zero[ETH_ALEN * 2]; bool redirect = false; @@ -131,14 +138,11 @@ SEC("src_ingress") int tc_src(struct __sk_buff *skb) if (!redirect) return TC_ACT_OK; - barrier_data(&idx); - idx = bpf_ntohl(idx); - __builtin_memset(&zero, 0, sizeof(zero)); if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0) return TC_ACT_SHOT; - return bpf_redirect_neigh(idx, 0); + return bpf_redirect_neigh(get_dev_ifindex(dev_dst), 0); } char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_tc_neigh.sh b/tools/testing/selftests/bpf/test_tc_neigh.sh deleted file mode 100755 index 31d8c3df8b242..0000000000000 --- a/tools/testing/selftests/bpf/test_tc_neigh.sh +++ /dev/null @@ -1,168 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link -# between src and dst. The netns fwd has veth links to each src and dst. The -# client is in src and server in dst. The test installs a TC BPF program to each -# host facing veth in fwd which calls into bpf_redirect_peer() to perform the -# neigh addr population and redirect; it also installs a dropper prog on the -# egress side to drop skbs if neigh addrs were not populated. - -if [[ $EUID -ne 0 ]]; then - echo "This script must be run as root" - echo "FAIL" - exit 1 -fi - -# check that nc, dd, ping, ping6 and timeout are present -command -v nc >/dev/null 2>&1 || \ - { echo >&2 "nc is not available"; exit 1; } -command -v dd >/dev/null 2>&1 || \ - { echo >&2 "dd is not available"; exit 1; } -command -v timeout >/dev/null 2>&1 || \ - { echo >&2 "timeout is not available"; exit 1; } -command -v ping >/dev/null 2>&1 || \ - { echo >&2 "ping is not available"; exit 1; } -command -v ping6 >/dev/null 2>&1 || \ - { echo >&2 "ping6 is not available"; exit 1; } - -readonly GREEN='\033[0;92m' -readonly RED='\033[0;31m' -readonly NC='\033[0m' # No Color - -readonly PING_ARG="-c 3 -w 10 -q" - -readonly TIMEOUT=10 - -readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)" -readonly NS_FWD="ns-fwd-$(mktemp -u XXXXXX)" -readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)" - -readonly IP4_SRC="172.16.1.100" -readonly IP4_DST="172.16.2.100" - -readonly IP6_SRC="::1:dead:beef:cafe" -readonly IP6_DST="::2:dead:beef:cafe" - -readonly IP4_SLL="169.254.0.1" -readonly IP4_DLL="169.254.0.2" -readonly IP4_NET="169.254.0.0" - -cleanup() -{ - ip netns del ${NS_SRC} - ip netns del ${NS_FWD} - ip netns del ${NS_DST} -} - -trap cleanup EXIT - -set -e - -ip netns add "${NS_SRC}" -ip netns add "${NS_FWD}" -ip netns add "${NS_DST}" - -ip link add veth_src type veth peer name veth_src_fwd -ip link add veth_dst type veth peer name veth_dst_fwd - -ip link set veth_src netns ${NS_SRC} -ip link set veth_src_fwd netns ${NS_FWD} - -ip link set veth_dst netns ${NS_DST} -ip link set veth_dst_fwd netns ${NS_FWD} - -ip -netns ${NS_SRC} addr add ${IP4_SRC}/32 dev veth_src -ip -netns ${NS_DST} addr add ${IP4_DST}/32 dev veth_dst - -# The fwd netns automatically get a v6 LL address / routes, but also needs v4 -# one in order to start ARP probing. IP4_NET route is added to the endpoints -# so that the ARP processing will reply. - -ip -netns ${NS_FWD} addr add ${IP4_SLL}/32 dev veth_src_fwd -ip -netns ${NS_FWD} addr add ${IP4_DLL}/32 dev veth_dst_fwd - -ip -netns ${NS_SRC} addr add ${IP6_SRC}/128 dev veth_src nodad -ip -netns ${NS_DST} addr add ${IP6_DST}/128 dev veth_dst nodad - -ip -netns ${NS_SRC} link set dev veth_src up -ip -netns ${NS_FWD} link set dev veth_src_fwd up - -ip -netns ${NS_DST} link set dev veth_dst up -ip -netns ${NS_FWD} link set dev veth_dst_fwd up - -ip -netns ${NS_SRC} route add ${IP4_DST}/32 dev veth_src scope global -ip -netns ${NS_SRC} route add ${IP4_NET}/16 dev veth_src scope global -ip -netns ${NS_FWD} route add ${IP4_SRC}/32 dev veth_src_fwd scope global - -ip -netns ${NS_SRC} route add ${IP6_DST}/128 dev veth_src scope global -ip -netns ${NS_FWD} route add ${IP6_SRC}/128 dev veth_src_fwd scope global - -ip -netns ${NS_DST} route add ${IP4_SRC}/32 dev veth_dst scope global -ip -netns ${NS_DST} route add ${IP4_NET}/16 dev veth_dst scope global -ip -netns ${NS_FWD} route add ${IP4_DST}/32 dev veth_dst_fwd scope global - -ip -netns ${NS_DST} route add ${IP6_SRC}/128 dev veth_dst scope global -ip -netns ${NS_FWD} route add ${IP6_DST}/128 dev veth_dst_fwd scope global - -fmac_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/address) -fmac_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/address) - -ip -netns ${NS_SRC} neigh add ${IP4_DST} dev veth_src lladdr $fmac_src -ip -netns ${NS_DST} neigh add ${IP4_SRC} dev veth_dst lladdr $fmac_dst - -ip -netns ${NS_SRC} neigh add ${IP6_DST} dev veth_src lladdr $fmac_src -ip -netns ${NS_DST} neigh add ${IP6_SRC} dev veth_dst lladdr $fmac_dst - -veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex | awk '{printf "%08x\n", $1}') -veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex | awk '{printf "%08x\n", $1}') - -xxd -p < test_tc_neigh.o | sed "s/eeddddee/$veth_src/g" | xxd -r -p > test_tc_neigh.x.o -xxd -p < test_tc_neigh.x.o | sed "s/eeffffee/$veth_dst/g" | xxd -r -p > test_tc_neigh.y.o - -ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact -ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj test_tc_neigh.y.o sec src_ingress -ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj test_tc_neigh.y.o sec chk_neigh - -ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact -ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj test_tc_neigh.y.o sec dst_ingress -ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj test_tc_neigh.y.o sec chk_neigh - -rm -f test_tc_neigh.x.o test_tc_neigh.y.o - -ip netns exec ${NS_DST} bash -c "nc -4 -l -p 9004 &" -ip netns exec ${NS_DST} bash -c "nc -6 -l -p 9006 &" - -set +e - -TEST="TCPv4 connectivity test" -ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP4_DST}/9004" -if [ $? -ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 -fi -echo -e "${TEST}: ${GREEN}PASS${NC}" - -TEST="TCPv6 connectivity test" -ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP6_DST}/9006" -if [ $? -ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 -fi -echo -e "${TEST}: ${GREEN}PASS${NC}" - -TEST="ICMPv4 connectivity test" -ip netns exec ${NS_SRC} ping $PING_ARG ${IP4_DST} -if [ $? -ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 -fi -echo -e "${TEST}: ${GREEN}PASS${NC}" - -TEST="ICMPv6 connectivity test" -ip netns exec ${NS_SRC} ping6 $PING_ARG ${IP6_DST} -if [ $? -ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 -fi -echo -e "${TEST}: ${GREEN}PASS${NC}" diff --git a/tools/testing/selftests/bpf/test_tc_redirect.sh b/tools/testing/selftests/bpf/test_tc_redirect.sh new file mode 100755 index 0000000000000..6ad4414051323 --- /dev/null +++ b/tools/testing/selftests/bpf/test_tc_redirect.sh @@ -0,0 +1,197 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link +# between src and dst. The netns fwd has veth links to each src and dst. The +# client is in src and server in dst. The test installs a TC BPF program to each +# host facing veth in fwd which calls into bpf_redirect_peer() to perform the +# neigh addr population and redirect; it also installs a dropper prog on the +# egress side to drop skbs if neigh addrs were not populated. + +if [[ $EUID -ne 0 ]]; then + echo "This script must be run as root" + echo "FAIL" + exit 1 +fi + +# check that needed tools are present +command -v nc >/dev/null 2>&1 || \ + { echo >&2 "nc is not available"; exit 1; } +command -v dd >/dev/null 2>&1 || \ + { echo >&2 "dd is not available"; exit 1; } +command -v timeout >/dev/null 2>&1 || \ + { echo >&2 "timeout is not available"; exit 1; } +command -v ping >/dev/null 2>&1 || \ + { echo >&2 "ping is not available"; exit 1; } +command -v ping6 >/dev/null 2>&1 || \ + { echo >&2 "ping6 is not available"; exit 1; } +command -v perl >/dev/null 2>&1 || \ + { echo >&2 "perl is not available"; exit 1; } +command -v jq >/dev/null 2>&1 || \ + { echo >&2 "jq is not available"; exit 1; } +command -v bpftool >/dev/null 2>&1 || \ + { echo >&2 "bpftool is not available"; exit 1; } + +readonly GREEN='\033[0;92m' +readonly RED='\033[0;31m' +readonly NC='\033[0m' # No Color + +readonly PING_ARG="-c 3 -w 10 -q" + +readonly TIMEOUT=10 + +readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)" +readonly NS_FWD="ns-fwd-$(mktemp -u XXXXXX)" +readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)" + +readonly IP4_SRC="172.16.1.100" +readonly IP4_DST="172.16.2.100" + +readonly IP6_SRC="::1:dead:beef:cafe" +readonly IP6_DST="::2:dead:beef:cafe" + +readonly IP4_SLL="169.254.0.1" +readonly IP4_DLL="169.254.0.2" +readonly IP4_NET="169.254.0.0" + +netns_cleanup() +{ + ip netns del ${NS_SRC} + ip netns del ${NS_FWD} + ip netns del ${NS_DST} +} + +netns_setup() +{ + ip netns add "${NS_SRC}" + ip netns add "${NS_FWD}" + ip netns add "${NS_DST}" + + ip link add veth_src type veth peer name veth_src_fwd + ip link add veth_dst type veth peer name veth_dst_fwd + + ip link set veth_src netns ${NS_SRC} + ip link set veth_src_fwd netns ${NS_FWD} + + ip link set veth_dst netns ${NS_DST} + ip link set veth_dst_fwd netns ${NS_FWD} + + ip -netns ${NS_SRC} addr add ${IP4_SRC}/32 dev veth_src + ip -netns ${NS_DST} addr add ${IP4_DST}/32 dev veth_dst + + # The fwd netns automatically get a v6 LL address / routes, but also + # needs v4 one in order to start ARP probing. IP4_NET route is added + # to the endpoints so that the ARP processing will reply. + + ip -netns ${NS_FWD} addr add ${IP4_SLL}/32 dev veth_src_fwd + ip -netns ${NS_FWD} addr add ${IP4_DLL}/32 dev veth_dst_fwd + + ip -netns ${NS_SRC} addr add ${IP6_SRC}/128 dev veth_src nodad + ip -netns ${NS_DST} addr add ${IP6_DST}/128 dev veth_dst nodad + + ip -netns ${NS_SRC} link set dev veth_src up + ip -netns ${NS_FWD} link set dev veth_src_fwd up + + ip -netns ${NS_DST} link set dev veth_dst up + ip -netns ${NS_FWD} link set dev veth_dst_fwd up + + ip -netns ${NS_SRC} route add ${IP4_DST}/32 dev veth_src scope global + ip -netns ${NS_SRC} route add ${IP4_NET}/16 dev veth_src scope global + ip -netns ${NS_FWD} route add ${IP4_SRC}/32 dev veth_src_fwd scope global + + ip -netns ${NS_SRC} route add ${IP6_DST}/128 dev veth_src scope global + ip -netns ${NS_FWD} route add ${IP6_SRC}/128 dev veth_src_fwd scope global + + ip -netns ${NS_DST} route add ${IP4_SRC}/32 dev veth_dst scope global + ip -netns ${NS_DST} route add ${IP4_NET}/16 dev veth_dst scope global + ip -netns ${NS_FWD} route add ${IP4_DST}/32 dev veth_dst_fwd scope global + + ip -netns ${NS_DST} route add ${IP6_SRC}/128 dev veth_dst scope global + ip -netns ${NS_FWD} route add ${IP6_DST}/128 dev veth_dst_fwd scope global + + fmac_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/address) + fmac_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/address) + + ip -netns ${NS_SRC} neigh add ${IP4_DST} dev veth_src lladdr $fmac_src + ip -netns ${NS_DST} neigh add ${IP4_SRC} dev veth_dst lladdr $fmac_dst + + ip -netns ${NS_SRC} neigh add ${IP6_DST} dev veth_src lladdr $fmac_src + ip -netns ${NS_DST} neigh add ${IP6_SRC} dev veth_dst lladdr $fmac_dst +} + +netns_test_connectivity() +{ + set +e + + ip netns exec ${NS_DST} bash -c "nc -4 -l -p 9004 &" + ip netns exec ${NS_DST} bash -c "nc -6 -l -p 9006 &" + + TEST="TCPv4 connectivity test" + ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP4_DST}/9004" + if [ $? -ne 0 ]; then + echo -e "${TEST}: ${RED}FAIL${NC}" + exit 1 + fi + echo -e "${TEST}: ${GREEN}PASS${NC}" + + TEST="TCPv6 connectivity test" + ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP6_DST}/9006" + if [ $? -ne 0 ]; then + echo -e "${TEST}: ${RED}FAIL${NC}" + exit 1 + fi + echo -e "${TEST}: ${GREEN}PASS${NC}" + + TEST="ICMPv4 connectivity test" + ip netns exec ${NS_SRC} ping $PING_ARG ${IP4_DST} + if [ $? -ne 0 ]; then + echo -e "${TEST}: ${RED}FAIL${NC}" + exit 1 + fi + echo -e "${TEST}: ${GREEN}PASS${NC}" + + TEST="ICMPv6 connectivity test" + ip netns exec ${NS_SRC} ping6 $PING_ARG ${IP6_DST} + if [ $? -ne 0 ]; then + echo -e "${TEST}: ${RED}FAIL${NC}" + exit 1 + fi + echo -e "${TEST}: ${GREEN}PASS${NC}" + + set -e +} + +hex_mem_str() +{ + perl -e 'print join(" ", unpack("(H2)8", pack("L", @ARGV)))' $1 +} + +netns_setup_neigh() +{ + ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact + ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj test_tc_neigh.o sec src_ingress + ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj test_tc_neigh.o sec chk_egress + + ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact + ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj test_tc_neigh.o sec dst_ingress + ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj test_tc_neigh.o sec chk_egress + + veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex) + veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex) + + progs=$(ip netns exec ${NS_FWD} bpftool net --json | jq -r '.[] | .tc | map(.id) | .[]') + for prog in $progs; do + map=$(bpftool prog show id $prog --json | jq -r '.map_ids | .? | .[]') + if [ ! -z "$map" ]; then + bpftool map update id $map key hex $(hex_mem_str 0) value hex $(hex_mem_str $veth_src) + bpftool map update id $map key hex $(hex_mem_str 1) value hex $(hex_mem_str $veth_dst) + fi + done +} + +trap netns_cleanup EXIT +set -e + +netns_setup +netns_setup_neigh +netns_test_connectivity From 9f4c53ca23a28c891d2bd3ff4738f7d95ba0303b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:06 +0200 Subject: [PATCH 6/6] bpf, selftests: Add redirect_peer selftest Extend the test_tc_redirect test and add a small test that exercises the new redirect_peer() helper for the IPv4 and IPv6 case. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201010234006.7075-7-daniel@iogearbox.net --- .../selftests/bpf/progs/test_tc_peer.c | 45 +++++++++++++++++++ .../testing/selftests/bpf/test_tc_redirect.sh | 25 +++++++---- 2 files changed, 61 insertions(+), 9 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/test_tc_peer.c diff --git a/tools/testing/selftests/bpf/progs/test_tc_peer.c b/tools/testing/selftests/bpf/progs/test_tc_peer.c new file mode 100644 index 0000000000000..fc84a7685aa2c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_tc_peer.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#include +#include +#include + +#include + +enum { + dev_src, + dev_dst, +}; + +struct bpf_map_def SEC("maps") ifindex_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 2, +}; + +static __always_inline int get_dev_ifindex(int which) +{ + int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which); + + return ifindex ? *ifindex : 0; +} + +SEC("chk_egress") int tc_chk(struct __sk_buff *skb) +{ + return TC_ACT_SHOT; +} + +SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) +{ + return bpf_redirect_peer(get_dev_ifindex(dev_src), 0); +} + +SEC("src_ingress") int tc_src(struct __sk_buff *skb) +{ + return bpf_redirect_peer(get_dev_ifindex(dev_dst), 0); +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_tc_redirect.sh b/tools/testing/selftests/bpf/test_tc_redirect.sh index 6ad4414051323..6d74825621401 100755 --- a/tools/testing/selftests/bpf/test_tc_redirect.sh +++ b/tools/testing/selftests/bpf/test_tc_redirect.sh @@ -4,9 +4,10 @@ # This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link # between src and dst. The netns fwd has veth links to each src and dst. The # client is in src and server in dst. The test installs a TC BPF program to each -# host facing veth in fwd which calls into bpf_redirect_peer() to perform the -# neigh addr population and redirect; it also installs a dropper prog on the -# egress side to drop skbs if neigh addrs were not populated. +# host facing veth in fwd which calls into i) bpf_redirect_neigh() to perform the +# neigh addr population and redirect or ii) bpf_redirect_peer() for namespace +# switch from ingress side; it also installs a checker prog on the egress side +# to drop unexpected traffic. if [[ $EUID -ne 0 ]]; then echo "This script must be run as root" @@ -166,15 +167,17 @@ hex_mem_str() perl -e 'print join(" ", unpack("(H2)8", pack("L", @ARGV)))' $1 } -netns_setup_neigh() +netns_setup_bpf() { + local obj=$1 + ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact - ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj test_tc_neigh.o sec src_ingress - ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj test_tc_neigh.o sec chk_egress + ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj $obj sec src_ingress + ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj $obj sec chk_egress ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact - ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj test_tc_neigh.o sec dst_ingress - ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj test_tc_neigh.o sec chk_egress + ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj $obj sec dst_ingress + ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj $obj sec chk_egress veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex) veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex) @@ -193,5 +196,9 @@ trap netns_cleanup EXIT set -e netns_setup -netns_setup_neigh +netns_setup_bpf test_tc_neigh.o +netns_test_connectivity +netns_cleanup +netns_setup +netns_setup_bpf test_tc_peer.o netns_test_connectivity