diff --git a/Documentation/netlink/specs/rt_link.yaml b/Documentation/netlink/specs/rt_link.yaml index d7131a1afadf8..9ffa13b77dcf1 100644 --- a/Documentation/netlink/specs/rt_link.yaml +++ b/Documentation/netlink/specs/rt_link.yaml @@ -920,6 +920,13 @@ definitions: - name: l2 - name: l3 + - + name: netkit-scrub + type: enum + entries: + - name: none + - name: default + attribute-sets: - name: link-attrs @@ -2151,6 +2158,14 @@ attribute-sets: name: mode type: u32 enum: netkit-mode + - + name: scrub + type: u32 + enum: netkit-scrub + - + name: peer-scrub + type: u32 + enum: netkit-scrub sub-messages: - diff --git a/MAINTAINERS b/MAINTAINERS index 560a65b85297d..d678a58c02057 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16300,7 +16300,7 @@ F: include/net/mptcp.h F: include/trace/events/mptcp.h F: include/uapi/linux/mptcp*.h F: net/mptcp/ -F: tools/testing/selftests/bpf/*/*mptcp*.c +F: tools/testing/selftests/bpf/*/*mptcp*.[ch] F: tools/testing/selftests/net/mptcp/ NETWORKING [TCP] diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index 059269557d926..cd8360b9bbde2 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -20,6 +20,7 @@ struct netkit { struct net_device __rcu *peer; struct bpf_mprog_entry __rcu *active; enum netkit_action policy; + enum netkit_scrub scrub; struct bpf_mprog_bundle bundle; /* Needed in slow-path */ @@ -50,12 +51,24 @@ netkit_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, return ret; } -static void netkit_prep_forward(struct sk_buff *skb, bool xnet) +static void netkit_xnet(struct sk_buff *skb) { - skb_scrub_packet(skb, xnet); skb->priority = 0; + skb->mark = 0; +} + +static void netkit_prep_forward(struct sk_buff *skb, + bool xnet, bool xnet_scrub) +{ + skb_scrub_packet(skb, false); nf_skip_egress(skb, true); skb_reset_mac_header(skb); + if (!xnet) + return; + ipvs_reset(skb); + skb_clear_tstamp(skb); + if (xnet_scrub) + netkit_xnet(skb); } static struct netkit *netkit_priv(const struct net_device *dev) @@ -80,7 +93,8 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev) !pskb_may_pull(skb, ETH_HLEN) || skb_orphan_frags(skb, GFP_ATOMIC))) goto drop; - netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer))); + netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer)), + nk->scrub); eth_skb_pkt_type(skb, peer); skb->dev = peer; entry = rcu_dereference(nk->active); @@ -297,20 +311,6 @@ static int netkit_check_policy(int policy, struct nlattr *tb, } } -static int netkit_check_mode(int mode, struct nlattr *tb, - struct netlink_ext_ack *extack) -{ - switch (mode) { - case NETKIT_L2: - case NETKIT_L3: - return 0; - default: - NL_SET_ERR_MSG_ATTR(extack, tb, - "Provided device mode can only be L2 or L3"); - return -EINVAL; - } -} - static int netkit_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { @@ -332,8 +332,10 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev, struct netlink_ext_ack *extack) { struct nlattr *peer_tb[IFLA_MAX + 1], **tbp = tb, *attr; - enum netkit_action default_prim = NETKIT_PASS; - enum netkit_action default_peer = NETKIT_PASS; + enum netkit_action policy_prim = NETKIT_PASS; + enum netkit_action policy_peer = NETKIT_PASS; + enum netkit_scrub scrub_prim = NETKIT_SCRUB_DEFAULT; + enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT; enum netkit_mode mode = NETKIT_L3; unsigned char ifname_assign_type; struct ifinfomsg *ifmp = NULL; @@ -344,13 +346,8 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev, int err; if (data) { - if (data[IFLA_NETKIT_MODE]) { - attr = data[IFLA_NETKIT_MODE]; - mode = nla_get_u32(attr); - err = netkit_check_mode(mode, attr, extack); - if (err < 0) - return err; - } + if (data[IFLA_NETKIT_MODE]) + mode = nla_get_u32(data[IFLA_NETKIT_MODE]); if (data[IFLA_NETKIT_PEER_INFO]) { attr = data[IFLA_NETKIT_PEER_INFO]; ifmp = nla_data(attr); @@ -362,17 +359,21 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev, return err; tbp = peer_tb; } + if (data[IFLA_NETKIT_SCRUB]) + scrub_prim = nla_get_u32(data[IFLA_NETKIT_SCRUB]); + if (data[IFLA_NETKIT_PEER_SCRUB]) + scrub_peer = nla_get_u32(data[IFLA_NETKIT_PEER_SCRUB]); if (data[IFLA_NETKIT_POLICY]) { attr = data[IFLA_NETKIT_POLICY]; - default_prim = nla_get_u32(attr); - err = netkit_check_policy(default_prim, attr, extack); + policy_prim = nla_get_u32(attr); + err = netkit_check_policy(policy_prim, attr, extack); if (err < 0) return err; } if (data[IFLA_NETKIT_PEER_POLICY]) { attr = data[IFLA_NETKIT_PEER_POLICY]; - default_peer = nla_get_u32(attr); - err = netkit_check_policy(default_peer, attr, extack); + policy_peer = nla_get_u32(attr); + err = netkit_check_policy(policy_peer, attr, extack); if (err < 0) return err; } @@ -409,7 +410,8 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev, nk = netkit_priv(peer); nk->primary = false; - nk->policy = default_peer; + nk->policy = policy_peer; + nk->scrub = scrub_peer; nk->mode = mode; bpf_mprog_bundle_init(&nk->bundle); @@ -434,7 +436,8 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev, nk = netkit_priv(dev); nk->primary = true; - nk->policy = default_prim; + nk->policy = policy_prim; + nk->scrub = scrub_prim; nk->mode = mode; bpf_mprog_bundle_init(&nk->bundle); @@ -874,6 +877,18 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], return -EACCES; } + if (data[IFLA_NETKIT_SCRUB]) { + NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_SCRUB], + "netkit scrubbing cannot be changed after device creation"); + return -EACCES; + } + + if (data[IFLA_NETKIT_PEER_SCRUB]) { + NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_PEER_SCRUB], + "netkit scrubbing cannot be changed after device creation"); + return -EACCES; + } + if (data[IFLA_NETKIT_PEER_INFO]) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_PEER_INFO], "netkit peer info cannot be changed after device creation"); @@ -908,8 +923,10 @@ static size_t netkit_get_size(const struct net_device *dev) { return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */ nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PEER_POLICY */ - nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */ + nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_SCRUB */ + nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PEER_SCRUB */ nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_MODE */ + nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */ 0; } @@ -924,11 +941,15 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev) return -EMSGSIZE; if (nla_put_u32(skb, IFLA_NETKIT_MODE, nk->mode)) return -EMSGSIZE; + if (nla_put_u32(skb, IFLA_NETKIT_SCRUB, nk->scrub)) + return -EMSGSIZE; if (peer) { nk = netkit_priv(peer); if (nla_put_u32(skb, IFLA_NETKIT_PEER_POLICY, nk->policy)) return -EMSGSIZE; + if (nla_put_u32(skb, IFLA_NETKIT_PEER_SCRUB, nk->scrub)) + return -EMSGSIZE; } return 0; @@ -936,9 +957,11 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev) static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = { [IFLA_NETKIT_PEER_INFO] = { .len = sizeof(struct ifinfomsg) }, + [IFLA_NETKIT_MODE] = NLA_POLICY_MAX(NLA_U32, NETKIT_L3), [IFLA_NETKIT_POLICY] = { .type = NLA_U32 }, - [IFLA_NETKIT_MODE] = { .type = NLA_U32 }, [IFLA_NETKIT_PEER_POLICY] = { .type = NLA_U32 }, + [IFLA_NETKIT_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), + [IFLA_NETKIT_PEER_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), [IFLA_NETKIT_PRIMARY] = { .type = NLA_REJECT, .reject_message = "Primary attribute is read-only" }, }; diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 0a5dca2b2b3f6..40085afd91607 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -126,8 +126,8 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) if (likely(!xdp_buff_has_frags(xdp))) goto out; - list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { - list_del(&pos->xskb_list_node); + list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { + list_del(&pos->list_node); xp_free(pos); } @@ -140,7 +140,7 @@ static inline void xsk_buff_add_frag(struct xdp_buff *xdp) { struct xdp_buff_xsk *frag = container_of(xdp, struct xdp_buff_xsk, xdp); - list_add_tail(&frag->xskb_list_node, &frag->pool->xskb_list); + list_add_tail(&frag->list_node, &frag->pool->xskb_list); } static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) @@ -150,9 +150,9 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) struct xdp_buff_xsk *frag; frag = list_first_entry_or_null(&xskb->pool->xskb_list, - struct xdp_buff_xsk, xskb_list_node); + struct xdp_buff_xsk, list_node); if (frag) { - list_del(&frag->xskb_list_node); + list_del(&frag->list_node); ret = &frag->xdp; } @@ -163,7 +163,7 @@ static inline void xsk_buff_del_tail(struct xdp_buff *tail) { struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); - list_del(&xskb->xskb_list_node); + list_del(&xskb->list_node); } static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) @@ -172,7 +172,7 @@ static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) struct xdp_buff_xsk *frag; frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, - xskb_list_node); + list_node); return &frag->xdp; } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index bacb33f1e3e58..bb03cee716b31 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -28,9 +28,7 @@ struct xdp_buff_xsk { dma_addr_t dma; dma_addr_t frame_dma; struct xsk_buff_pool *pool; - u64 orig_addr; - struct list_head free_list_node; - struct list_head xskb_list_node; + struct list_head list_node; }; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) @@ -78,6 +76,7 @@ struct xsk_buff_pool { u32 chunk_size; u32 chunk_shift; u32 frame_len; + u32 xdp_zc_max_segs; u8 tx_metadata_len; /* inherited from umem */ u8 cached_need_wakeup; bool uses_need_wakeup; @@ -120,7 +119,6 @@ void xp_free(struct xdp_buff_xsk *xskb); static inline void xp_init_xskb_addr(struct xdp_buff_xsk *xskb, struct xsk_buff_pool *pool, u64 addr) { - xskb->orig_addr = addr; xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; } @@ -222,14 +220,19 @@ static inline void xp_release(struct xdp_buff_xsk *xskb) xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; } -static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb) +static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb, + struct xsk_buff_pool *pool) { - u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; + u64 orig_addr = xskb->xdp.data - pool->addrs; + u64 offset; - offset += xskb->pool->headroom; - if (!xskb->pool->unaligned) - return xskb->orig_addr + offset; - return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); + if (!pool->unaligned) + return orig_addr; + + offset = xskb->xdp.data - xskb->xdp.data_hard_start; + orig_addr -= offset; + offset += pool->headroom; + return orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); } static inline bool xp_tx_metadata_enabled(const struct xsk_buff_pool *pool) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 506ba9c80e83a..8516c1ccd57a7 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1293,6 +1293,19 @@ enum netkit_mode { NETKIT_L3, }; +/* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to + * the BPF program if attached. This also means the latter can + * consume the two fields if they were populated earlier. + * + * NETKIT_SCRUB_DEFAULT zeroes skb->{mark,priority} fields before + * invoking the attached BPF program when the peer device resides + * in a different network namespace. This is the default behavior. + */ +enum netkit_scrub { + NETKIT_SCRUB_NONE, + NETKIT_SCRUB_DEFAULT, +}; + enum { IFLA_NETKIT_UNSPEC, IFLA_NETKIT_PEER_INFO, @@ -1300,6 +1313,8 @@ enum { IFLA_NETKIT_POLICY, IFLA_NETKIT_PEER_POLICY, IFLA_NETKIT_MODE, + IFLA_NETKIT_SCRUB, + IFLA_NETKIT_PEER_SCRUB, __IFLA_NETKIT_MAX, }; #define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) diff --git a/net/core/filter.c b/net/core/filter.c index 202c1d386e195..a88e6924c4c06 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5138,6 +5138,17 @@ static u64 __bpf_get_netns_cookie(struct sock *sk) return net->net_cookie; } +BPF_CALL_1(bpf_get_netns_cookie, struct sk_buff *, skb) +{ + return __bpf_get_netns_cookie(skb && skb->sk ? skb->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_proto = { + .func = bpf_get_netns_cookie, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) { return __bpf_get_netns_cookie(ctx); @@ -8205,6 +8216,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_under_cgroup_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_fib_lookup: @@ -10237,10 +10250,6 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, } \ } while (0) -#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \ - SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \ - S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF) - static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 1140b2a120cae..7d7e37f53708c 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -141,7 +141,7 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, u64 addr; int err; - addr = xp_get_handle(xskb); + addr = xp_get_handle(xskb, xskb->pool); err = xskq_prod_reserve_desc(xs->rx, addr, len, flags); if (err) { xs->rx_queue_full++; @@ -171,14 +171,14 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) return 0; xskb_list = &xskb->pool->xskb_list; - list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { + list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { if (list_is_singular(xskb_list)) contd = 0; len = pos->xdp.data_end - pos->xdp.data; err = __xsk_rcv_zc(xs, pos, len, contd); if (err) goto err; - list_del(&pos->xskb_list_node); + list_del(&pos->list_node); } return 0; @@ -527,34 +527,34 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags) return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); } -static int xsk_cq_reserve_addr_locked(struct xdp_sock *xs, u64 addr) +static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr) { unsigned long flags; int ret; - spin_lock_irqsave(&xs->pool->cq_lock, flags); - ret = xskq_prod_reserve_addr(xs->pool->cq, addr); - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + spin_lock_irqsave(&pool->cq_lock, flags); + ret = xskq_prod_reserve_addr(pool->cq, addr); + spin_unlock_irqrestore(&pool->cq_lock, flags); return ret; } -static void xsk_cq_submit_locked(struct xdp_sock *xs, u32 n) +static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n) { unsigned long flags; - spin_lock_irqsave(&xs->pool->cq_lock, flags); - xskq_prod_submit_n(xs->pool->cq, n); - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + spin_lock_irqsave(&pool->cq_lock, flags); + xskq_prod_submit_n(pool->cq, n); + spin_unlock_irqrestore(&pool->cq_lock, flags); } -static void xsk_cq_cancel_locked(struct xdp_sock *xs, u32 n) +static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n) { unsigned long flags; - spin_lock_irqsave(&xs->pool->cq_lock, flags); - xskq_prod_cancel_n(xs->pool->cq, n); - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + spin_lock_irqsave(&pool->cq_lock, flags); + xskq_prod_cancel_n(pool->cq, n); + spin_unlock_irqrestore(&pool->cq_lock, flags); } static u32 xsk_get_num_desc(struct sk_buff *skb) @@ -571,7 +571,7 @@ static void xsk_destruct_skb(struct sk_buff *skb) *compl->tx_timestamp = ktime_get_tai_fast_ns(); } - xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb)); + xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb)); sock_wfree(skb); } @@ -587,7 +587,7 @@ static void xsk_consume_skb(struct sk_buff *skb) struct xdp_sock *xs = xdp_sk(skb->sk); skb->destructor = sock_wfree; - xsk_cq_cancel_locked(xs, xsk_get_num_desc(skb)); + xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb)); /* Free skb without triggering the perf drop trace */ consume_skb(skb); xs->skb = NULL; @@ -765,7 +765,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, xskq_cons_release(xs->tx); } else { /* Let application retry */ - xsk_cq_cancel_locked(xs, 1); + xsk_cq_cancel_locked(xs->pool, 1); } return ERR_PTR(err); @@ -802,7 +802,7 @@ static int __xsk_generic_xmit(struct sock *sk) * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ - if (xsk_cq_reserve_addr_locked(xs, desc.addr)) + if (xsk_cq_reserve_addr_locked(xs->pool, desc.addr)) goto out; skb = xsk_build_skb(xs, &desc); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 521a2938e50a1..ae71da7d2cd6a 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -101,8 +101,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, xskb = &pool->heads[i]; xskb->pool = pool; xskb->xdp.frame_sz = umem->chunk_size - umem->headroom; - INIT_LIST_HEAD(&xskb->free_list_node); - INIT_LIST_HEAD(&xskb->xskb_list_node); + INIT_LIST_HEAD(&xskb->list_node); if (pool->unaligned) pool->free_heads[i] = xskb; else @@ -230,6 +229,7 @@ int xp_assign_dev(struct xsk_buff_pool *pool, goto err_unreg_xsk; } pool->umem->zc = true; + pool->xdp_zc_max_segs = netdev->xdp_zc_max_segs; return 0; err_unreg_xsk: @@ -417,8 +417,10 @@ static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_ for (i = 0; i < pool->heads_cnt; i++) { struct xdp_buff_xsk *xskb = &pool->heads[i]; + u64 orig_addr; - xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, xskb->orig_addr); + orig_addr = xskb->xdp.data_hard_start - pool->addrs - pool->headroom; + xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, orig_addr); } } @@ -501,6 +503,22 @@ static bool xp_check_aligned(struct xsk_buff_pool *pool, u64 *addr) return *addr < pool->addrs_cnt; } +static struct xdp_buff_xsk *xp_get_xskb(struct xsk_buff_pool *pool, u64 addr) +{ + struct xdp_buff_xsk *xskb; + + if (pool->unaligned) { + xskb = pool->free_heads[--pool->free_heads_cnt]; + xp_init_xskb_addr(xskb, pool, addr); + if (pool->dma_pages) + xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr); + } else { + xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)]; + } + + return xskb; +} + static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) { struct xdp_buff_xsk *xskb; @@ -526,14 +544,7 @@ static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) break; } - if (pool->unaligned) { - xskb = pool->free_heads[--pool->free_heads_cnt]; - xp_init_xskb_addr(xskb, pool, addr); - if (pool->dma_pages) - xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr); - } else { - xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)]; - } + xskb = xp_get_xskb(pool, addr); xskq_cons_release(pool->fq); return xskb; @@ -550,8 +561,8 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) } else { pool->free_list_cnt--; xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, - free_list_node); - list_del_init(&xskb->free_list_node); + list_node); + list_del_init(&xskb->list_node); } xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM; @@ -591,14 +602,7 @@ static u32 xp_alloc_new_from_fq(struct xsk_buff_pool *pool, struct xdp_buff **xd continue; } - if (pool->unaligned) { - xskb = pool->free_heads[--pool->free_heads_cnt]; - xp_init_xskb_addr(xskb, pool, addr); - if (pool->dma_pages) - xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr); - } else { - xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)]; - } + xskb = xp_get_xskb(pool, addr); *xdp = &xskb->xdp; xdp++; @@ -617,8 +621,8 @@ static u32 xp_alloc_reused(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u3 i = nb_entries; while (i--) { - xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, free_list_node); - list_del_init(&xskb->free_list_node); + xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, list_node); + list_del_init(&xskb->list_node); *xdp = &xskb->xdp; xdp++; @@ -688,11 +692,11 @@ EXPORT_SYMBOL(xp_can_alloc); void xp_free(struct xdp_buff_xsk *xskb) { - if (!list_empty(&xskb->free_list_node)) + if (!list_empty(&xskb->list_node)) return; xskb->pool->free_list_cnt++; - list_add(&xskb->free_list_node, &xskb->pool->free_list); + list_add(&xskb->list_node, &xskb->pool->free_list); } EXPORT_SYMBOL(xp_free); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 406b20dfee8d4..46d87e961ad6d 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -260,7 +260,7 @@ u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool, nr_frags = 0; } else { nr_frags++; - if (nr_frags == pool->netdev->xdp_zc_max_segs) { + if (nr_frags == pool->xdp_zc_max_segs) { nr_frags = 0; break; } diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 96ec2b01e725b..8516c1ccd57a7 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -462,6 +462,286 @@ enum in6_addr_gen_mode { /* Bridge section */ +/** + * DOC: Bridge enum definition + * + * Please *note* that the timer values in the following section are expected + * in clock_t format, which is seconds multiplied by USER_HZ (generally + * defined as 100). + * + * @IFLA_BR_FORWARD_DELAY + * The bridge forwarding delay is the time spent in LISTENING state + * (before moving to LEARNING) and in LEARNING state (before moving + * to FORWARDING). Only relevant if STP is enabled. + * + * The valid values are between (2 * USER_HZ) and (30 * USER_HZ). + * The default value is (15 * USER_HZ). + * + * @IFLA_BR_HELLO_TIME + * The time between hello packets sent by the bridge, when it is a root + * bridge or a designated bridge. Only relevant if STP is enabled. + * + * The valid values are between (1 * USER_HZ) and (10 * USER_HZ). + * The default value is (2 * USER_HZ). + * + * @IFLA_BR_MAX_AGE + * The hello packet timeout is the time until another bridge in the + * spanning tree is assumed to be dead, after reception of its last hello + * message. Only relevant if STP is enabled. + * + * The valid values are between (6 * USER_HZ) and (40 * USER_HZ). + * The default value is (20 * USER_HZ). + * + * @IFLA_BR_AGEING_TIME + * Configure the bridge's FDB entries aging time. It is the time a MAC + * address will be kept in the FDB after a packet has been received from + * that address. After this time has passed, entries are cleaned up. + * Allow values outside the 802.1 standard specification for special cases: + * + * * 0 - entry never ages (all permanent) + * * 1 - entry disappears (no persistence) + * + * The default value is (300 * USER_HZ). + * + * @IFLA_BR_STP_STATE + * Turn spanning tree protocol on (*IFLA_BR_STP_STATE* > 0) or off + * (*IFLA_BR_STP_STATE* == 0) for this bridge. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_PRIORITY + * Set this bridge's spanning tree priority, used during STP root bridge + * election. + * + * The valid values are between 0 and 65535. + * + * @IFLA_BR_VLAN_FILTERING + * Turn VLAN filtering on (*IFLA_BR_VLAN_FILTERING* > 0) or off + * (*IFLA_BR_VLAN_FILTERING* == 0). When disabled, the bridge will not + * consider the VLAN tag when handling packets. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_VLAN_PROTOCOL + * Set the protocol used for VLAN filtering. + * + * The valid values are 0x8100(802.1Q) or 0x88A8(802.1AD). The default value + * is 0x8100(802.1Q). + * + * @IFLA_BR_GROUP_FWD_MASK + * The group forwarding mask. This is the bitmask that is applied to + * decide whether to forward incoming frames destined to link-local + * addresses (of the form 01:80:C2:00:00:0X). + * + * The default value is 0, which means the bridge does not forward any + * link-local frames coming on this port. + * + * @IFLA_BR_ROOT_ID + * The bridge root id, read only. + * + * @IFLA_BR_BRIDGE_ID + * The bridge id, read only. + * + * @IFLA_BR_ROOT_PORT + * The bridge root port, read only. + * + * @IFLA_BR_ROOT_PATH_COST + * The bridge root path cost, read only. + * + * @IFLA_BR_TOPOLOGY_CHANGE + * The bridge topology change, read only. + * + * @IFLA_BR_TOPOLOGY_CHANGE_DETECTED + * The bridge topology change detected, read only. + * + * @IFLA_BR_HELLO_TIMER + * The bridge hello timer, read only. + * + * @IFLA_BR_TCN_TIMER + * The bridge tcn timer, read only. + * + * @IFLA_BR_TOPOLOGY_CHANGE_TIMER + * The bridge topology change timer, read only. + * + * @IFLA_BR_GC_TIMER + * The bridge gc timer, read only. + * + * @IFLA_BR_GROUP_ADDR + * Set the MAC address of the multicast group this bridge uses for STP. + * The address must be a link-local address in standard Ethernet MAC address + * format. It is an address of the form 01:80:C2:00:00:0X, with X in [0, 4..f]. + * + * The default value is 0. + * + * @IFLA_BR_FDB_FLUSH + * Flush bridge's fdb dynamic entries. + * + * @IFLA_BR_MCAST_ROUTER + * Set bridge's multicast router if IGMP snooping is enabled. + * The valid values are: + * + * * 0 - disabled. + * * 1 - automatic (queried). + * * 2 - permanently enabled. + * + * The default value is 1. + * + * @IFLA_BR_MCAST_SNOOPING + * Turn multicast snooping on (*IFLA_BR_MCAST_SNOOPING* > 0) or off + * (*IFLA_BR_MCAST_SNOOPING* == 0). + * + * The default value is 1. + * + * @IFLA_BR_MCAST_QUERY_USE_IFADDR + * If enabled use the bridge's own IP address as source address for IGMP + * queries (*IFLA_BR_MCAST_QUERY_USE_IFADDR* > 0) or the default of 0.0.0.0 + * (*IFLA_BR_MCAST_QUERY_USE_IFADDR* == 0). + * + * The default value is 0 (disabled). + * + * @IFLA_BR_MCAST_QUERIER + * Enable (*IFLA_BR_MULTICAST_QUERIER* > 0) or disable + * (*IFLA_BR_MULTICAST_QUERIER* == 0) IGMP querier, ie sending of multicast + * queries by the bridge. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_MCAST_HASH_ELASTICITY + * Set multicast database hash elasticity, It is the maximum chain length in + * the multicast hash table. This attribute is *deprecated* and the value + * is always 16. + * + * @IFLA_BR_MCAST_HASH_MAX + * Set maximum size of the multicast hash table + * + * The default value is 4096, the value must be a power of 2. + * + * @IFLA_BR_MCAST_LAST_MEMBER_CNT + * The Last Member Query Count is the number of Group-Specific Queries + * sent before the router assumes there are no local members. The Last + * Member Query Count is also the number of Group-and-Source-Specific + * Queries sent before the router assumes there are no listeners for a + * particular source. + * + * The default value is 2. + * + * @IFLA_BR_MCAST_STARTUP_QUERY_CNT + * The Startup Query Count is the number of Queries sent out on startup, + * separated by the Startup Query Interval. + * + * The default value is 2. + * + * @IFLA_BR_MCAST_LAST_MEMBER_INTVL + * The Last Member Query Interval is the Max Response Time inserted into + * Group-Specific Queries sent in response to Leave Group messages, and + * is also the amount of time between Group-Specific Query messages. + * + * The default value is (1 * USER_HZ). + * + * @IFLA_BR_MCAST_MEMBERSHIP_INTVL + * The interval after which the bridge will leave a group, if no membership + * reports for this group are received. + * + * The default value is (260 * USER_HZ). + * + * @IFLA_BR_MCAST_QUERIER_INTVL + * The interval between queries sent by other routers. if no queries are + * seen after this delay has passed, the bridge will start to send its own + * queries (as if *IFLA_BR_MCAST_QUERIER_INTVL* was enabled). + * + * The default value is (255 * USER_HZ). + * + * @IFLA_BR_MCAST_QUERY_INTVL + * The Query Interval is the interval between General Queries sent by + * the Querier. + * + * The default value is (125 * USER_HZ). The minimum value is (1 * USER_HZ). + * + * @IFLA_BR_MCAST_QUERY_RESPONSE_INTVL + * The Max Response Time used to calculate the Max Resp Code inserted + * into the periodic General Queries. + * + * The default value is (10 * USER_HZ). + * + * @IFLA_BR_MCAST_STARTUP_QUERY_INTVL + * The interval between queries in the startup phase. + * + * The default value is (125 * USER_HZ) / 4. The minimum value is (1 * USER_HZ). + * + * @IFLA_BR_NF_CALL_IPTABLES + * Enable (*NF_CALL_IPTABLES* > 0) or disable (*NF_CALL_IPTABLES* == 0) + * iptables hooks on the bridge. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_NF_CALL_IP6TABLES + * Enable (*NF_CALL_IP6TABLES* > 0) or disable (*NF_CALL_IP6TABLES* == 0) + * ip6tables hooks on the bridge. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_NF_CALL_ARPTABLES + * Enable (*NF_CALL_ARPTABLES* > 0) or disable (*NF_CALL_ARPTABLES* == 0) + * arptables hooks on the bridge. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_VLAN_DEFAULT_PVID + * VLAN ID applied to untagged and priority-tagged incoming packets. + * + * The default value is 1. Setting to the special value 0 makes all ports of + * this bridge not have a PVID by default, which means that they will + * not accept VLAN-untagged traffic. + * + * @IFLA_BR_PAD + * Bridge attribute padding type for netlink message. + * + * @IFLA_BR_VLAN_STATS_ENABLED + * Enable (*IFLA_BR_VLAN_STATS_ENABLED* == 1) or disable + * (*IFLA_BR_VLAN_STATS_ENABLED* == 0) per-VLAN stats accounting. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_MCAST_STATS_ENABLED + * Enable (*IFLA_BR_MCAST_STATS_ENABLED* > 0) or disable + * (*IFLA_BR_MCAST_STATS_ENABLED* == 0) multicast (IGMP/MLD) stats + * accounting. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_MCAST_IGMP_VERSION + * Set the IGMP version. + * + * The valid values are 2 and 3. The default value is 2. + * + * @IFLA_BR_MCAST_MLD_VERSION + * Set the MLD version. + * + * The valid values are 1 and 2. The default value is 1. + * + * @IFLA_BR_VLAN_STATS_PER_PORT + * Enable (*IFLA_BR_VLAN_STATS_PER_PORT* == 1) or disable + * (*IFLA_BR_VLAN_STATS_PER_PORT* == 0) per-VLAN per-port stats accounting. + * Can be changed only when there are no port VLANs configured. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_MULTI_BOOLOPT + * The multi_boolopt is used to control new boolean options to avoid adding + * new netlink attributes. You can look at ``enum br_boolopt_id`` for those + * options. + * + * @IFLA_BR_MCAST_QUERIER_STATE + * Bridge mcast querier states, read only. + * + * @IFLA_BR_FDB_N_LEARNED + * The number of dynamically learned FDB entries for the current bridge, + * read only. + * + * @IFLA_BR_FDB_MAX_LEARNED + * Set the number of max dynamically learned FDB entries for the current + * bridge. + */ enum { IFLA_BR_UNSPEC, IFLA_BR_FORWARD_DELAY, @@ -511,6 +791,8 @@ enum { IFLA_BR_VLAN_STATS_PER_PORT, IFLA_BR_MULTI_BOOLOPT, IFLA_BR_MCAST_QUERIER_STATE, + IFLA_BR_FDB_N_LEARNED, + IFLA_BR_FDB_MAX_LEARNED, __IFLA_BR_MAX, }; @@ -521,11 +803,252 @@ struct ifla_bridge_id { __u8 addr[6]; /* ETH_ALEN */ }; +/** + * DOC: Bridge mode enum definition + * + * @BRIDGE_MODE_HAIRPIN + * Controls whether traffic may be sent back out of the port on which it + * was received. This option is also called reflective relay mode, and is + * used to support basic VEPA (Virtual Ethernet Port Aggregator) + * capabilities. By default, this flag is turned off and the bridge will + * not forward traffic back out of the receiving port. + */ enum { BRIDGE_MODE_UNSPEC, BRIDGE_MODE_HAIRPIN, }; +/** + * DOC: Bridge port enum definition + * + * @IFLA_BRPORT_STATE + * The operation state of the port. Here are the valid values. + * + * * 0 - port is in STP *DISABLED* state. Make this port completely + * inactive for STP. This is also called BPDU filter and could be used + * to disable STP on an untrusted port, like a leaf virtual device. + * The traffic forwarding is also stopped on this port. + * * 1 - port is in STP *LISTENING* state. Only valid if STP is enabled + * on the bridge. In this state the port listens for STP BPDUs and + * drops all other traffic frames. + * * 2 - port is in STP *LEARNING* state. Only valid if STP is enabled on + * the bridge. In this state the port will accept traffic only for the + * purpose of updating MAC address tables. + * * 3 - port is in STP *FORWARDING* state. Port is fully active. + * * 4 - port is in STP *BLOCKING* state. Only valid if STP is enabled on + * the bridge. This state is used during the STP election process. + * In this state, port will only process STP BPDUs. + * + * @IFLA_BRPORT_PRIORITY + * The STP port priority. The valid values are between 0 and 255. + * + * @IFLA_BRPORT_COST + * The STP path cost of the port. The valid values are between 1 and 65535. + * + * @IFLA_BRPORT_MODE + * Set the bridge port mode. See *BRIDGE_MODE_HAIRPIN* for more details. + * + * @IFLA_BRPORT_GUARD + * Controls whether STP BPDUs will be processed by the bridge port. By + * default, the flag is turned off to allow BPDU processing. Turning this + * flag on will disable the bridge port if a STP BPDU packet is received. + * + * If the bridge has Spanning Tree enabled, hostile devices on the network + * may send BPDU on a port and cause network failure. Setting *guard on* + * will detect and stop this by disabling the port. The port will be + * restarted if the link is brought down, or removed and reattached. + * + * @IFLA_BRPORT_PROTECT + * Controls whether a given port is allowed to become a root port or not. + * Only used when STP is enabled on the bridge. By default the flag is off. + * + * This feature is also called root port guard. If BPDU is received from a + * leaf (edge) port, it should not be elected as root port. This could + * be used if using STP on a bridge and the downstream bridges are not fully + * trusted; this prevents a hostile guest from rerouting traffic. + * + * @IFLA_BRPORT_FAST_LEAVE + * This flag allows the bridge to immediately stop multicast traffic + * forwarding on a port that receives an IGMP Leave message. It is only used + * when IGMP snooping is enabled on the bridge. By default the flag is off. + * + * @IFLA_BRPORT_LEARNING + * Controls whether a given port will learn *source* MAC addresses from + * received traffic or not. Also controls whether dynamic FDB entries + * (which can also be added by software) will be refreshed by incoming + * traffic. By default this flag is on. + * + * @IFLA_BRPORT_UNICAST_FLOOD + * Controls whether unicast traffic for which there is no FDB entry will + * be flooded towards this port. By default this flag is on. + * + * @IFLA_BRPORT_PROXYARP + * Enable proxy ARP on this port. + * + * @IFLA_BRPORT_LEARNING_SYNC + * Controls whether a given port will sync MAC addresses learned on device + * port to bridge FDB. + * + * @IFLA_BRPORT_PROXYARP_WIFI + * Enable proxy ARP on this port which meets extended requirements by + * IEEE 802.11 and Hotspot 2.0 specifications. + * + * @IFLA_BRPORT_ROOT_ID + * + * @IFLA_BRPORT_BRIDGE_ID + * + * @IFLA_BRPORT_DESIGNATED_PORT + * + * @IFLA_BRPORT_DESIGNATED_COST + * + * @IFLA_BRPORT_ID + * + * @IFLA_BRPORT_NO + * + * @IFLA_BRPORT_TOPOLOGY_CHANGE_ACK + * + * @IFLA_BRPORT_CONFIG_PENDING + * + * @IFLA_BRPORT_MESSAGE_AGE_TIMER + * + * @IFLA_BRPORT_FORWARD_DELAY_TIMER + * + * @IFLA_BRPORT_HOLD_TIMER + * + * @IFLA_BRPORT_FLUSH + * Flush bridge ports' fdb dynamic entries. + * + * @IFLA_BRPORT_MULTICAST_ROUTER + * Configure the port's multicast router presence. A port with + * a multicast router will receive all multicast traffic. + * The valid values are: + * + * * 0 disable multicast routers on this port + * * 1 let the system detect the presence of routers (default) + * * 2 permanently enable multicast traffic forwarding on this port + * * 3 enable multicast routers temporarily on this port, not depending + * on incoming queries. + * + * @IFLA_BRPORT_PAD + * + * @IFLA_BRPORT_MCAST_FLOOD + * Controls whether a given port will flood multicast traffic for which + * there is no MDB entry. By default this flag is on. + * + * @IFLA_BRPORT_MCAST_TO_UCAST + * Controls whether a given port will replicate packets using unicast + * instead of multicast. By default this flag is off. + * + * This is done by copying the packet per host and changing the multicast + * destination MAC to a unicast one accordingly. + * + * *mcast_to_unicast* works on top of the multicast snooping feature of the + * bridge. Which means unicast copies are only delivered to hosts which + * are interested in unicast and signaled this via IGMP/MLD reports previously. + * + * This feature is intended for interface types which have a more reliable + * and/or efficient way to deliver unicast packets than broadcast ones + * (e.g. WiFi). + * + * However, it should only be enabled on interfaces where no IGMPv2/MLDv1 + * report suppression takes place. IGMP/MLD report suppression issue is + * usually overcome by the network daemon (supplicant) enabling AP isolation + * and by that separating all STAs. + * + * Delivery of STA-to-STA IP multicast is made possible again by enabling + * and utilizing the bridge hairpin mode, which considers the incoming port + * as a potential outgoing port, too (see *BRIDGE_MODE_HAIRPIN* option). + * Hairpin mode is performed after multicast snooping, therefore leading + * to only deliver reports to STAs running a multicast router. + * + * @IFLA_BRPORT_VLAN_TUNNEL + * Controls whether vlan to tunnel mapping is enabled on the port. + * By default this flag is off. + * + * @IFLA_BRPORT_BCAST_FLOOD + * Controls flooding of broadcast traffic on the given port. By default + * this flag is on. + * + * @IFLA_BRPORT_GROUP_FWD_MASK + * Set the group forward mask. This is a bitmask that is applied to + * decide whether to forward incoming frames destined to link-local + * addresses. The addresses of the form are 01:80:C2:00:00:0X (defaults + * to 0, which means the bridge does not forward any link-local frames + * coming on this port). + * + * @IFLA_BRPORT_NEIGH_SUPPRESS + * Controls whether neighbor discovery (arp and nd) proxy and suppression + * is enabled on the port. By default this flag is off. + * + * @IFLA_BRPORT_ISOLATED + * Controls whether a given port will be isolated, which means it will be + * able to communicate with non-isolated ports only. By default this + * flag is off. + * + * @IFLA_BRPORT_BACKUP_PORT + * Set a backup port. If the port loses carrier all traffic will be + * redirected to the configured backup port. Set the value to 0 to disable + * it. + * + * @IFLA_BRPORT_MRP_RING_OPEN + * + * @IFLA_BRPORT_MRP_IN_OPEN + * + * @IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT + * The number of per-port EHT hosts limit. The default value is 512. + * Setting to 0 is not allowed. + * + * @IFLA_BRPORT_MCAST_EHT_HOSTS_CNT + * The current number of tracked hosts, read only. + * + * @IFLA_BRPORT_LOCKED + * Controls whether a port will be locked, meaning that hosts behind the + * port will not be able to communicate through the port unless an FDB + * entry with the unit's MAC address is in the FDB. The common use case is + * that hosts are allowed access through authentication with the IEEE 802.1X + * protocol or based on whitelists. By default this flag is off. + * + * Please note that secure 802.1X deployments should always use the + * *BR_BOOLOPT_NO_LL_LEARN* flag, to not permit the bridge to populate its + * FDB based on link-local (EAPOL) traffic received on the port. + * + * @IFLA_BRPORT_MAB + * Controls whether a port will use MAC Authentication Bypass (MAB), a + * technique through which select MAC addresses may be allowed on a locked + * port, without using 802.1X authentication. Packets with an unknown source + * MAC address generates a "locked" FDB entry on the incoming bridge port. + * The common use case is for user space to react to these bridge FDB + * notifications and optionally replace the locked FDB entry with a normal + * one, allowing traffic to pass for whitelisted MAC addresses. + * + * Setting this flag also requires *IFLA_BRPORT_LOCKED* and + * *IFLA_BRPORT_LEARNING*. *IFLA_BRPORT_LOCKED* ensures that unauthorized + * data packets are dropped, and *IFLA_BRPORT_LEARNING* allows the dynamic + * FDB entries installed by user space (as replacements for the locked FDB + * entries) to be refreshed and/or aged out. + * + * @IFLA_BRPORT_MCAST_N_GROUPS + * + * @IFLA_BRPORT_MCAST_MAX_GROUPS + * Sets the maximum number of MDB entries that can be registered for a + * given port. Attempts to register more MDB entries at the port than this + * limit allows will be rejected, whether they are done through netlink + * (e.g. the bridge tool), or IGMP or MLD membership reports. Setting a + * limit of 0 disables the limit. The default value is 0. + * + * @IFLA_BRPORT_NEIGH_VLAN_SUPPRESS + * Controls whether neighbor discovery (arp and nd) proxy and suppression is + * enabled for a given port. By default this flag is off. + * + * Note that this option only takes effect when *IFLA_BRPORT_NEIGH_SUPPRESS* + * is enabled for a given port. + * + * @IFLA_BRPORT_BACKUP_NHID + * The FDB nexthop object ID to attach to packets being redirected to a + * backup port that has VLAN tunnel mapping enabled (via the + * *IFLA_BRPORT_VLAN_TUNNEL* option). Setting a value of 0 (default) has + * the effect of not attaching any ID. + */ enum { IFLA_BRPORT_UNSPEC, IFLA_BRPORT_STATE, /* Spanning tree state */ @@ -770,6 +1293,19 @@ enum netkit_mode { NETKIT_L3, }; +/* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to + * the BPF program if attached. This also means the latter can + * consume the two fields if they were populated earlier. + * + * NETKIT_SCRUB_DEFAULT zeroes skb->{mark,priority} fields before + * invoking the attached BPF program when the peer device resides + * in a different network namespace. This is the default behavior. + */ +enum netkit_scrub { + NETKIT_SCRUB_NONE, + NETKIT_SCRUB_DEFAULT, +}; + enum { IFLA_NETKIT_UNSPEC, IFLA_NETKIT_PEER_INFO, @@ -777,6 +1313,8 @@ enum { IFLA_NETKIT_POLICY, IFLA_NETKIT_PEER_POLICY, IFLA_NETKIT_MODE, + IFLA_NETKIT_SCRUB, + IFLA_NETKIT_PEER_SCRUB, __IFLA_NETKIT_MAX, }; #define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) @@ -855,6 +1393,7 @@ enum { IFLA_VXLAN_DF, IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ IFLA_VXLAN_LOCALBYPASS, + IFLA_VXLAN_LABEL_POLICY, /* IPv6 flow label policy; ifla_vxlan_label_policy */ __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) @@ -872,6 +1411,13 @@ enum ifla_vxlan_df { VXLAN_DF_MAX = __VXLAN_DF_END - 1, }; +enum ifla_vxlan_label_policy { + VXLAN_LABEL_FIXED = 0, + VXLAN_LABEL_INHERIT = 1, + __VXLAN_LABEL_END, + VXLAN_LABEL_MAX = __VXLAN_LABEL_END - 1, +}; + /* GENEVE section */ enum { IFLA_GENEVE_UNSPEC, @@ -936,6 +1482,8 @@ enum { IFLA_GTP_ROLE, IFLA_GTP_CREATE_SOCKETS, IFLA_GTP_RESTART_COUNT, + IFLA_GTP_LOCAL, + IFLA_GTP_LOCAL6, __IFLA_GTP_MAX, }; #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) @@ -1241,6 +1789,7 @@ enum { IFLA_HSR_PROTOCOL, /* Indicate different protocol than * HSR. For example PRP. */ + IFLA_HSR_INTERLINK, /* HSR interlink network device */ __IFLA_HSR_MAX, }; @@ -1418,7 +1967,9 @@ enum { enum { IFLA_DSA_UNSPEC, - IFLA_DSA_MASTER, + IFLA_DSA_CONDUIT, + /* Deprecated, use IFLA_DSA_CONDUIT instead */ + IFLA_DSA_MASTER = IFLA_DSA_CONDUIT, __IFLA_DSA_MAX, }; diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index c72c16e1aff82..5764155b6d251 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NETWORK_HELPERS_H #define __NETWORK_HELPERS_H +#include #include #include #include diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c index d2ca32fa3b21e..be3cad2aff774 100644 --- a/tools/testing/selftests/bpf/prog_tests/mptcp.c +++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c @@ -5,12 +5,17 @@ #include #include #include +#include #include "cgroup_helpers.h" #include "network_helpers.h" #include "mptcp_sock.skel.h" #include "mptcpify.skel.h" +#include "mptcp_subflow.skel.h" #define NS_TEST "mptcp_ns" +#define ADDR_1 "10.0.1.1" +#define ADDR_2 "10.0.1.2" +#define PORT_1 10001 #ifndef IPPROTO_MPTCP #define IPPROTO_MPTCP 262 @@ -335,10 +340,126 @@ static void test_mptcpify(void) close(cgroup_fd); } +static int endpoint_init(char *flags) +{ + SYS(fail, "ip -net %s link add veth1 type veth peer name veth2", NS_TEST); + SYS(fail, "ip -net %s addr add %s/24 dev veth1", NS_TEST, ADDR_1); + SYS(fail, "ip -net %s link set dev veth1 up", NS_TEST); + SYS(fail, "ip -net %s addr add %s/24 dev veth2", NS_TEST, ADDR_2); + SYS(fail, "ip -net %s link set dev veth2 up", NS_TEST); + if (SYS_NOFAIL("ip -net %s mptcp endpoint add %s %s", NS_TEST, ADDR_2, flags)) { + printf("'ip mptcp' not supported, skip this test.\n"); + test__skip(); + goto fail; + } + + return 0; +fail: + return -1; +} + +static void wait_for_new_subflows(int fd) +{ + socklen_t len; + u8 subflows; + int err, i; + + len = sizeof(subflows); + /* Wait max 5 sec for new subflows to be created */ + for (i = 0; i < 50; i++) { + err = getsockopt(fd, SOL_MPTCP, MPTCP_INFO, &subflows, &len); + if (!err && subflows > 0) + break; + + usleep(100000); /* 0.1s */ + } +} + +static void run_subflow(void) +{ + int server_fd, client_fd, err; + char new[TCP_CA_NAME_MAX]; + char cc[TCP_CA_NAME_MAX]; + unsigned int mark; + socklen_t len; + + server_fd = start_mptcp_server(AF_INET, ADDR_1, PORT_1, 0); + if (!ASSERT_OK_FD(server_fd, "start_mptcp_server")) + return; + + client_fd = connect_to_fd(server_fd, 0); + if (!ASSERT_OK_FD(client_fd, "connect_to_fd")) + goto close_server; + + send_byte(client_fd); + wait_for_new_subflows(client_fd); + + len = sizeof(mark); + err = getsockopt(client_fd, SOL_SOCKET, SO_MARK, &mark, &len); + if (ASSERT_OK(err, "getsockopt(client_fd, SO_MARK)")) + ASSERT_EQ(mark, 0, "mark"); + + len = sizeof(new); + err = getsockopt(client_fd, SOL_TCP, TCP_CONGESTION, new, &len); + if (ASSERT_OK(err, "getsockopt(client_fd, TCP_CONGESTION)")) { + get_msk_ca_name(cc); + ASSERT_STREQ(new, cc, "cc"); + } + + close(client_fd); +close_server: + close(server_fd); +} + +static void test_subflow(void) +{ + struct mptcp_subflow *skel; + struct nstoken *nstoken; + int cgroup_fd; + + cgroup_fd = test__join_cgroup("/mptcp_subflow"); + if (!ASSERT_OK_FD(cgroup_fd, "join_cgroup: mptcp_subflow")) + return; + + skel = mptcp_subflow__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_load: mptcp_subflow")) + goto close_cgroup; + + skel->bss->pid = getpid(); + + skel->links.mptcp_subflow = + bpf_program__attach_cgroup(skel->progs.mptcp_subflow, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.mptcp_subflow, "attach mptcp_subflow")) + goto skel_destroy; + + skel->links._getsockopt_subflow = + bpf_program__attach_cgroup(skel->progs._getsockopt_subflow, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links._getsockopt_subflow, "attach _getsockopt_subflow")) + goto skel_destroy; + + nstoken = create_netns(); + if (!ASSERT_OK_PTR(nstoken, "create_netns: mptcp_subflow")) + goto skel_destroy; + + if (endpoint_init("subflow") < 0) + goto close_netns; + + run_subflow(); + +close_netns: + cleanup_netns(nstoken); +skel_destroy: + mptcp_subflow__destroy(skel); +close_cgroup: + close(cgroup_fd); +} + void test_mptcp(void) { if (test__start_subtest("base")) test_base(); if (test__start_subtest("mptcpify")) test_mptcpify(); + if (test__start_subtest("subflow")) + test_subflow(); } diff --git a/tools/testing/selftests/bpf/prog_tests/netns_cookie.c b/tools/testing/selftests/bpf/prog_tests/netns_cookie.c index 71d8f3ba7d6ba..ac3c3c097c0e4 100644 --- a/tools/testing/selftests/bpf/prog_tests/netns_cookie.c +++ b/tools/testing/selftests/bpf/prog_tests/netns_cookie.c @@ -8,12 +8,16 @@ #define SO_NETNS_COOKIE 71 #endif +#define loopback 1 + static int duration; void test_netns_cookie(void) { + LIBBPF_OPTS(bpf_prog_attach_opts, opta); + LIBBPF_OPTS(bpf_prog_detach_opts, optd); int server_fd = -1, client_fd = -1, cgroup_fd = -1; - int err, val, ret, map, verdict; + int err, val, ret, map, verdict, tc_fd; struct netns_cookie_prog *skel; uint64_t cookie_expected_value; socklen_t vallen = sizeof(cookie_expected_value); @@ -38,36 +42,47 @@ void test_netns_cookie(void) if (!ASSERT_OK(err, "prog_attach")) goto done; + tc_fd = bpf_program__fd(skel->progs.get_netns_cookie_tcx); + err = bpf_prog_attach_opts(tc_fd, loopback, BPF_TCX_INGRESS, &opta); + if (!ASSERT_OK(err, "prog_attach")) + goto done; + server_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); if (CHECK(server_fd < 0, "start_server", "errno %d\n", errno)) - goto done; + goto cleanup_tc; client_fd = connect_to_fd(server_fd, 0); if (CHECK(client_fd < 0, "connect_to_fd", "errno %d\n", errno)) - goto done; + goto cleanup_tc; ret = send(client_fd, send_msg, sizeof(send_msg), 0); if (CHECK(ret != sizeof(send_msg), "send(msg)", "ret:%d\n", ret)) - goto done; + goto cleanup_tc; err = bpf_map_lookup_elem(bpf_map__fd(skel->maps.sockops_netns_cookies), &client_fd, &val); if (!ASSERT_OK(err, "map_lookup(sockops_netns_cookies)")) - goto done; + goto cleanup_tc; err = getsockopt(client_fd, SOL_SOCKET, SO_NETNS_COOKIE, &cookie_expected_value, &vallen); if (!ASSERT_OK(err, "getsockopt")) - goto done; + goto cleanup_tc; ASSERT_EQ(val, cookie_expected_value, "cookie_value"); err = bpf_map_lookup_elem(bpf_map__fd(skel->maps.sk_msg_netns_cookies), &client_fd, &val); if (!ASSERT_OK(err, "map_lookup(sk_msg_netns_cookies)")) - goto done; + goto cleanup_tc; ASSERT_EQ(val, cookie_expected_value, "cookie_value"); + ASSERT_EQ(skel->bss->tcx_init_netns_cookie, cookie_expected_value, "cookie_value"); + ASSERT_EQ(skel->bss->tcx_netns_cookie, cookie_expected_value, "cookie_value"); + +cleanup_tc: + err = bpf_prog_detach_opts(tc_fd, loopback, BPF_TCX_INGRESS, &optd); + ASSERT_OK(err, "prog_detach"); done: if (server_fd != -1) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_netkit.c b/tools/testing/selftests/bpf/prog_tests/tc_netkit.c index b9135720024cc..151a4210028f8 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_netkit.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_netkit.c @@ -14,7 +14,9 @@ #include "netlink_helpers.h" #include "tc_helpers.h" -#define ICMP_ECHO 8 +#define MARK 42 +#define PRIO 0xeb9f +#define ICMP_ECHO 8 struct icmphdr { __u8 type; @@ -33,7 +35,7 @@ struct iplink_req { }; static int create_netkit(int mode, int policy, int peer_policy, int *ifindex, - bool same_netns) + bool same_netns, int scrub, int peer_scrub) { struct rtnl_handle rth = { .fd = -1 }; struct iplink_req req = {}; @@ -58,6 +60,8 @@ static int create_netkit(int mode, int policy, int peer_policy, int *ifindex, data = addattr_nest(&req.n, sizeof(req), IFLA_INFO_DATA); addattr32(&req.n, sizeof(req), IFLA_NETKIT_POLICY, policy); addattr32(&req.n, sizeof(req), IFLA_NETKIT_PEER_POLICY, peer_policy); + addattr32(&req.n, sizeof(req), IFLA_NETKIT_SCRUB, scrub); + addattr32(&req.n, sizeof(req), IFLA_NETKIT_PEER_SCRUB, peer_scrub); addattr32(&req.n, sizeof(req), IFLA_NETKIT_MODE, mode); addattr_nest_end(&req.n, data); addattr_nest_end(&req.n, linkinfo); @@ -118,9 +122,9 @@ static void destroy_netkit(void) static int __send_icmp(__u32 dest) { + int sock, ret, mark = MARK, prio = PRIO; struct sockaddr_in addr; struct icmphdr icmp; - int sock, ret; ret = write_sysctl("/proc/sys/net/ipv4/ping_group_range", "0 0"); if (!ASSERT_OK(ret, "write_sysctl(net.ipv4.ping_group_range)")) @@ -135,6 +139,15 @@ static int __send_icmp(__u32 dest) if (!ASSERT_OK(ret, "setsockopt(SO_BINDTODEVICE)")) goto out; + ret = setsockopt(sock, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)); + if (!ASSERT_OK(ret, "setsockopt(SO_MARK)")) + goto out; + + ret = setsockopt(sock, SOL_SOCKET, SO_PRIORITY, + &prio, sizeof(prio)); + if (!ASSERT_OK(ret, "setsockopt(SO_PRIORITY)")) + goto out; + memset(&addr, 0, sizeof(addr)); addr.sin_family = AF_INET; addr.sin_addr.s_addr = htonl(dest); @@ -171,7 +184,8 @@ void serial_test_tc_netkit_basic(void) int err, ifindex; err = create_netkit(NETKIT_L2, NETKIT_PASS, NETKIT_PASS, - &ifindex, false); + &ifindex, false, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT); if (err) return; @@ -285,7 +299,8 @@ static void serial_test_tc_netkit_multi_links_target(int mode, int target) int err, ifindex; err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, - &ifindex, false); + &ifindex, false, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT); if (err) return; @@ -413,7 +428,8 @@ static void serial_test_tc_netkit_multi_opts_target(int mode, int target) int err, ifindex; err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, - &ifindex, false); + &ifindex, false, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT); if (err) return; @@ -527,7 +543,8 @@ void serial_test_tc_netkit_device(void) int err, ifindex, ifindex2; err = create_netkit(NETKIT_L3, NETKIT_PASS, NETKIT_PASS, - &ifindex, true); + &ifindex, true, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT); if (err) return; @@ -638,7 +655,8 @@ static void serial_test_tc_netkit_neigh_links_target(int mode, int target) int err, ifindex; err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, - &ifindex, false); + &ifindex, false, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT); if (err) return; @@ -715,7 +733,8 @@ static void serial_test_tc_netkit_pkt_type_mode(int mode) struct bpf_link *link; err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, - &ifindex, true); + &ifindex, true, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT); if (err) return; @@ -779,3 +798,60 @@ void serial_test_tc_netkit_pkt_type(void) serial_test_tc_netkit_pkt_type_mode(NETKIT_L2); serial_test_tc_netkit_pkt_type_mode(NETKIT_L3); } + +static void serial_test_tc_netkit_scrub_type(int scrub) +{ + LIBBPF_OPTS(bpf_netkit_opts, optl); + struct test_tc_link *skel; + struct bpf_link *link; + int err, ifindex; + + err = create_netkit(NETKIT_L2, NETKIT_PASS, NETKIT_PASS, + &ifindex, false, scrub, scrub); + if (err) + return; + + skel = test_tc_link__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + goto cleanup; + + ASSERT_EQ(bpf_program__set_expected_attach_type(skel->progs.tc8, + BPF_NETKIT_PRIMARY), 0, "tc8_attach_type"); + + err = test_tc_link__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PRIMARY, 0); + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PEER, 0); + + ASSERT_EQ(skel->bss->seen_tc8, false, "seen_tc8"); + + link = bpf_program__attach_netkit(skel->progs.tc8, ifindex, &optl); + if (!ASSERT_OK_PTR(link, "link_attach")) + goto cleanup; + + skel->links.tc8 = link; + + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PRIMARY, 1); + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PEER, 0); + + tc_skel_reset_all_seen(skel); + ASSERT_EQ(send_icmp(), 0, "icmp_pkt"); + + ASSERT_EQ(skel->bss->seen_tc8, true, "seen_tc8"); + ASSERT_EQ(skel->bss->mark, scrub == NETKIT_SCRUB_NONE ? MARK : 0, "mark"); + ASSERT_EQ(skel->bss->prio, scrub == NETKIT_SCRUB_NONE ? PRIO : 0, "prio"); +cleanup: + test_tc_link__destroy(skel); + + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PRIMARY, 0); + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PEER, 0); + destroy_netkit(); +} + +void serial_test_tc_netkit_scrub(void) +{ + serial_test_tc_netkit_scrub_type(NETKIT_SCRUB_DEFAULT); + serial_test_tc_netkit_scrub_type(NETKIT_SCRUB_NONE); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c index 481626a875d1c..c7f74f068e788 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c @@ -2,35 +2,41 @@ #include #include #include +#include #include "test_xdp_with_cpumap_frags_helpers.skel.h" #include "test_xdp_with_cpumap_helpers.skel.h" #define IFINDEX_LO 1 +#define TEST_NS "cpu_attach_ns" static void test_xdp_with_cpumap_helpers(void) { - struct test_xdp_with_cpumap_helpers *skel; + struct test_xdp_with_cpumap_helpers *skel = NULL; struct bpf_prog_info info = {}; __u32 len = sizeof(info); struct bpf_cpumap_val val = { .qsize = 192, }; - int err, prog_fd, map_fd; + int err, prog_fd, prog_redir_fd, map_fd; + struct nstoken *nstoken = NULL; __u32 idx = 0; + SYS(out_close, "ip netns add %s", TEST_NS); + nstoken = open_netns(TEST_NS); + if (!ASSERT_OK_PTR(nstoken, "open_netns")) + goto out_close; + SYS(out_close, "ip link set dev lo up"); + skel = test_xdp_with_cpumap_helpers__open_and_load(); if (!ASSERT_OK_PTR(skel, "test_xdp_with_cpumap_helpers__open_and_load")) return; - prog_fd = bpf_program__fd(skel->progs.xdp_redir_prog); - err = bpf_xdp_attach(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE, NULL); + prog_redir_fd = bpf_program__fd(skel->progs.xdp_redir_prog); + err = bpf_xdp_attach(IFINDEX_LO, prog_redir_fd, XDP_FLAGS_SKB_MODE, NULL); if (!ASSERT_OK(err, "Generic attach of program with 8-byte CPUMAP")) goto out_close; - err = bpf_xdp_detach(IFINDEX_LO, XDP_FLAGS_SKB_MODE, NULL); - ASSERT_OK(err, "XDP program detach"); - prog_fd = bpf_program__fd(skel->progs.xdp_dummy_cm); map_fd = bpf_map__fd(skel->maps.cpu_map); err = bpf_prog_get_info_by_fd(prog_fd, &info, &len); @@ -45,6 +51,26 @@ static void test_xdp_with_cpumap_helpers(void) ASSERT_OK(err, "Read cpumap entry"); ASSERT_EQ(info.id, val.bpf_prog.id, "Match program id to cpumap entry prog_id"); + /* send a packet to trigger any potential bugs in there */ + char data[10] = {}; + DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, + .data_in = &data, + .data_size_in = 10, + .flags = BPF_F_TEST_XDP_LIVE_FRAMES, + .repeat = 1, + ); + err = bpf_prog_test_run_opts(prog_redir_fd, &opts); + ASSERT_OK(err, "XDP test run"); + + /* wait for the packets to be flushed, then check that redirect has been + * performed + */ + kern_sync_rcu(); + ASSERT_NEQ(skel->bss->redirect_count, 0, "redirected packets"); + + err = bpf_xdp_detach(IFINDEX_LO, XDP_FLAGS_SKB_MODE, NULL); + ASSERT_OK(err, "XDP program detach"); + /* can not attach BPF_XDP_CPUMAP program to a device */ err = bpf_xdp_attach(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE, NULL); if (!ASSERT_NEQ(err, 0, "Attach of BPF_XDP_CPUMAP program")) @@ -65,6 +91,8 @@ static void test_xdp_with_cpumap_helpers(void) ASSERT_NEQ(err, 0, "Add BPF_XDP program with frags to cpumap entry"); out_close: + close_netns(nstoken); + SYS_NOFAIL("ip netns del %s", TEST_NS); test_xdp_with_cpumap_helpers__destroy(skel); } @@ -111,7 +139,7 @@ static void test_xdp_with_cpumap_frags_helpers(void) test_xdp_with_cpumap_frags_helpers__destroy(skel); } -void serial_test_xdp_cpumap_attach(void) +void test_xdp_cpumap_attach(void) { if (test__start_subtest("CPUMAP with programs in entries")) test_xdp_with_cpumap_helpers(); diff --git a/tools/testing/selftests/bpf/progs/mptcp_bpf.h b/tools/testing/selftests/bpf/progs/mptcp_bpf.h new file mode 100644 index 0000000000000..3b188ccdcc404 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/mptcp_bpf.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __MPTCP_BPF_H__ +#define __MPTCP_BPF_H__ + +#include "bpf_experimental.h" + +/* list helpers from include/linux/list.h */ +static inline int list_is_head(const struct list_head *list, + const struct list_head *head) +{ + return list == head; +} + +#define list_entry(ptr, type, member) \ + container_of(ptr, type, member) + +#define list_first_entry(ptr, type, member) \ + list_entry((ptr)->next, type, member) + +#define list_next_entry(pos, member) \ + list_entry((pos)->member.next, typeof(*(pos)), member) + +#define list_entry_is_head(pos, head, member) \ + list_is_head(&pos->member, (head)) + +/* small difference: 'can_loop' has been added in the conditions */ +#define list_for_each_entry(pos, head, member) \ + for (pos = list_first_entry(head, typeof(*pos), member); \ + !list_entry_is_head(pos, head, member) && can_loop; \ + pos = list_next_entry(pos, member)) + +/* mptcp helpers from protocol.h */ +#define mptcp_for_each_subflow(__msk, __subflow) \ + list_for_each_entry(__subflow, &((__msk)->conn_list), node) + +static __always_inline struct sock * +mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow) +{ + return subflow->tcp_sock; +} + +#endif diff --git a/tools/testing/selftests/bpf/progs/mptcp_subflow.c b/tools/testing/selftests/bpf/progs/mptcp_subflow.c new file mode 100644 index 0000000000000..70302477e326e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/mptcp_subflow.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020, Tessares SA. */ +/* Copyright (c) 2024, Kylin Software */ + +/* vmlinux.h, bpf_helpers.h and other 'define' */ +#include "bpf_tracing_net.h" +#include "mptcp_bpf.h" + +char _license[] SEC("license") = "GPL"; + +char cc[TCP_CA_NAME_MAX] = "reno"; +int pid; + +/* Associate a subflow counter to each token */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); + __uint(max_entries, 100); +} mptcp_sf SEC(".maps"); + +SEC("sockops") +int mptcp_subflow(struct bpf_sock_ops *skops) +{ + __u32 init = 1, key, mark, *cnt; + struct mptcp_sock *msk; + struct bpf_sock *sk; + int err; + + if (skops->op != BPF_SOCK_OPS_TCP_CONNECT_CB) + return 1; + + sk = skops->sk; + if (!sk) + return 1; + + msk = bpf_skc_to_mptcp_sock(sk); + if (!msk) + return 1; + + key = msk->token; + cnt = bpf_map_lookup_elem(&mptcp_sf, &key); + if (cnt) { + /* A new subflow is added to an existing MPTCP connection */ + __sync_fetch_and_add(cnt, 1); + mark = *cnt; + } else { + /* A new MPTCP connection is just initiated and this is its primary subflow */ + bpf_map_update_elem(&mptcp_sf, &key, &init, BPF_ANY); + mark = init; + } + + /* Set the mark of the subflow's socket based on appearance order */ + err = bpf_setsockopt(skops, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)); + if (err < 0) + return 1; + if (mark == 2) + err = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION, cc, TCP_CA_NAME_MAX); + + return 1; +} + +static int _check_getsockopt_subflow_mark(struct mptcp_sock *msk, struct bpf_sockopt *ctx) +{ + struct mptcp_subflow_context *subflow; + int i = 0; + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk; + + ssk = mptcp_subflow_tcp_sock(bpf_core_cast(subflow, + struct mptcp_subflow_context)); + + if (ssk->sk_mark != ++i) { + ctx->retval = -2; + break; + } + } + + return 1; +} + +static int _check_getsockopt_subflow_cc(struct mptcp_sock *msk, struct bpf_sockopt *ctx) +{ + struct mptcp_subflow_context *subflow; + + mptcp_for_each_subflow(msk, subflow) { + struct inet_connection_sock *icsk; + struct sock *ssk; + + ssk = mptcp_subflow_tcp_sock(bpf_core_cast(subflow, + struct mptcp_subflow_context)); + icsk = bpf_core_cast(ssk, struct inet_connection_sock); + + if (ssk->sk_mark == 2 && + __builtin_memcmp(icsk->icsk_ca_ops->name, cc, TCP_CA_NAME_MAX)) { + ctx->retval = -2; + break; + } + } + + return 1; +} + +SEC("cgroup/getsockopt") +int _getsockopt_subflow(struct bpf_sockopt *ctx) +{ + struct bpf_sock *sk = ctx->sk; + struct mptcp_sock *msk; + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 1; + + if (!sk || sk->protocol != IPPROTO_MPTCP || + (!(ctx->level == SOL_SOCKET && ctx->optname == SO_MARK) && + !(ctx->level == SOL_TCP && ctx->optname == TCP_CONGESTION))) + return 1; + + msk = bpf_core_cast(sk, struct mptcp_sock); + if (msk->pm.subflows != 1) { + ctx->retval = -1; + return 1; + } + + if (ctx->optname == SO_MARK) + return _check_getsockopt_subflow_mark(msk, ctx); + return _check_getsockopt_subflow_cc(msk, ctx); +} diff --git a/tools/testing/selftests/bpf/progs/netns_cookie_prog.c b/tools/testing/selftests/bpf/progs/netns_cookie_prog.c index aeff3a4f92875..c6edf8dbefebe 100644 --- a/tools/testing/selftests/bpf/progs/netns_cookie_prog.c +++ b/tools/testing/selftests/bpf/progs/netns_cookie_prog.c @@ -27,6 +27,8 @@ struct { __type(value, __u64); } sock_map SEC(".maps"); +int tcx_init_netns_cookie, tcx_netns_cookie; + SEC("sockops") int get_netns_cookie_sockops(struct bpf_sock_ops *ctx) { @@ -81,4 +83,12 @@ int get_netns_cookie_sk_msg(struct sk_msg_md *msg) return 1; } +SEC("tcx/ingress") +int get_netns_cookie_tcx(struct __sk_buff *skb) +{ + tcx_init_netns_cookie = bpf_get_netns_cookie(NULL); + tcx_netns_cookie = bpf_get_netns_cookie(skb); + return TCX_PASS; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_tc_link.c b/tools/testing/selftests/bpf/progs/test_tc_link.c index ab3eae3d6af87..10d8259284994 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_link.c +++ b/tools/testing/selftests/bpf/progs/test_tc_link.c @@ -18,6 +18,7 @@ bool seen_tc4; bool seen_tc5; bool seen_tc6; bool seen_tc7; +bool seen_tc8; bool set_type; @@ -25,6 +26,8 @@ bool seen_eth; bool seen_host; bool seen_mcast; +int mark, prio; + SEC("tc/ingress") int tc1(struct __sk_buff *skb) { @@ -100,3 +103,12 @@ int tc7(struct __sk_buff *skb) seen_tc7 = true; return TCX_PASS; } + +SEC("tc/egress") +int tc8(struct __sk_buff *skb) +{ + seen_tc8 = true; + mark = skb->mark; + prio = skb->priority; + return TCX_PASS; +} diff --git a/tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c index 20ec6723df18a..3619239b01b74 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c @@ -12,10 +12,12 @@ struct { __uint(max_entries, 4); } cpu_map SEC(".maps"); +__u32 redirect_count = 0; + SEC("xdp") int xdp_redir_prog(struct xdp_md *ctx) { - return bpf_redirect_map(&cpu_map, 1, 0); + return bpf_redirect_map(&cpu_map, 0, 0); } SEC("xdp") @@ -27,6 +29,9 @@ int xdp_dummy_prog(struct xdp_md *ctx) SEC("xdp/cpumap") int xdp_dummy_cm(struct xdp_md *ctx) { + if (bpf_get_smp_processor_id() == 0) + redirect_count++; + if (ctx->ingress_ifindex == IFINDEX_LO) return XDP_DROP;