From efbaec89c642cd1d4977fc6df9923697e1598d4e Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 28 Aug 2018 07:42:32 +0000 Subject: [PATCH 01/28] bpf: remove duplicated include from syscall.c Remove duplicated include. Signed-off-by: YueHaibing Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8339d81cba1db..3c9636f03bb2e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ From b5d83fec732dc98e37b22e8b20a7943aa0bb55d9 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 28 Aug 2018 09:10:45 -0700 Subject: [PATCH 02/28] bpf: sockmap test remove shutdown() calls Currently, we do a shutdown(sk, SHUT_RDWR) on both peer sockets and a shutdown on the sender as well. However, this is incorrect and can occasionally cause issues if you happen to have bad timing. First peer1 or peer2 may still be in use depending on the test and timing. Second we really should only be closing the read side and/or write side depending on if the test is receiving or sending. But, really none of this is needed just remove the shutdown calls. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_sockmap.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 0c7d9e556b47d..a0e77c6bb0cf2 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -469,8 +469,6 @@ static int sendmsg_test(struct sockmap_options *opt) fprintf(stderr, "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n", iov_count, iov_buf, cnt, err); - shutdown(p2, SHUT_RDWR); - shutdown(p1, SHUT_RDWR); if (s.end.tv_sec - s.start.tv_sec) { sent_Bps = sentBps(s); recvd_Bps = recvdBps(s); @@ -500,7 +498,6 @@ static int sendmsg_test(struct sockmap_options *opt) fprintf(stderr, "msg_loop_tx: iov_count %i iov_buf %i cnt %i err %i\n", iov_count, iov_buf, cnt, err); - shutdown(c1, SHUT_RDWR); if (s.end.tv_sec - s.start.tv_sec) { sent_Bps = sentBps(s); recvd_Bps = recvdBps(s); From 7d2c6cfc5411207f1094e7ca5e63e711dc76d1ff Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 28 Aug 2018 09:10:50 -0700 Subject: [PATCH 03/28] bpf: use --cgroup in test_suite if supplied If the user supplies a --cgroup value in the arguments when running the test_suite go ahaead and run the self tests there. I use this to test with multiple cgroup users. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_sockmap.c | 53 +++++++++++++--------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index a0e77c6bb0cf2..ac7de38e5c63f 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -1345,9 +1345,9 @@ static int populate_progs(char *bpf_file) return 0; } -static int __test_suite(char *bpf_file) +static int __test_suite(int cg_fd, char *bpf_file) { - int cg_fd, err; + int err, cleanup = cg_fd; err = populate_progs(bpf_file); if (err < 0) { @@ -1355,22 +1355,24 @@ static int __test_suite(char *bpf_file) return err; } - if (setup_cgroup_environment()) { - fprintf(stderr, "ERROR: cgroup env failed\n"); - return -EINVAL; - } - - cg_fd = create_and_get_cgroup(CG_PATH); if (cg_fd < 0) { - fprintf(stderr, - "ERROR: (%i) open cg path failed: %s\n", - cg_fd, optarg); - return cg_fd; - } + if (setup_cgroup_environment()) { + fprintf(stderr, "ERROR: cgroup env failed\n"); + return -EINVAL; + } + + cg_fd = create_and_get_cgroup(CG_PATH); + if (cg_fd < 0) { + fprintf(stderr, + "ERROR: (%i) open cg path failed: %s\n", + cg_fd, optarg); + return cg_fd; + } - if (join_cgroup(CG_PATH)) { - fprintf(stderr, "ERROR: failed to join cgroup\n"); - return -EINVAL; + if (join_cgroup(CG_PATH)) { + fprintf(stderr, "ERROR: failed to join cgroup\n"); + return -EINVAL; + } } /* Tests basic commands and APIs with range of iov values */ @@ -1391,20 +1393,24 @@ static int __test_suite(char *bpf_file) out: printf("Summary: %i PASSED %i FAILED\n", passed, failed); - cleanup_cgroup_environment(); - close(cg_fd); + if (cleanup < 0) { + cleanup_cgroup_environment(); + close(cg_fd); + } return err; } -static int test_suite(void) +static int test_suite(int cg_fd) { int err; - err = __test_suite(BPF_SOCKMAP_FILENAME); + err = __test_suite(cg_fd, BPF_SOCKMAP_FILENAME); if (err) goto out; - err = __test_suite(BPF_SOCKHASH_FILENAME); + err = __test_suite(cg_fd, BPF_SOCKHASH_FILENAME); out: + if (cg_fd > -1) + close(cg_fd); return err; } @@ -1417,7 +1423,7 @@ int main(int argc, char **argv) int test = PING_PONG; if (argc < 2) - return test_suite(); + return test_suite(-1); while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:", long_options, &longindex)) != -1) { @@ -1483,6 +1489,9 @@ int main(int argc, char **argv) } } + if (argc <= 3 && cg_fd) + return test_suite(cg_fd); + if (!cg_fd) { fprintf(stderr, "%s requires cgroup option: --cgroup \n", argv[0]); From b0d1beeff2a97a0cf1965ea8f1d13b8973f22582 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 28 Aug 2018 14:44:25 +0200 Subject: [PATCH 04/28] xdp: implement convert_to_xdp_frame for MEM_TYPE_ZERO_COPY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit adds proper MEM_TYPE_ZERO_COPY support for convert_to_xdp_frame. Converting a MEM_TYPE_ZERO_COPY xdp_buff to an xdp_frame is done by transforming the MEM_TYPE_ZERO_COPY buffer into a MEM_TYPE_PAGE_ORDER0 frame. This is costly, and in the future it might make sense to implement a more sophisticated thread-safe alloc/free scheme for MEM_TYPE_ZERO_COPY, so that no allocation and copy is required in the fast-path. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 5 +++-- net/core/xdp.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/include/net/xdp.h b/include/net/xdp.h index 76b95256c2664..0d5c6fb4b2e26 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -91,6 +91,8 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame) frame->dev_rx = NULL; } +struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); + /* Convert xdp_buff to xdp_frame */ static inline struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) @@ -99,9 +101,8 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) int metasize; int headroom; - /* TODO: implement clone, copy, use "native" MEM_TYPE */ if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) - return NULL; + return xdp_convert_zc_to_xdp_frame(xdp); /* Assure headroom is available for storing info */ headroom = xdp->data - xdp->data_hard_start; diff --git a/net/core/xdp.c b/net/core/xdp.c index 89b6785cef2ad..be6cb2f0e7222 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -398,3 +398,42 @@ void xdp_attachment_setup(struct xdp_attachment_info *info, info->flags = bpf->flags; } EXPORT_SYMBOL_GPL(xdp_attachment_setup); + +struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) +{ + unsigned int metasize, headroom, totsize; + void *addr, *data_to_copy; + struct xdp_frame *xdpf; + struct page *page; + + /* Clone into a MEM_TYPE_PAGE_ORDER0 xdp_frame. */ + metasize = xdp_data_meta_unsupported(xdp) ? 0 : + xdp->data - xdp->data_meta; + headroom = xdp->data - xdp->data_hard_start; + totsize = xdp->data_end - xdp->data + metasize; + + if (sizeof(*xdpf) + totsize > PAGE_SIZE) + return NULL; + + page = dev_alloc_page(); + if (!page) + return NULL; + + addr = page_to_virt(page); + xdpf = addr; + memset(xdpf, 0, sizeof(*xdpf)); + + addr += sizeof(*xdpf); + data_to_copy = metasize ? xdp->data_meta : xdp->data; + memcpy(addr, data_to_copy, totsize); + + xdpf->data = addr + metasize; + xdpf->len = totsize - metasize; + xdpf->headroom = 0; + xdpf->metasize = metasize; + xdpf->mem.type = MEM_TYPE_PAGE_ORDER0; + + xdp_return_buff(xdp); + return xdpf; +} +EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame); From dce5bd6140a436e3348f6d13a1efb6e6c5a89acd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 28 Aug 2018 14:44:26 +0200 Subject: [PATCH 05/28] xdp: export xdp_rxq_info_unreg_mem_model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export __xdp_rxq_info_unreg_mem_model as xdp_rxq_info_unreg_mem_model, so it can be used from netdev drivers. Also, add additional checks for the memory type. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 1 + net/core/xdp.c | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/include/net/xdp.h b/include/net/xdp.h index 0d5c6fb4b2e26..0f25b3675c5cd 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -136,6 +136,7 @@ void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator); +void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq); /* Drivers not supporting XDP metadata can use this helper, which * rejects any room expansion for metadata as a result. diff --git a/net/core/xdp.c b/net/core/xdp.c index be6cb2f0e7222..654dbb19707e7 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -94,11 +94,21 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) kfree(xa); } -static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) +void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) { struct xdp_mem_allocator *xa; int id = xdp_rxq->mem.id; + if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { + WARN(1, "Missing register, driver bug"); + return; + } + + if (xdp_rxq->mem.type != MEM_TYPE_PAGE_POOL && + xdp_rxq->mem.type != MEM_TYPE_ZERO_COPY) { + return; + } + if (id == 0) return; @@ -110,6 +120,7 @@ static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) mutex_unlock(&mem_id_lock); } +EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model); void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) { @@ -119,7 +130,7 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG"); - __xdp_rxq_info_unreg_mem_model(xdp_rxq); + xdp_rxq_info_unreg_mem_model(xdp_rxq); xdp_rxq->reg_state = REG_STATE_UNREGISTERED; xdp_rxq->dev = NULL; From 902540342096af8a13351f6a22bfdd7a8e19ffd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 28 Aug 2018 14:44:27 +0200 Subject: [PATCH 06/28] xsk: expose xdp_umem_get_{data,dma} to drivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the xdp_umem_get_{data,dma} functions to include/net/xdp_sock.h, so that the upcoming zero-copy implementation in the Ethernet drivers can utilize them. Also, supply some dummy function implementations for CONFIG_XDP_SOCKETS=n configs. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 43 ++++++++++++++++++++++++++++++++++++++++++ net/xdp/xdp_umem.h | 10 ---------- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 7161856bcf9c7..56994ad1ab409 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -79,6 +79,16 @@ void xsk_umem_discard_addr(struct xdp_umem *umem); void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len); void xsk_umem_consume_tx_done(struct xdp_umem *umem); + +static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) +{ + return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1)); +} + +static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) +{ + return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1)); +} #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { @@ -98,6 +108,39 @@ static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) { return false; } + +static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) +{ + return NULL; +} + +static inline void xsk_umem_discard_addr(struct xdp_umem *umem) +{ +} + +static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) +{ +} + +static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, + u32 *len) +{ + return false; +} + +static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem) +{ +} + +static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) +{ + return NULL; +} + +static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) +{ + return 0; +} #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index f11560334f884..c8be1ad3eb88a 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -8,16 +8,6 @@ #include -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) -{ - return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1)); -} - -static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) -{ - return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1)); -} - int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, u32 queue_id, u16 flags); bool xdp_umem_validate_queues(struct xdp_umem *umem); From 6c5c9581044dd6e0cd284ab653502fb9264f08b6 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Tue, 28 Aug 2018 14:44:28 +0200 Subject: [PATCH 07/28] net: add napi_if_scheduled_mark_missed The function napi_if_scheduled_mark_missed is used to check if the NAPI context is scheduled, if so set NAPIF_STATE_MISSED and return true. Used by the AF_XDP zero-copy i40e Tx code implementation in order to make sure that irq affinity is honored by the napi context. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/linux/netdevice.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ca5ab98053c8d..4271f6b4e4197 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -535,6 +535,32 @@ static inline void napi_synchronize(const struct napi_struct *n) barrier(); } +/** + * napi_if_scheduled_mark_missed - if napi is running, set the + * NAPIF_STATE_MISSED + * @n: NAPI context + * + * If napi is running, set the NAPIF_STATE_MISSED, and return true if + * NAPI is scheduled. + **/ +static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n) +{ + unsigned long val, new; + + do { + val = READ_ONCE(n->state); + if (val & NAPIF_STATE_DISABLE) + return true; + + if (!(val & NAPIF_STATE_SCHED)) + return false; + + new = val | NAPIF_STATE_MISSED; + } while (cmpxchg(&n->state, val, new) != val); + + return true; +} + enum netdev_queue_state_t { __QUEUE_STATE_DRV_XOFF, __QUEUE_STATE_STACK_XOFF, From 123cecd427b6c1af5bdc3a90131ae85581854319 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 28 Aug 2018 14:44:29 +0200 Subject: [PATCH 08/28] i40e: added queue pair disable/enable functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add functions for queue pair enable/disable. Instead of resetting the whole device, only the affected queue pair is disabled or enabled. This plumbing is used in a later commit, when zero-copy AF_XDP support is introduced. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/intel/i40e/i40e_main.c | 250 ++++++++++++++++++++ 1 file changed, 250 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index ac685ad4d8773..d8b5a6af72bdd 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11827,6 +11827,256 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi, return 0; } +/** + * i40e_enter_busy_conf - Enters busy config state + * @vsi: vsi + * + * Returns 0 on success, <0 for failure. + **/ +static int i40e_enter_busy_conf(struct i40e_vsi *vsi) +{ + struct i40e_pf *pf = vsi->back; + int timeout = 50; + + while (test_and_set_bit(__I40E_CONFIG_BUSY, pf->state)) { + timeout--; + if (!timeout) + return -EBUSY; + usleep_range(1000, 2000); + } + + return 0; +} + +/** + * i40e_exit_busy_conf - Exits busy config state + * @vsi: vsi + **/ +static void i40e_exit_busy_conf(struct i40e_vsi *vsi) +{ + struct i40e_pf *pf = vsi->back; + + clear_bit(__I40E_CONFIG_BUSY, pf->state); +} + +/** + * i40e_queue_pair_reset_stats - Resets all statistics for a queue pair + * @vsi: vsi + * @queue_pair: queue pair + **/ +static void i40e_queue_pair_reset_stats(struct i40e_vsi *vsi, int queue_pair) +{ + memset(&vsi->rx_rings[queue_pair]->rx_stats, 0, + sizeof(vsi->rx_rings[queue_pair]->rx_stats)); + memset(&vsi->tx_rings[queue_pair]->stats, 0, + sizeof(vsi->tx_rings[queue_pair]->stats)); + if (i40e_enabled_xdp_vsi(vsi)) { + memset(&vsi->xdp_rings[queue_pair]->stats, 0, + sizeof(vsi->xdp_rings[queue_pair]->stats)); + } +} + +/** + * i40e_queue_pair_clean_rings - Cleans all the rings of a queue pair + * @vsi: vsi + * @queue_pair: queue pair + **/ +static void i40e_queue_pair_clean_rings(struct i40e_vsi *vsi, int queue_pair) +{ + i40e_clean_tx_ring(vsi->tx_rings[queue_pair]); + if (i40e_enabled_xdp_vsi(vsi)) + i40e_clean_tx_ring(vsi->xdp_rings[queue_pair]); + i40e_clean_rx_ring(vsi->rx_rings[queue_pair]); +} + +/** + * i40e_queue_pair_toggle_napi - Enables/disables NAPI for a queue pair + * @vsi: vsi + * @queue_pair: queue pair + * @enable: true for enable, false for disable + **/ +static void i40e_queue_pair_toggle_napi(struct i40e_vsi *vsi, int queue_pair, + bool enable) +{ + struct i40e_ring *rxr = vsi->rx_rings[queue_pair]; + struct i40e_q_vector *q_vector = rxr->q_vector; + + if (!vsi->netdev) + return; + + /* All rings in a qp belong to the same qvector. */ + if (q_vector->rx.ring || q_vector->tx.ring) { + if (enable) + napi_enable(&q_vector->napi); + else + napi_disable(&q_vector->napi); + } +} + +/** + * i40e_queue_pair_toggle_rings - Enables/disables all rings for a queue pair + * @vsi: vsi + * @queue_pair: queue pair + * @enable: true for enable, false for disable + * + * Returns 0 on success, <0 on failure. + **/ +static int i40e_queue_pair_toggle_rings(struct i40e_vsi *vsi, int queue_pair, + bool enable) +{ + struct i40e_pf *pf = vsi->back; + int pf_q, ret = 0; + + pf_q = vsi->base_queue + queue_pair; + ret = i40e_control_wait_tx_q(vsi->seid, pf, pf_q, + false /*is xdp*/, enable); + if (ret) { + dev_info(&pf->pdev->dev, + "VSI seid %d Tx ring %d %sable timeout\n", + vsi->seid, pf_q, (enable ? "en" : "dis")); + return ret; + } + + i40e_control_rx_q(pf, pf_q, enable); + ret = i40e_pf_rxq_wait(pf, pf_q, enable); + if (ret) { + dev_info(&pf->pdev->dev, + "VSI seid %d Rx ring %d %sable timeout\n", + vsi->seid, pf_q, (enable ? "en" : "dis")); + return ret; + } + + /* Due to HW errata, on Rx disable only, the register can + * indicate done before it really is. Needs 50ms to be sure + */ + if (!enable) + mdelay(50); + + if (!i40e_enabled_xdp_vsi(vsi)) + return ret; + + ret = i40e_control_wait_tx_q(vsi->seid, pf, + pf_q + vsi->alloc_queue_pairs, + true /*is xdp*/, enable); + if (ret) { + dev_info(&pf->pdev->dev, + "VSI seid %d XDP Tx ring %d %sable timeout\n", + vsi->seid, pf_q, (enable ? "en" : "dis")); + } + + return ret; +} + +/** + * i40e_queue_pair_enable_irq - Enables interrupts for a queue pair + * @vsi: vsi + * @queue_pair: queue_pair + **/ +static void i40e_queue_pair_enable_irq(struct i40e_vsi *vsi, int queue_pair) +{ + struct i40e_ring *rxr = vsi->rx_rings[queue_pair]; + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + + /* All rings in a qp belong to the same qvector. */ + if (pf->flags & I40E_FLAG_MSIX_ENABLED) + i40e_irq_dynamic_enable(vsi, rxr->q_vector->v_idx); + else + i40e_irq_dynamic_enable_icr0(pf); + + i40e_flush(hw); +} + +/** + * i40e_queue_pair_disable_irq - Disables interrupts for a queue pair + * @vsi: vsi + * @queue_pair: queue_pair + **/ +static void i40e_queue_pair_disable_irq(struct i40e_vsi *vsi, int queue_pair) +{ + struct i40e_ring *rxr = vsi->rx_rings[queue_pair]; + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + + /* For simplicity, instead of removing the qp interrupt causes + * from the interrupt linked list, we simply disable the interrupt, and + * leave the list intact. + * + * All rings in a qp belong to the same qvector. + */ + if (pf->flags & I40E_FLAG_MSIX_ENABLED) { + u32 intpf = vsi->base_vector + rxr->q_vector->v_idx; + + wr32(hw, I40E_PFINT_DYN_CTLN(intpf - 1), 0); + i40e_flush(hw); + synchronize_irq(pf->msix_entries[intpf].vector); + } else { + /* Legacy and MSI mode - this stops all interrupt handling */ + wr32(hw, I40E_PFINT_ICR0_ENA, 0); + wr32(hw, I40E_PFINT_DYN_CTL0, 0); + i40e_flush(hw); + synchronize_irq(pf->pdev->irq); + } +} + +/** + * i40e_queue_pair_disable - Disables a queue pair + * @vsi: vsi + * @queue_pair: queue pair + * + * Returns 0 on success, <0 on failure. + **/ +int i40e_queue_pair_disable(struct i40e_vsi *vsi, int queue_pair) +{ + int err; + + err = i40e_enter_busy_conf(vsi); + if (err) + return err; + + i40e_queue_pair_disable_irq(vsi, queue_pair); + err = i40e_queue_pair_toggle_rings(vsi, queue_pair, false /* off */); + i40e_queue_pair_toggle_napi(vsi, queue_pair, false /* off */); + i40e_queue_pair_clean_rings(vsi, queue_pair); + i40e_queue_pair_reset_stats(vsi, queue_pair); + + return err; +} + +/** + * i40e_queue_pair_enable - Enables a queue pair + * @vsi: vsi + * @queue_pair: queue pair + * + * Returns 0 on success, <0 on failure. + **/ +int i40e_queue_pair_enable(struct i40e_vsi *vsi, int queue_pair) +{ + int err; + + err = i40e_configure_tx_ring(vsi->tx_rings[queue_pair]); + if (err) + return err; + + if (i40e_enabled_xdp_vsi(vsi)) { + err = i40e_configure_tx_ring(vsi->xdp_rings[queue_pair]); + if (err) + return err; + } + + err = i40e_configure_rx_ring(vsi->rx_rings[queue_pair]); + if (err) + return err; + + err = i40e_queue_pair_toggle_rings(vsi, queue_pair, true /* on */); + i40e_queue_pair_toggle_napi(vsi, queue_pair, true /* on */); + i40e_queue_pair_enable_irq(vsi, queue_pair); + + i40e_exit_busy_conf(vsi); + + return err; +} + /** * i40e_xdp - implements ndo_bpf for i40e * @dev: netdevice From 6d7aad1da2791cc0863a5b8068dcdb8c7536519d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 28 Aug 2018 14:44:30 +0200 Subject: [PATCH 09/28] i40e: refactor Rx path for re-use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In this commit, the Rx path is refactored some, as a step torwards the introduction AF_XDP Rx zero-copy. The page re-use counter is moved into the i40e_reuse_rx_page, instead of bumping the counter in many places. The Rx buffer page clearing is moved for better readability. Lastely, functions to update statistics and bump the XDP Tx ring are introduced. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 111 ++++++++++++++------ 1 file changed, 77 insertions(+), 34 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index b5042d1a63c0b..b5a2cfeb68a54 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1244,6 +1244,11 @@ static void i40e_reuse_rx_page(struct i40e_ring *rx_ring, new_buff->page = old_buff->page; new_buff->page_offset = old_buff->page_offset; new_buff->pagecnt_bias = old_buff->pagecnt_bias; + + rx_ring->rx_stats.page_reuse_count++; + + /* clear contents of buffer_info */ + old_buff->page = NULL; } /** @@ -1266,7 +1271,7 @@ static inline bool i40e_rx_is_programming_status(u64 qw) } /** - * i40e_clean_programming_status - clean the programming status descriptor + * i40e_clean_programming_status - try clean the programming status descriptor * @rx_ring: the rx ring that has this descriptor * @rx_desc: the rx descriptor written back by HW * @qw: qword representing status_error_len in CPU ordering @@ -1275,15 +1280,22 @@ static inline bool i40e_rx_is_programming_status(u64 qw) * status being successful or not and take actions accordingly. FCoE should * handle its context/filter programming/invalidation status and take actions. * + * Returns an i40e_rx_buffer to reuse if the cleanup occurred, otherwise NULL. **/ -static void i40e_clean_programming_status(struct i40e_ring *rx_ring, - union i40e_rx_desc *rx_desc, - u64 qw) +static struct i40e_rx_buffer *i40e_clean_programming_status( + struct i40e_ring *rx_ring, + union i40e_rx_desc *rx_desc, + u64 qw) { struct i40e_rx_buffer *rx_buffer; - u32 ntc = rx_ring->next_to_clean; + u32 ntc; u8 id; + if (!i40e_rx_is_programming_status(qw)) + return NULL; + + ntc = rx_ring->next_to_clean; + /* fetch, update, and store next to clean */ rx_buffer = &rx_ring->rx_bi[ntc++]; ntc = (ntc < rx_ring->count) ? ntc : 0; @@ -1291,18 +1303,13 @@ static void i40e_clean_programming_status(struct i40e_ring *rx_ring, prefetch(I40E_RX_DESC(rx_ring, ntc)); - /* place unused page back on the ring */ - i40e_reuse_rx_page(rx_ring, rx_buffer); - rx_ring->rx_stats.page_reuse_count++; - - /* clear contents of buffer_info */ - rx_buffer->page = NULL; - id = (qw & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >> I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT; if (id == I40E_RX_PROG_STATUS_DESC_FD_FILTER_STATUS) i40e_fd_handle_status(rx_ring, rx_desc, id); + + return rx_buffer; } /** @@ -2152,7 +2159,6 @@ static void i40e_put_rx_buffer(struct i40e_ring *rx_ring, if (i40e_can_reuse_rx_page(rx_buffer)) { /* hand second half of page back to the ring */ i40e_reuse_rx_page(rx_ring, rx_buffer); - rx_ring->rx_stats.page_reuse_count++; } else { /* we are not reusing the buffer so unmap it */ dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, @@ -2160,10 +2166,9 @@ static void i40e_put_rx_buffer(struct i40e_ring *rx_ring, DMA_FROM_DEVICE, I40E_RX_DMA_ATTR); __page_frag_cache_drain(rx_buffer->page, rx_buffer->pagecnt_bias); + /* clear contents of buffer_info */ + rx_buffer->page = NULL; } - - /* clear contents of buffer_info */ - rx_buffer->page = NULL; } /** @@ -2287,6 +2292,12 @@ static void i40e_rx_buffer_flip(struct i40e_ring *rx_ring, #endif } +/** + * i40e_xdp_ring_update_tail - Updates the XDP Tx ring tail register + * @xdp_ring: XDP Tx ring + * + * This function updates the XDP Tx ring tail register. + **/ static inline void i40e_xdp_ring_update_tail(struct i40e_ring *xdp_ring) { /* Force memory writes to complete before letting h/w @@ -2296,6 +2307,49 @@ static inline void i40e_xdp_ring_update_tail(struct i40e_ring *xdp_ring) writel_relaxed(xdp_ring->next_to_use, xdp_ring->tail); } +/** + * i40e_update_rx_stats - Update Rx ring statistics + * @rx_ring: rx descriptor ring + * @total_rx_bytes: number of bytes received + * @total_rx_packets: number of packets received + * + * This function updates the Rx ring statistics. + **/ +static void i40e_update_rx_stats(struct i40e_ring *rx_ring, + unsigned int total_rx_bytes, + unsigned int total_rx_packets) +{ + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->stats.packets += total_rx_packets; + rx_ring->stats.bytes += total_rx_bytes; + u64_stats_update_end(&rx_ring->syncp); + rx_ring->q_vector->rx.total_packets += total_rx_packets; + rx_ring->q_vector->rx.total_bytes += total_rx_bytes; +} + +/** + * i40e_finalize_xdp_rx - Bump XDP Tx tail and/or flush redirect map + * @rx_ring: Rx ring + * @xdp_res: Result of the receive batch + * + * This function bumps XDP Tx tail and/or flush redirect map, and + * should be called when a batch of packets has been processed in the + * napi loop. + **/ +static void i40e_finalize_xdp_rx(struct i40e_ring *rx_ring, + unsigned int xdp_res) +{ + if (xdp_res & I40E_XDP_REDIR) + xdp_do_flush_map(); + + if (xdp_res & I40E_XDP_TX) { + struct i40e_ring *xdp_ring = + rx_ring->vsi->xdp_rings[rx_ring->queue_index]; + + i40e_xdp_ring_update_tail(xdp_ring); + } +} + /** * i40e_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf * @rx_ring: rx descriptor ring to transact packets on @@ -2349,11 +2403,14 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) */ dma_rmb(); - if (unlikely(i40e_rx_is_programming_status(qword))) { - i40e_clean_programming_status(rx_ring, rx_desc, qword); + rx_buffer = i40e_clean_programming_status(rx_ring, rx_desc, + qword); + if (unlikely(rx_buffer)) { + i40e_reuse_rx_page(rx_ring, rx_buffer); cleaned_count++; continue; } + size = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >> I40E_RXD_QW1_LENGTH_PBUF_SHIFT; if (!size) @@ -2432,24 +2489,10 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) total_rx_packets++; } - if (xdp_xmit & I40E_XDP_REDIR) - xdp_do_flush_map(); - - if (xdp_xmit & I40E_XDP_TX) { - struct i40e_ring *xdp_ring = - rx_ring->vsi->xdp_rings[rx_ring->queue_index]; - - i40e_xdp_ring_update_tail(xdp_ring); - } - + i40e_finalize_xdp_rx(rx_ring, xdp_xmit); rx_ring->skb = skb; - u64_stats_update_begin(&rx_ring->syncp); - rx_ring->stats.packets += total_rx_packets; - rx_ring->stats.bytes += total_rx_bytes; - u64_stats_update_end(&rx_ring->syncp); - rx_ring->q_vector->rx.total_packets += total_rx_packets; - rx_ring->q_vector->rx.total_bytes += total_rx_bytes; + i40e_update_rx_stats(rx_ring, total_rx_bytes, total_rx_packets); /* guarantee a trip back through this routine if there was a failure */ return failure ? budget : (int)total_rx_packets; From 20a739dbef28fb3c17de2789a3d7847efec6a3ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 28 Aug 2018 14:44:31 +0200 Subject: [PATCH 10/28] i40e: move common Rx functions to i40e_txrx_common.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch prepares for the upcoming zero-copy Rx functionality, by moving/changing linkage of common functions, used both by the regular path and zero-copy path. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 33 ++++++++----------- .../ethernet/intel/i40e/i40e_txrx_common.h | 31 +++++++++++++++++ 2 files changed, 44 insertions(+), 20 deletions(-) create mode 100644 drivers/net/ethernet/intel/i40e/i40e_txrx_common.h diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index b5a2cfeb68a54..878fb4b47484b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -8,6 +8,7 @@ #include "i40e.h" #include "i40e_trace.h" #include "i40e_prototype.h" +#include "i40e_txrx_common.h" static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size, u32 td_tag) @@ -536,8 +537,8 @@ int i40e_add_del_fdir(struct i40e_vsi *vsi, * This is used to verify if the FD programming or invalidation * requested by SW to the HW is successful or not and take actions accordingly. **/ -static void i40e_fd_handle_status(struct i40e_ring *rx_ring, - union i40e_rx_desc *rx_desc, u8 prog_id) +void i40e_fd_handle_status(struct i40e_ring *rx_ring, + union i40e_rx_desc *rx_desc, u8 prog_id) { struct i40e_pf *pf = rx_ring->vsi->back; struct pci_dev *pdev = pf->pdev; @@ -1282,7 +1283,7 @@ static inline bool i40e_rx_is_programming_status(u64 qw) * * Returns an i40e_rx_buffer to reuse if the cleanup occurred, otherwise NULL. **/ -static struct i40e_rx_buffer *i40e_clean_programming_status( +struct i40e_rx_buffer *i40e_clean_programming_status( struct i40e_ring *rx_ring, union i40e_rx_desc *rx_desc, u64 qw) @@ -1499,7 +1500,7 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring) * @rx_ring: ring to bump * @val: new head index **/ -static inline void i40e_release_rx_desc(struct i40e_ring *rx_ring, u32 val) +void i40e_release_rx_desc(struct i40e_ring *rx_ring, u32 val) { rx_ring->next_to_use = val; @@ -1583,8 +1584,8 @@ static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring, * @skb: packet to send up * @vlan_tag: vlan tag for packet **/ -static void i40e_receive_skb(struct i40e_ring *rx_ring, - struct sk_buff *skb, u16 vlan_tag) +void i40e_receive_skb(struct i40e_ring *rx_ring, + struct sk_buff *skb, u16 vlan_tag) { struct i40e_q_vector *q_vector = rx_ring->q_vector; @@ -1811,7 +1812,6 @@ static inline void i40e_rx_hash(struct i40e_ring *ring, * order to populate the hash, checksum, VLAN, protocol, and * other fields within the skb. **/ -static inline void i40e_process_skb_fields(struct i40e_ring *rx_ring, union i40e_rx_desc *rx_desc, struct sk_buff *skb, u8 rx_ptype) @@ -2204,16 +2204,10 @@ static bool i40e_is_non_eop(struct i40e_ring *rx_ring, return true; } -#define I40E_XDP_PASS 0 -#define I40E_XDP_CONSUMED BIT(0) -#define I40E_XDP_TX BIT(1) -#define I40E_XDP_REDIR BIT(2) - static int i40e_xmit_xdp_ring(struct xdp_frame *xdpf, struct i40e_ring *xdp_ring); -static int i40e_xmit_xdp_tx_ring(struct xdp_buff *xdp, - struct i40e_ring *xdp_ring) +int i40e_xmit_xdp_tx_ring(struct xdp_buff *xdp, struct i40e_ring *xdp_ring) { struct xdp_frame *xdpf = convert_to_xdp_frame(xdp); @@ -2298,7 +2292,7 @@ static void i40e_rx_buffer_flip(struct i40e_ring *rx_ring, * * This function updates the XDP Tx ring tail register. **/ -static inline void i40e_xdp_ring_update_tail(struct i40e_ring *xdp_ring) +void i40e_xdp_ring_update_tail(struct i40e_ring *xdp_ring) { /* Force memory writes to complete before letting h/w * know there are new descriptors to fetch. @@ -2315,9 +2309,9 @@ static inline void i40e_xdp_ring_update_tail(struct i40e_ring *xdp_ring) * * This function updates the Rx ring statistics. **/ -static void i40e_update_rx_stats(struct i40e_ring *rx_ring, - unsigned int total_rx_bytes, - unsigned int total_rx_packets) +void i40e_update_rx_stats(struct i40e_ring *rx_ring, + unsigned int total_rx_bytes, + unsigned int total_rx_packets) { u64_stats_update_begin(&rx_ring->syncp); rx_ring->stats.packets += total_rx_packets; @@ -2336,8 +2330,7 @@ static void i40e_update_rx_stats(struct i40e_ring *rx_ring, * should be called when a batch of packets has been processed in the * napi loop. **/ -static void i40e_finalize_xdp_rx(struct i40e_ring *rx_ring, - unsigned int xdp_res) +void i40e_finalize_xdp_rx(struct i40e_ring *rx_ring, unsigned int xdp_res) { if (xdp_res & I40E_XDP_REDIR) xdp_do_flush_map(); diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h new file mode 100644 index 0000000000000..2bd5187fcd669 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2018 Intel Corporation. */ + +#ifndef I40E_TXRX_COMMON_ +#define I40E_TXRX_COMMON_ + +void i40e_fd_handle_status(struct i40e_ring *rx_ring, + union i40e_rx_desc *rx_desc, u8 prog_id); +int i40e_xmit_xdp_tx_ring(struct xdp_buff *xdp, struct i40e_ring *xdp_ring); +struct i40e_rx_buffer *i40e_clean_programming_status( + struct i40e_ring *rx_ring, + union i40e_rx_desc *rx_desc, + u64 qw); +void i40e_process_skb_fields(struct i40e_ring *rx_ring, + union i40e_rx_desc *rx_desc, struct sk_buff *skb, + u8 rx_ptype); +void i40e_receive_skb(struct i40e_ring *rx_ring, + struct sk_buff *skb, u16 vlan_tag); +void i40e_xdp_ring_update_tail(struct i40e_ring *xdp_ring); +void i40e_update_rx_stats(struct i40e_ring *rx_ring, + unsigned int total_rx_bytes, + unsigned int total_rx_packets); +void i40e_finalize_xdp_rx(struct i40e_ring *rx_ring, unsigned int xdp_res); +void i40e_release_rx_desc(struct i40e_ring *rx_ring, u32 val); + +#define I40E_XDP_PASS 0 +#define I40E_XDP_CONSUMED BIT(0) +#define I40E_XDP_TX BIT(1) +#define I40E_XDP_REDIR BIT(2) + +#endif /* I40E_TXRX_COMMON_ */ From 0a714186d3c0f7c563a03537f98716457c1f5ae0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 28 Aug 2018 14:44:32 +0200 Subject: [PATCH 11/28] i40e: add AF_XDP zero-copy Rx support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds zero-copy Rx support for AF_XDP sockets. Instead of allocating buffers of type MEM_TYPE_PAGE_SHARED, the Rx frames are allocated as MEM_TYPE_ZERO_COPY when AF_XDP is enabled for a certain queue. All AF_XDP specific functions are added to a new file, i40e_xsk.c. Note that when AF_XDP zero-copy is enabled, the XDP action XDP_PASS will allocate a new buffer and copy the zero-copy frame prior passing it to the kernel stack. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/intel/i40e/Makefile | 3 +- drivers/net/ethernet/intel/i40e/i40e.h | 19 + drivers/net/ethernet/intel/i40e/i40e_main.c | 53 +- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 9 +- drivers/net/ethernet/intel/i40e/i40e_txrx.h | 20 +- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 661 ++++++++++++++++++++ drivers/net/ethernet/intel/i40e/i40e_xsk.h | 21 + 7 files changed, 775 insertions(+), 11 deletions(-) create mode 100644 drivers/net/ethernet/intel/i40e/i40e_xsk.c create mode 100644 drivers/net/ethernet/intel/i40e/i40e_xsk.h diff --git a/drivers/net/ethernet/intel/i40e/Makefile b/drivers/net/ethernet/intel/i40e/Makefile index 14397e7e9925e..50590e8d1fd13 100644 --- a/drivers/net/ethernet/intel/i40e/Makefile +++ b/drivers/net/ethernet/intel/i40e/Makefile @@ -22,6 +22,7 @@ i40e-objs := i40e_main.o \ i40e_txrx.o \ i40e_ptp.o \ i40e_client.o \ - i40e_virtchnl_pf.o + i40e_virtchnl_pf.o \ + i40e_xsk.o i40e-$(CONFIG_I40E_DCB) += i40e_dcb.o i40e_dcb_nl.o diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index 7a80652e25008..876cac317e795 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -786,6 +786,11 @@ struct i40e_vsi { /* VSI specific handlers */ irqreturn_t (*irq_handler)(int irq, void *data); + + /* AF_XDP zero-copy */ + struct xdp_umem **xsk_umems; + u16 num_xsk_umems_used; + u16 num_xsk_umems; } ____cacheline_internodealigned_in_smp; struct i40e_netdev_priv { @@ -1090,6 +1095,20 @@ static inline bool i40e_enabled_xdp_vsi(struct i40e_vsi *vsi) return !!vsi->xdp_prog; } +static inline struct xdp_umem *i40e_xsk_umem(struct i40e_ring *ring) +{ + bool xdp_on = i40e_enabled_xdp_vsi(ring->vsi); + int qid = ring->queue_index; + + if (ring_is_xdp(ring)) + qid -= ring->vsi->alloc_queue_pairs; + + if (!ring->vsi->xsk_umems || !ring->vsi->xsk_umems[qid] || !xdp_on) + return NULL; + + return ring->vsi->xsk_umems[qid]; +} + int i40e_create_queue_channel(struct i40e_vsi *vsi, struct i40e_channel *ch); int i40e_set_bw_limit(struct i40e_vsi *vsi, u16 seid, u64 max_tx_rate); int i40e_add_del_cloud_filter(struct i40e_vsi *vsi, diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index d8b5a6af72bdd..848eea7c84dbd 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -9,7 +9,9 @@ /* Local includes */ #include "i40e.h" #include "i40e_diag.h" +#include "i40e_xsk.h" #include +#include /* All i40e tracepoints are defined by the include below, which * must be included exactly once across the whole kernel with * CREATE_TRACE_POINTS defined @@ -3181,13 +3183,46 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) struct i40e_hw *hw = &vsi->back->hw; struct i40e_hmc_obj_rxq rx_ctx; i40e_status err = 0; + bool ok; + int ret; bitmap_zero(ring->state, __I40E_RING_STATE_NBITS); /* clear the context structure first */ memset(&rx_ctx, 0, sizeof(rx_ctx)); - ring->rx_buf_len = vsi->rx_buf_len; + if (ring->vsi->type == I40E_VSI_MAIN) + xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq); + + ring->xsk_umem = i40e_xsk_umem(ring); + if (ring->xsk_umem) { + ring->rx_buf_len = ring->xsk_umem->chunk_size_nohr - + XDP_PACKET_HEADROOM; + /* For AF_XDP ZC, we disallow packets to span on + * multiple buffers, thus letting us skip that + * handling in the fast-path. + */ + chain_len = 1; + ring->zca.free = i40e_zca_free; + ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, + MEM_TYPE_ZERO_COPY, + &ring->zca); + if (ret) + return ret; + dev_info(&vsi->back->pdev->dev, + "Registered XDP mem model MEM_TYPE_ZERO_COPY on Rx ring %d\n", + ring->queue_index); + + } else { + ring->rx_buf_len = vsi->rx_buf_len; + if (ring->vsi->type == I40E_VSI_MAIN) { + ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, + MEM_TYPE_PAGE_SHARED, + NULL); + if (ret) + return ret; + } + } rx_ctx.dbuff = DIV_ROUND_UP(ring->rx_buf_len, BIT_ULL(I40E_RXQ_CTX_DBUFF_SHIFT)); @@ -3243,7 +3278,15 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) ring->tail = hw->hw_addr + I40E_QRX_TAIL(pf_q); writel(0, ring->tail); - i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring)); + ok = ring->xsk_umem ? + i40e_alloc_rx_buffers_zc(ring, I40E_DESC_UNUSED(ring)) : + !i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring)); + if (!ok) { + dev_info(&vsi->back->pdev->dev, + "Failed allocate some buffers on %sRx ring %d (pf_q %d)\n", + ring->xsk_umem ? "UMEM enabled " : "", + ring->queue_index, pf_q); + } return 0; } @@ -12097,6 +12140,12 @@ static int i40e_xdp(struct net_device *dev, case XDP_QUERY_PROG: xdp->prog_id = vsi->xdp_prog ? vsi->xdp_prog->aux->id : 0; return 0; + case XDP_QUERY_XSK_UMEM: + return i40e_xsk_umem_query(vsi, &xdp->xsk.umem, + xdp->xsk.queue_id); + case XDP_SETUP_XSK_UMEM: + return i40e_xsk_umem_setup(vsi, xdp->xsk.umem, + xdp->xsk.queue_id); default: return -EINVAL; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 878fb4b47484b..2c4d179ffebfd 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -9,6 +9,7 @@ #include "i40e_trace.h" #include "i40e_prototype.h" #include "i40e_txrx_common.h" +#include "i40e_xsk.h" static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size, u32 td_tag) @@ -1380,6 +1381,9 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring) rx_ring->skb = NULL; } + if (rx_ring->xsk_umem) + goto skip_free; + /* Free all the Rx ring sk_buffs */ for (i = 0; i < rx_ring->count; i++) { struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i]; @@ -1408,6 +1412,7 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring) rx_bi->page_offset = 0; } +skip_free: bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count; memset(rx_ring->rx_bi, 0, bi_size); @@ -2641,7 +2646,9 @@ int i40e_napi_poll(struct napi_struct *napi, int budget) budget_per_ring = max(budget/q_vector->num_ringpairs, 1); i40e_for_each_ring(ring, q_vector->rx) { - int cleaned = i40e_clean_rx_irq(ring, budget_per_ring); + int cleaned = ring->xsk_umem ? + i40e_clean_rx_irq_zc(ring, budget_per_ring) : + i40e_clean_rx_irq(ring, budget_per_ring); work_done += cleaned; /* if we clean as many as budgeted, we must not be done */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h index bb04f6a731fe9..100e92d2982f2 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h @@ -296,13 +296,17 @@ struct i40e_tx_buffer { struct i40e_rx_buffer { dma_addr_t dma; - struct page *page; -#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536) - __u32 page_offset; -#else - __u16 page_offset; -#endif - __u16 pagecnt_bias; + union { + struct { + struct page *page; + __u32 page_offset; + __u16 pagecnt_bias; + }; + struct { + void *addr; + u64 handle; + }; + }; }; struct i40e_queue_stats { @@ -414,6 +418,8 @@ struct i40e_ring { struct i40e_channel *ch; struct xdp_rxq_info xdp_rxq; + struct xdp_umem *xsk_umem; + struct zero_copy_allocator zca; /* ZC allocator anchor */ } ____cacheline_internodealigned_in_smp; static inline bool ring_uses_build_skb(struct i40e_ring *ring) diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c new file mode 100644 index 0000000000000..bf502f2307c25 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -0,0 +1,661 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2018 Intel Corporation. */ + +#include +#include +#include + +#include "i40e.h" +#include "i40e_txrx_common.h" +#include "i40e_xsk.h" + +/** + * i40e_alloc_xsk_umems - Allocate an array to store per ring UMEMs + * @vsi: Current VSI + * + * Returns 0 on success, <0 on failure + **/ +static int i40e_alloc_xsk_umems(struct i40e_vsi *vsi) +{ + if (vsi->xsk_umems) + return 0; + + vsi->num_xsk_umems_used = 0; + vsi->num_xsk_umems = vsi->alloc_queue_pairs; + vsi->xsk_umems = kcalloc(vsi->num_xsk_umems, sizeof(*vsi->xsk_umems), + GFP_KERNEL); + if (!vsi->xsk_umems) { + vsi->num_xsk_umems = 0; + return -ENOMEM; + } + + return 0; +} + +/** + * i40e_add_xsk_umem - Store an UMEM for a certain ring/qid + * @vsi: Current VSI + * @umem: UMEM to store + * @qid: Ring/qid to associate with the UMEM + * + * Returns 0 on success, <0 on failure + **/ +static int i40e_add_xsk_umem(struct i40e_vsi *vsi, struct xdp_umem *umem, + u16 qid) +{ + int err; + + err = i40e_alloc_xsk_umems(vsi); + if (err) + return err; + + vsi->xsk_umems[qid] = umem; + vsi->num_xsk_umems_used++; + + return 0; +} + +/** + * i40e_remove_xsk_umem - Remove an UMEM for a certain ring/qid + * @vsi: Current VSI + * @qid: Ring/qid associated with the UMEM + **/ +static void i40e_remove_xsk_umem(struct i40e_vsi *vsi, u16 qid) +{ + vsi->xsk_umems[qid] = NULL; + vsi->num_xsk_umems_used--; + + if (vsi->num_xsk_umems == 0) { + kfree(vsi->xsk_umems); + vsi->xsk_umems = NULL; + vsi->num_xsk_umems = 0; + } +} + +/** + * i40e_xsk_umem_dma_map - DMA maps all UMEM memory for the netdev + * @vsi: Current VSI + * @umem: UMEM to DMA map + * + * Returns 0 on success, <0 on failure + **/ +static int i40e_xsk_umem_dma_map(struct i40e_vsi *vsi, struct xdp_umem *umem) +{ + struct i40e_pf *pf = vsi->back; + struct device *dev; + unsigned int i, j; + dma_addr_t dma; + + dev = &pf->pdev->dev; + for (i = 0; i < umem->npgs; i++) { + dma = dma_map_page_attrs(dev, umem->pgs[i], 0, PAGE_SIZE, + DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR); + if (dma_mapping_error(dev, dma)) + goto out_unmap; + + umem->pages[i].dma = dma; + } + + return 0; + +out_unmap: + for (j = 0; j < i; j++) { + dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE, + DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR); + umem->pages[i].dma = 0; + } + + return -1; +} + +/** + * i40e_xsk_umem_dma_unmap - DMA unmaps all UMEM memory for the netdev + * @vsi: Current VSI + * @umem: UMEM to DMA map + **/ +static void i40e_xsk_umem_dma_unmap(struct i40e_vsi *vsi, struct xdp_umem *umem) +{ + struct i40e_pf *pf = vsi->back; + struct device *dev; + unsigned int i; + + dev = &pf->pdev->dev; + + for (i = 0; i < umem->npgs; i++) { + dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE, + DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR); + + umem->pages[i].dma = 0; + } +} + +/** + * i40e_xsk_umem_enable - Enable/associate an UMEM to a certain ring/qid + * @vsi: Current VSI + * @umem: UMEM + * @qid: Rx ring to associate UMEM to + * + * Returns 0 on success, <0 on failure + **/ +static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem, + u16 qid) +{ + bool if_running; + int err; + + if (vsi->type != I40E_VSI_MAIN) + return -EINVAL; + + if (qid >= vsi->num_queue_pairs) + return -EINVAL; + + if (vsi->xsk_umems) { + if (qid >= vsi->num_xsk_umems) + return -EINVAL; + if (vsi->xsk_umems[qid]) + return -EBUSY; + } + + err = i40e_xsk_umem_dma_map(vsi, umem); + if (err) + return err; + + if_running = netif_running(vsi->netdev) && i40e_enabled_xdp_vsi(vsi); + + if (if_running) { + err = i40e_queue_pair_disable(vsi, qid); + if (err) + return err; + } + + err = i40e_add_xsk_umem(vsi, umem, qid); + if (err) + return err; + + if (if_running) { + err = i40e_queue_pair_enable(vsi, qid); + if (err) + return err; + } + + return 0; +} + +/** + * i40e_xsk_umem_disable - Diassociate an UMEM from a certain ring/qid + * @vsi: Current VSI + * @qid: Rx ring to associate UMEM to + * + * Returns 0 on success, <0 on failure + **/ +static int i40e_xsk_umem_disable(struct i40e_vsi *vsi, u16 qid) +{ + bool if_running; + int err; + + if (!vsi->xsk_umems || qid >= vsi->num_xsk_umems || + !vsi->xsk_umems[qid]) + return -EINVAL; + + if_running = netif_running(vsi->netdev) && i40e_enabled_xdp_vsi(vsi); + + if (if_running) { + err = i40e_queue_pair_disable(vsi, qid); + if (err) + return err; + } + + i40e_xsk_umem_dma_unmap(vsi, vsi->xsk_umems[qid]); + i40e_remove_xsk_umem(vsi, qid); + + if (if_running) { + err = i40e_queue_pair_enable(vsi, qid); + if (err) + return err; + } + + return 0; +} + +/** + * i40e_xsk_umem_query - Queries a certain ring/qid for its UMEM + * @vsi: Current VSI + * @umem: UMEM associated to the ring, if any + * @qid: Rx ring to associate UMEM to + * + * This function will store, if any, the UMEM associated to certain ring. + * + * Returns 0 on success, <0 on failure + **/ +int i40e_xsk_umem_query(struct i40e_vsi *vsi, struct xdp_umem **umem, + u16 qid) +{ + if (vsi->type != I40E_VSI_MAIN) + return -EINVAL; + + if (qid >= vsi->num_queue_pairs) + return -EINVAL; + + if (vsi->xsk_umems) { + if (qid >= vsi->num_xsk_umems) + return -EINVAL; + *umem = vsi->xsk_umems[qid]; + return 0; + } + + *umem = NULL; + return 0; +} + +/** + * i40e_xsk_umem_query - Queries a certain ring/qid for its UMEM + * @vsi: Current VSI + * @umem: UMEM to enable/associate to a ring, or NULL to disable + * @qid: Rx ring to (dis)associate UMEM (from)to + * + * This function enables or disables an UMEM to a certain ring. + * + * Returns 0 on success, <0 on failure + **/ +int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem, + u16 qid) +{ + return umem ? i40e_xsk_umem_enable(vsi, umem, qid) : + i40e_xsk_umem_disable(vsi, qid); +} + +/** + * i40e_run_xdp_zc - Executes an XDP program on an xdp_buff + * @rx_ring: Rx ring + * @xdp: xdp_buff used as input to the XDP program + * + * This function enables or disables an UMEM to a certain ring. + * + * Returns any of I40E_XDP_{PASS, CONSUMED, TX, REDIR} + **/ +static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp) +{ + int err, result = I40E_XDP_PASS; + struct i40e_ring *xdp_ring; + struct bpf_prog *xdp_prog; + u32 act; + + rcu_read_lock(); + /* NB! xdp_prog will always be !NULL, due to the fact that + * this path is enabled by setting an XDP program. + */ + xdp_prog = READ_ONCE(rx_ring->xdp_prog); + act = bpf_prog_run_xdp(xdp_prog, xdp); + xdp->handle += xdp->data - xdp->data_hard_start; + switch (act) { + case XDP_PASS: + break; + case XDP_TX: + xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index]; + result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring); + break; + case XDP_REDIRECT: + err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); + result = !err ? I40E_XDP_REDIR : I40E_XDP_CONSUMED; + break; + default: + bpf_warn_invalid_xdp_action(act); + case XDP_ABORTED: + trace_xdp_exception(rx_ring->netdev, xdp_prog, act); + /* fallthrough -- handle aborts by dropping packet */ + case XDP_DROP: + result = I40E_XDP_CONSUMED; + break; + } + rcu_read_unlock(); + return result; +} + +/** + * i40e_alloc_buffer_zc - Allocates an i40e_rx_buffer + * @rx_ring: Rx ring + * @bi: Rx buffer to populate + * + * This function allocates an Rx buffer. The buffer can come from fill + * queue, or via the recycle queue (next_to_alloc). + * + * Returns true for a successful allocation, false otherwise + **/ +static bool i40e_alloc_buffer_zc(struct i40e_ring *rx_ring, + struct i40e_rx_buffer *bi) +{ + struct xdp_umem *umem = rx_ring->xsk_umem; + void *addr = bi->addr; + u64 handle, hr; + + if (addr) { + rx_ring->rx_stats.page_reuse_count++; + return true; + } + + if (!xsk_umem_peek_addr(umem, &handle)) { + rx_ring->rx_stats.alloc_page_failed++; + return false; + } + + hr = umem->headroom + XDP_PACKET_HEADROOM; + + bi->dma = xdp_umem_get_dma(umem, handle); + bi->dma += hr; + + bi->addr = xdp_umem_get_data(umem, handle); + bi->addr += hr; + + bi->handle = handle + umem->headroom; + + xsk_umem_discard_addr(umem); + return true; +} + +/** + * i40e_alloc_rx_buffers_zc - Allocates a number of Rx buffers + * @rx_ring: Rx ring + * @count: The number of buffers to allocate + * + * This function allocates a number of Rx buffers and places them on + * the Rx ring. + * + * Returns true for a successful allocation, false otherwise + **/ +bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count) +{ + u16 ntu = rx_ring->next_to_use; + union i40e_rx_desc *rx_desc; + struct i40e_rx_buffer *bi; + bool ok = true; + + rx_desc = I40E_RX_DESC(rx_ring, ntu); + bi = &rx_ring->rx_bi[ntu]; + do { + if (!i40e_alloc_buffer_zc(rx_ring, bi)) { + ok = false; + goto no_buffers; + } + + dma_sync_single_range_for_device(rx_ring->dev, bi->dma, 0, + rx_ring->rx_buf_len, + DMA_BIDIRECTIONAL); + + rx_desc->read.pkt_addr = cpu_to_le64(bi->dma); + + rx_desc++; + bi++; + ntu++; + + if (unlikely(ntu == rx_ring->count)) { + rx_desc = I40E_RX_DESC(rx_ring, 0); + bi = rx_ring->rx_bi; + ntu = 0; + } + + rx_desc->wb.qword1.status_error_len = 0; + count--; + } while (count); + +no_buffers: + if (rx_ring->next_to_use != ntu) + i40e_release_rx_desc(rx_ring, ntu); + + return ok; +} + +/** + * i40e_get_rx_buffer_zc - Return the current Rx buffer + * @rx_ring: Rx ring + * @size: The size of the rx buffer (read from descriptor) + * + * This function returns the current, received Rx buffer, and also + * does DMA synchronization. the Rx ring. + * + * Returns the received Rx buffer + **/ +static struct i40e_rx_buffer *i40e_get_rx_buffer_zc(struct i40e_ring *rx_ring, + const unsigned int size) +{ + struct i40e_rx_buffer *bi; + + bi = &rx_ring->rx_bi[rx_ring->next_to_clean]; + + /* we are reusing so sync this buffer for CPU use */ + dma_sync_single_range_for_cpu(rx_ring->dev, + bi->dma, 0, + size, + DMA_BIDIRECTIONAL); + + return bi; +} + +/** + * i40e_reuse_rx_buffer_zc - Recycle an Rx buffer + * @rx_ring: Rx ring + * @old_bi: The Rx buffer to recycle + * + * This function recycles a finished Rx buffer, and places it on the + * recycle queue (next_to_alloc). + **/ +static void i40e_reuse_rx_buffer_zc(struct i40e_ring *rx_ring, + struct i40e_rx_buffer *old_bi) +{ + struct i40e_rx_buffer *new_bi = &rx_ring->rx_bi[rx_ring->next_to_alloc]; + unsigned long mask = (unsigned long)rx_ring->xsk_umem->props.chunk_mask; + u64 hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM; + u16 nta = rx_ring->next_to_alloc; + + /* update, and store next to alloc */ + nta++; + rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; + + /* transfer page from old buffer to new buffer */ + new_bi->dma = old_bi->dma & mask; + new_bi->dma += hr; + + new_bi->addr = (void *)((unsigned long)old_bi->addr & mask); + new_bi->addr += hr; + + new_bi->handle = old_bi->handle & mask; + new_bi->handle += rx_ring->xsk_umem->headroom; + + old_bi->addr = NULL; +} + +/** + * i40e_zca_free - Free callback for MEM_TYPE_ZERO_COPY allocations + * @alloc: Zero-copy allocator + * @handle: Buffer handle + **/ +void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle) +{ + struct i40e_rx_buffer *bi; + struct i40e_ring *rx_ring; + u64 hr, mask; + u16 nta; + + rx_ring = container_of(alloc, struct i40e_ring, zca); + hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM; + mask = rx_ring->xsk_umem->props.chunk_mask; + + nta = rx_ring->next_to_alloc; + bi = &rx_ring->rx_bi[nta]; + + nta++; + rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; + + handle &= mask; + + bi->dma = xdp_umem_get_dma(rx_ring->xsk_umem, handle); + bi->dma += hr; + + bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle); + bi->addr += hr; + + bi->handle = (u64)handle + rx_ring->xsk_umem->headroom; +} + +/** + * i40e_construct_skb_zc - Create skbufff from zero-copy Rx buffer + * @rx_ring: Rx ring + * @bi: Rx buffer + * @xdp: xdp_buff + * + * This functions allocates a new skb from a zero-copy Rx buffer. + * + * Returns the skb, or NULL on failure. + **/ +static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring, + struct i40e_rx_buffer *bi, + struct xdp_buff *xdp) +{ + unsigned int metasize = xdp->data - xdp->data_meta; + unsigned int datasize = xdp->data_end - xdp->data; + struct sk_buff *skb; + + /* allocate a skb to store the frags */ + skb = __napi_alloc_skb(&rx_ring->q_vector->napi, + xdp->data_end - xdp->data_hard_start, + GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!skb)) + return NULL; + + skb_reserve(skb, xdp->data - xdp->data_hard_start); + memcpy(__skb_put(skb, datasize), xdp->data, datasize); + if (metasize) + skb_metadata_set(skb, metasize); + + i40e_reuse_rx_buffer_zc(rx_ring, bi); + return skb; +} + +/** + * i40e_inc_ntc: Advance the next_to_clean index + * @rx_ring: Rx ring + **/ +static void i40e_inc_ntc(struct i40e_ring *rx_ring) +{ + u32 ntc = rx_ring->next_to_clean + 1; + + ntc = (ntc < rx_ring->count) ? ntc : 0; + rx_ring->next_to_clean = ntc; + prefetch(I40E_RX_DESC(rx_ring, ntc)); +} + +/** + * i40e_clean_rx_irq_zc - Consumes Rx packets from the hardware ring + * @rx_ring: Rx ring + * @budget: NAPI budget + * + * Returns amount of work completed + **/ +int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) +{ + unsigned int total_rx_bytes = 0, total_rx_packets = 0; + u16 cleaned_count = I40E_DESC_UNUSED(rx_ring); + unsigned int xdp_res, xdp_xmit = 0; + bool failure = false; + struct sk_buff *skb; + struct xdp_buff xdp; + + xdp.rxq = &rx_ring->xdp_rxq; + + while (likely(total_rx_packets < (unsigned int)budget)) { + struct i40e_rx_buffer *bi; + union i40e_rx_desc *rx_desc; + unsigned int size; + u16 vlan_tag; + u8 rx_ptype; + u64 qword; + + if (cleaned_count >= I40E_RX_BUFFER_WRITE) { + failure = failure || + !i40e_alloc_rx_buffers_zc(rx_ring, + cleaned_count); + cleaned_count = 0; + } + + rx_desc = I40E_RX_DESC(rx_ring, rx_ring->next_to_clean); + qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); + + /* This memory barrier is needed to keep us from reading + * any other fields out of the rx_desc until we have + * verified the descriptor has been written back. + */ + dma_rmb(); + + bi = i40e_clean_programming_status(rx_ring, rx_desc, + qword); + if (unlikely(bi)) { + i40e_reuse_rx_buffer_zc(rx_ring, bi); + cleaned_count++; + continue; + } + + size = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >> + I40E_RXD_QW1_LENGTH_PBUF_SHIFT; + if (!size) + break; + + bi = i40e_get_rx_buffer_zc(rx_ring, size); + xdp.data = bi->addr; + xdp.data_meta = xdp.data; + xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM; + xdp.data_end = xdp.data + size; + xdp.handle = bi->handle; + + xdp_res = i40e_run_xdp_zc(rx_ring, &xdp); + if (xdp_res) { + if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR)) { + xdp_xmit |= xdp_res; + bi->addr = NULL; + } else { + i40e_reuse_rx_buffer_zc(rx_ring, bi); + } + + total_rx_bytes += size; + total_rx_packets++; + + cleaned_count++; + i40e_inc_ntc(rx_ring); + continue; + } + + /* XDP_PASS path */ + + /* NB! We are not checking for errors using + * i40e_test_staterr with + * BIT(I40E_RXD_QW1_ERROR_SHIFT). This is due to that + * SBP is *not* set in PRT_SBPVSI (default not set). + */ + skb = i40e_construct_skb_zc(rx_ring, bi, &xdp); + if (!skb) { + rx_ring->rx_stats.alloc_buff_failed++; + break; + } + + cleaned_count++; + i40e_inc_ntc(rx_ring); + + if (eth_skb_pad(skb)) + continue; + + total_rx_bytes += skb->len; + total_rx_packets++; + + qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); + rx_ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >> + I40E_RXD_QW1_PTYPE_SHIFT; + i40e_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype); + + vlan_tag = (qword & BIT(I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)) ? + le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1) : 0; + i40e_receive_skb(rx_ring, skb, vlan_tag); + } + + i40e_finalize_xdp_rx(rx_ring, xdp_xmit); + i40e_update_rx_stats(rx_ring, total_rx_bytes, total_rx_packets); + return failure ? budget : (int)total_rx_packets; +} + diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.h b/drivers/net/ethernet/intel/i40e/i40e_xsk.h new file mode 100644 index 0000000000000..427a844f78a7c --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2018 Intel Corporation. */ + +#ifndef _I40E_XSK_H_ +#define _I40E_XSK_H_ + +struct i40e_vsi; +struct xdp_umem; +struct zero_copy_allocator; + +int i40e_queue_pair_disable(struct i40e_vsi *vsi, int queue_pair); +int i40e_queue_pair_enable(struct i40e_vsi *vsi, int queue_pair); +int i40e_xsk_umem_query(struct i40e_vsi *vsi, struct xdp_umem **umem, + u16 qid); +int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem, + u16 qid); +void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle); +bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 cleaned_count); +int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget); + +#endif /* _I40E_XSK_H_ */ From a96e7472732842d76af0132c5f87f531f312d7a9 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Tue, 28 Aug 2018 14:44:33 +0200 Subject: [PATCH 12/28] i40e: move common Tx functions to i40e_txrx_common.h This patch prepares for the upcoming zero-copy Tx functionality, by moving common functions and refactor chunks of code into re-usable functions, used both by the regular path and zero-copy path. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 35 +---------- .../ethernet/intel/i40e/i40e_txrx_common.h | 59 +++++++++++++++++++ 2 files changed, 61 insertions(+), 33 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 2c4d179ffebfd..11e201fcb57a6 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -11,16 +11,6 @@ #include "i40e_txrx_common.h" #include "i40e_xsk.h" -static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size, - u32 td_tag) -{ - return cpu_to_le64(I40E_TX_DESC_DTYPE_DATA | - ((u64)td_cmd << I40E_TXD_QW1_CMD_SHIFT) | - ((u64)td_offset << I40E_TXD_QW1_OFFSET_SHIFT) | - ((u64)size << I40E_TXD_QW1_TX_BUF_SZ_SHIFT) | - ((u64)td_tag << I40E_TXD_QW1_L2TAG1_SHIFT)); -} - #define I40E_TXD_CMD (I40E_TX_DESC_CMD_EOP | I40E_TX_DESC_CMD_RS) /** * i40e_fdir - Generate a Flow Director descriptor based on fdata @@ -769,8 +759,6 @@ void i40e_detect_recover_hung(struct i40e_vsi *vsi) } } -#define WB_STRIDE 4 - /** * i40e_clean_tx_irq - Reclaim resources after transmit completes * @vsi: the VSI we care about @@ -875,27 +863,8 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi, i += tx_ring->count; tx_ring->next_to_clean = i; - u64_stats_update_begin(&tx_ring->syncp); - tx_ring->stats.bytes += total_bytes; - tx_ring->stats.packets += total_packets; - u64_stats_update_end(&tx_ring->syncp); - tx_ring->q_vector->tx.total_bytes += total_bytes; - tx_ring->q_vector->tx.total_packets += total_packets; - - if (tx_ring->flags & I40E_TXR_FLAGS_WB_ON_ITR) { - /* check to see if there are < 4 descriptors - * waiting to be written back, then kick the hardware to force - * them to be written back in case we stay in NAPI. - * In this mode on X722 we do not enable Interrupt. - */ - unsigned int j = i40e_get_tx_pending(tx_ring, false); - - if (budget && - ((j / WB_STRIDE) == 0) && (j > 0) && - !test_bit(__I40E_VSI_DOWN, vsi->state) && - (I40E_DESC_UNUSED(tx_ring) != tx_ring->count)) - tx_ring->arm_wb = true; - } + i40e_update_tx_stats(tx_ring, total_packets, total_bytes); + i40e_arm_wb(tx_ring, vsi, budget); if (ring_is_xdp(tx_ring)) return !!budget; diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h index 2bd5187fcd669..b5afd479a9c56 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h @@ -28,4 +28,63 @@ void i40e_release_rx_desc(struct i40e_ring *rx_ring, u32 val); #define I40E_XDP_TX BIT(1) #define I40E_XDP_REDIR BIT(2) +/** + * build_ctob - Builds the Tx descriptor (cmd, offset and type) qword + **/ +static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size, + u32 td_tag) +{ + return cpu_to_le64(I40E_TX_DESC_DTYPE_DATA | + ((u64)td_cmd << I40E_TXD_QW1_CMD_SHIFT) | + ((u64)td_offset << I40E_TXD_QW1_OFFSET_SHIFT) | + ((u64)size << I40E_TXD_QW1_TX_BUF_SZ_SHIFT) | + ((u64)td_tag << I40E_TXD_QW1_L2TAG1_SHIFT)); +} + +/** + * i40e_update_tx_stats - Update the egress statistics for the Tx ring + * @tx_ring: Tx ring to update + * @total_packet: total packets sent + * @total_bytes: total bytes sent + **/ +static inline void i40e_update_tx_stats(struct i40e_ring *tx_ring, + unsigned int total_packets, + unsigned int total_bytes) +{ + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->stats.bytes += total_bytes; + tx_ring->stats.packets += total_packets; + u64_stats_update_end(&tx_ring->syncp); + tx_ring->q_vector->tx.total_bytes += total_bytes; + tx_ring->q_vector->tx.total_packets += total_packets; +} + +#define WB_STRIDE 4 + +/** + * i40e_arm_wb - (Possibly) arms Tx write-back + * @tx_ring: Tx ring to update + * @vsi: the VSI + * @budget: the NAPI budget left + **/ +static inline void i40e_arm_wb(struct i40e_ring *tx_ring, + struct i40e_vsi *vsi, + int budget) +{ + if (tx_ring->flags & I40E_TXR_FLAGS_WB_ON_ITR) { + /* check to see if there are < 4 descriptors + * waiting to be written back, then kick the hardware to force + * them to be written back in case we stay in NAPI. + * In this mode on X722 we do not enable Interrupt. + */ + unsigned int j = i40e_get_tx_pending(tx_ring, false); + + if (budget && + ((j / WB_STRIDE) == 0) && j > 0 && + !test_bit(__I40E_VSI_DOWN, vsi->state) && + (I40E_DESC_UNUSED(tx_ring) != tx_ring->count)) + tx_ring->arm_wb = true; + } +} + #endif /* I40E_TXRX_COMMON_ */ From 1328dcddbd53900481994f4c4d8b79e64477965e Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Tue, 28 Aug 2018 14:44:34 +0200 Subject: [PATCH 13/28] i40e: add AF_XDP zero-copy Tx support This patch adds zero-copy Tx support for AF_XDP sockets. It implements the ndo_xsk_async_xmit netdev ndo and performs all the Tx logic from a NAPI context. This means pulling egress packets from the Tx ring, placing the frames on the NIC HW descriptor ring and completing sent frames back to the application via the completion ring. The regular XDP Tx ring is used for AF_XDP as well. This rationale for this is as follows: XDP_REDIRECT guarantees mutual exclusion between different NAPI contexts based on CPU id. In other words, a netdev can XDP_REDIRECT to another netdev with a different NAPI context, since the operation is bound to a specific core and each core has its own hardware ring. As the AF_XDP Tx action is running in the same NAPI context and using the same ring, it will also be protected from XDP_REDIRECT actions with the exact same mechanism. As with AF_XDP Rx, all AF_XDP Tx specific functions are added to i40e_xsk.c. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/intel/i40e/i40e_main.c | 4 + drivers/net/ethernet/intel/i40e/i40e_txrx.c | 6 +- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 173 ++++++++++++++++++++ drivers/net/ethernet/intel/i40e/i40e_xsk.h | 4 + 4 files changed, 186 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 848eea7c84dbd..5da7eb0fe4ae5 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -3074,6 +3074,9 @@ static int i40e_configure_tx_ring(struct i40e_ring *ring) i40e_status err = 0; u32 qtx_ctl = 0; + if (ring_is_xdp(ring)) + ring->xsk_umem = i40e_xsk_umem(ring); + /* some ATR related tx ring init */ if (vsi->back->flags & I40E_FLAG_FD_ATR_ENABLED) { ring->atr_sample_rate = vsi->back->atr_sample_rate; @@ -12185,6 +12188,7 @@ static const struct net_device_ops i40e_netdev_ops = { .ndo_bridge_setlink = i40e_ndo_bridge_setlink, .ndo_bpf = i40e_xdp, .ndo_xdp_xmit = i40e_xdp_xmit, + .ndo_xsk_async_xmit = i40e_xsk_async_xmit, }; /** diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 11e201fcb57a6..37bd4e50ccde0 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2597,7 +2597,11 @@ int i40e_napi_poll(struct napi_struct *napi, int budget) * budget and be more aggressive about cleaning up the Tx descriptors. */ i40e_for_each_ring(ring, q_vector->tx) { - if (!i40e_clean_tx_irq(vsi, ring, budget)) { + bool wd = ring->xsk_umem ? + i40e_clean_xdp_tx_irq(vsi, ring, budget) : + i40e_clean_tx_irq(vsi, ring, budget); + + if (!wd) { clean_complete = false; continue; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index bf502f2307c25..94947a826bc3a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -659,3 +659,176 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) return failure ? budget : (int)total_rx_packets; } +/** + * i40e_xmit_zc - Performs zero-copy Tx AF_XDP + * @xdp_ring: XDP Tx ring + * @budget: NAPI budget + * + * Returns true if the work is finished. + **/ +static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget) +{ + unsigned int total_packets = 0; + struct i40e_tx_buffer *tx_bi; + struct i40e_tx_desc *tx_desc; + bool work_done = true; + dma_addr_t dma; + u32 len; + + while (budget-- > 0) { + if (!unlikely(I40E_DESC_UNUSED(xdp_ring))) { + xdp_ring->tx_stats.tx_busy++; + work_done = false; + break; + } + + if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &dma, &len)) + break; + + dma_sync_single_for_device(xdp_ring->dev, dma, len, + DMA_BIDIRECTIONAL); + + tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use]; + tx_bi->bytecount = len; + + tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use); + tx_desc->buffer_addr = cpu_to_le64(dma); + tx_desc->cmd_type_offset_bsz = + build_ctob(I40E_TX_DESC_CMD_ICRC + | I40E_TX_DESC_CMD_EOP, + 0, len, 0); + total_packets++; + + xdp_ring->next_to_use++; + if (xdp_ring->next_to_use == xdp_ring->count) + xdp_ring->next_to_use = 0; + } + + if (total_packets > 0) { + /* Request an interrupt for the last frame and bump tail ptr. */ + tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS << + I40E_TXD_QW1_CMD_SHIFT); + i40e_xdp_ring_update_tail(xdp_ring); + + xsk_umem_consume_tx_done(xdp_ring->xsk_umem); + } + + return !!budget && work_done; +} + +/** + * i40e_clean_xdp_tx_buffer - Frees and unmaps an XDP Tx entry + * @tx_ring: XDP Tx ring + * @tx_bi: Tx buffer info to clean + **/ +static void i40e_clean_xdp_tx_buffer(struct i40e_ring *tx_ring, + struct i40e_tx_buffer *tx_bi) +{ + xdp_return_frame(tx_bi->xdpf); + dma_unmap_single(tx_ring->dev, + dma_unmap_addr(tx_bi, dma), + dma_unmap_len(tx_bi, len), DMA_TO_DEVICE); + dma_unmap_len_set(tx_bi, len, 0); +} + +/** + * i40e_clean_xdp_tx_irq - Completes AF_XDP entries, and cleans XDP entries + * @tx_ring: XDP Tx ring + * @tx_bi: Tx buffer info to clean + * + * Returns true if cleanup/tranmission is done. + **/ +bool i40e_clean_xdp_tx_irq(struct i40e_vsi *vsi, + struct i40e_ring *tx_ring, int napi_budget) +{ + unsigned int ntc, total_bytes = 0, budget = vsi->work_limit; + u32 i, completed_frames, frames_ready, xsk_frames = 0; + struct xdp_umem *umem = tx_ring->xsk_umem; + u32 head_idx = i40e_get_head(tx_ring); + bool work_done = true, xmit_done; + struct i40e_tx_buffer *tx_bi; + + if (head_idx < tx_ring->next_to_clean) + head_idx += tx_ring->count; + frames_ready = head_idx - tx_ring->next_to_clean; + + if (frames_ready == 0) { + goto out_xmit; + } else if (frames_ready > budget) { + completed_frames = budget; + work_done = false; + } else { + completed_frames = frames_ready; + } + + ntc = tx_ring->next_to_clean; + + for (i = 0; i < completed_frames; i++) { + tx_bi = &tx_ring->tx_bi[ntc]; + + if (tx_bi->xdpf) + i40e_clean_xdp_tx_buffer(tx_ring, tx_bi); + else + xsk_frames++; + + tx_bi->xdpf = NULL; + total_bytes += tx_bi->bytecount; + + if (++ntc >= tx_ring->count) + ntc = 0; + } + + tx_ring->next_to_clean += completed_frames; + if (unlikely(tx_ring->next_to_clean >= tx_ring->count)) + tx_ring->next_to_clean -= tx_ring->count; + + if (xsk_frames) + xsk_umem_complete_tx(umem, xsk_frames); + + i40e_arm_wb(tx_ring, vsi, budget); + i40e_update_tx_stats(tx_ring, completed_frames, total_bytes); + +out_xmit: + xmit_done = i40e_xmit_zc(tx_ring, budget); + + return work_done && xmit_done; +} + +/** + * i40e_xsk_async_xmit - Implements the ndo_xsk_async_xmit + * @dev: the netdevice + * @queue_id: queue id to wake up + * + * Returns <0 for errors, 0 otherwise. + **/ +int i40e_xsk_async_xmit(struct net_device *dev, u32 queue_id) +{ + struct i40e_netdev_priv *np = netdev_priv(dev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_ring *ring; + + if (test_bit(__I40E_VSI_DOWN, vsi->state)) + return -ENETDOWN; + + if (!i40e_enabled_xdp_vsi(vsi)) + return -ENXIO; + + if (queue_id >= vsi->num_queue_pairs) + return -ENXIO; + + if (!vsi->xdp_rings[queue_id]->xsk_umem) + return -ENXIO; + + ring = vsi->xdp_rings[queue_id]; + + /* The idea here is that if NAPI is running, mark a miss, so + * it will run again. If not, trigger an interrupt and + * schedule the NAPI from interrupt context. If NAPI would be + * scheduled here, the interrupt affinity would not be + * honored. + */ + if (!napi_if_scheduled_mark_missed(&ring->q_vector->napi)) + i40e_force_wb(vsi, ring->q_vector); + + return 0; +} diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.h b/drivers/net/ethernet/intel/i40e/i40e_xsk.h index 427a844f78a7c..9038c5d5cf083 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.h +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.h @@ -18,4 +18,8 @@ void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle); bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 cleaned_count); int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget); +bool i40e_clean_xdp_tx_irq(struct i40e_vsi *vsi, + struct i40e_ring *tx_ring, int napi_budget); +int i40e_xsk_async_xmit(struct net_device *dev, u32 queue_id); + #endif /* _I40E_XSK_H_ */ From 58c50ae4a0b638ebbcdddf03cfa4fd36f0edeb02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 28 Aug 2018 14:44:35 +0200 Subject: [PATCH 14/28] samples/bpf: add -c/--copy -z/--zero-copy flags to xdpsock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The -c/--copy -z/--zero-copy flags enforces either copy or zero-copy mode. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- samples/bpf/xdpsock_user.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 4914788b67274..b3906111bedb6 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -649,6 +649,8 @@ static struct option long_options[] = { {"xdp-skb", no_argument, 0, 'S'}, {"xdp-native", no_argument, 0, 'N'}, {"interval", required_argument, 0, 'n'}, + {"zero-copy", no_argument, 0, 'z'}, + {"copy", no_argument, 0, 'c'}, {0, 0, 0, 0} }; @@ -667,6 +669,8 @@ static void usage(const char *prog) " -S, --xdp-skb=n Use XDP skb-mod\n" " -N, --xdp-native=n Enfore XDP native mode\n" " -n, --interval=n Specify statistics update interval (default 1 sec).\n" + " -z, --zero-copy Force zero-copy mode.\n" + " -c, --copy Force copy mode.\n" "\n"; fprintf(stderr, str, prog); exit(EXIT_FAILURE); @@ -679,7 +683,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "rtli:q:psSNn:", long_options, + c = getopt_long(argc, argv, "rtli:q:psSNn:cz", long_options, &option_index); if (c == -1) break; @@ -716,6 +720,12 @@ static void parse_command_line(int argc, char **argv) case 'n': opt_interval = atoi(optarg); break; + case 'z': + opt_xdp_bind_flags |= XDP_ZEROCOPY; + break; + case 'c': + opt_xdp_bind_flags |= XDP_COPY; + break; default: usage(basename(argv[0])); } From 679c782de14bd48c19dd74cd1af20a2bc05dd936 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Wed, 22 Aug 2018 20:02:19 +0100 Subject: [PATCH 15/28] bpf/verifier: per-register parent pointers By giving each register its own liveness chain, we elide the skip_callee() logic. Instead, each register's parent is the state it inherits from; both check_func_call() and prepare_func_exit() automatically connect reg states to the correct chain since when they copy the reg state across (r1-r5 into the callee as args, and r0 out as the return value) they also copy the parent pointer. Signed-off-by: Edward Cree Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 8 +- kernel/bpf/verifier.c | 184 +++++++++-------------------------- 2 files changed, 47 insertions(+), 145 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 38b04f559ad37..b42b60a83e194 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -41,6 +41,7 @@ enum bpf_reg_liveness { }; struct bpf_reg_state { + /* Ordering of fields matters. See states_equal() */ enum bpf_reg_type type; union { /* valid when type == PTR_TO_PACKET */ @@ -59,7 +60,6 @@ struct bpf_reg_state { * came from, when one is tested for != NULL. */ u32 id; - /* Ordering of fields matters. See states_equal() */ /* For scalar types (SCALAR_VALUE), this represents our knowledge of * the actual value. * For pointer types, this represents the variable part of the offset @@ -76,15 +76,15 @@ struct bpf_reg_state { s64 smax_value; /* maximum possible (s64)value */ u64 umin_value; /* minimum possible (u64)value */ u64 umax_value; /* maximum possible (u64)value */ + /* parentage chain for liveness checking */ + struct bpf_reg_state *parent; /* Inside the callee two registers can be both PTR_TO_STACK like * R1=fp-8 and R2=fp-8, but one of them points to this function stack * while another to the caller's stack. To differentiate them 'frameno' * is used which is an index in bpf_verifier_state->frame[] array * pointing to bpf_func_state. - * This field must be second to last, for states_equal() reasons. */ u32 frameno; - /* This field must be last, for states_equal() reasons. */ enum bpf_reg_liveness live; }; @@ -107,7 +107,6 @@ struct bpf_stack_state { */ struct bpf_func_state { struct bpf_reg_state regs[MAX_BPF_REG]; - struct bpf_verifier_state *parent; /* index of call instruction that called into this func */ int callsite; /* stack frame number of this function state from pov of @@ -129,7 +128,6 @@ struct bpf_func_state { struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; - struct bpf_verifier_state *parent; u32 curframe; }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 92246117d2b03..68568d22d6bd0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -380,9 +380,9 @@ static int copy_stack_state(struct bpf_func_state *dst, /* do_check() starts with zero-sized stack in struct bpf_verifier_state to * make it consume minimal amount of memory. check_stack_write() access from * the program calls into realloc_func_state() to grow the stack size. - * Note there is a non-zero 'parent' pointer inside bpf_verifier_state - * which this function copies over. It points to previous bpf_verifier_state - * which is never reallocated + * Note there is a non-zero parent pointer inside each reg of bpf_verifier_state + * which this function copies over. It points to corresponding reg in previous + * bpf_verifier_state which is never reallocated */ static int realloc_func_state(struct bpf_func_state *state, int size, bool copy_old) @@ -466,7 +466,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->frame[i] = NULL; } dst_state->curframe = src->curframe; - dst_state->parent = src->parent; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -732,6 +731,7 @@ static void init_reg_state(struct bpf_verifier_env *env, for (i = 0; i < MAX_BPF_REG; i++) { mark_reg_not_init(env, regs, i); regs[i].live = REG_LIVE_NONE; + regs[i].parent = NULL; } /* frame pointer */ @@ -876,74 +876,21 @@ static int check_subprogs(struct bpf_verifier_env *env) return 0; } -static -struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env, - const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent, - u32 regno) -{ - struct bpf_verifier_state *tmp = NULL; - - /* 'parent' could be a state of caller and - * 'state' could be a state of callee. In such case - * parent->curframe < state->curframe - * and it's ok for r1 - r5 registers - * - * 'parent' could be a callee's state after it bpf_exit-ed. - * In such case parent->curframe > state->curframe - * and it's ok for r0 only - */ - if (parent->curframe == state->curframe || - (parent->curframe < state->curframe && - regno >= BPF_REG_1 && regno <= BPF_REG_5) || - (parent->curframe > state->curframe && - regno == BPF_REG_0)) - return parent; - - if (parent->curframe > state->curframe && - regno >= BPF_REG_6) { - /* for callee saved regs we have to skip the whole chain - * of states that belong to callee and mark as LIVE_READ - * the registers before the call - */ - tmp = parent; - while (tmp && tmp->curframe != state->curframe) { - tmp = tmp->parent; - } - if (!tmp) - goto bug; - parent = tmp; - } else { - goto bug; - } - return parent; -bug: - verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp); - verbose(env, "regno %d parent frame %d current frame %d\n", - regno, parent->curframe, state->curframe); - return NULL; -} - +/* Parentage chain of this register (or stack slot) should take care of all + * issues like callee-saved registers, stack slot allocation time, etc. + */ static int mark_reg_read(struct bpf_verifier_env *env, - const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent, - u32 regno) + const struct bpf_reg_state *state, + struct bpf_reg_state *parent) { bool writes = parent == state->parent; /* Observe write marks */ - if (regno == BPF_REG_FP) - /* We don't need to worry about FP liveness because it's read-only */ - return 0; - while (parent) { /* if read wasn't screened by an earlier write ... */ - if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN) + if (writes && state->live & REG_LIVE_WRITTEN) break; - parent = skip_callee(env, state, parent, regno); - if (!parent) - return -EFAULT; /* ... then we depend on parent's value */ - parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ; + parent->live |= REG_LIVE_READ; state = parent; parent = state->parent; writes = true; @@ -969,7 +916,10 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, verbose(env, "R%d !read_ok\n", regno); return -EACCES; } - return mark_reg_read(env, vstate, vstate->parent, regno); + /* We don't need to worry about FP liveness because it's read-only */ + if (regno != BPF_REG_FP) + return mark_reg_read(env, ®s[regno], + regs[regno].parent); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { @@ -1080,8 +1030,8 @@ static int check_stack_write(struct bpf_verifier_env *env, } else { u8 type = STACK_MISC; - /* regular write of data into stack */ - state->stack[spi].spilled_ptr = (struct bpf_reg_state) {}; + /* regular write of data into stack destroys any spilled ptr */ + state->stack[spi].spilled_ptr.type = NOT_INIT; /* only mark the slot as written if all 8 bytes were written * otherwise read propagation may incorrectly stop too soon @@ -1106,61 +1056,6 @@ static int check_stack_write(struct bpf_verifier_env *env, return 0; } -/* registers of every function are unique and mark_reg_read() propagates - * the liveness in the following cases: - * - from callee into caller for R1 - R5 that were used as arguments - * - from caller into callee for R0 that used as result of the call - * - from caller to the same caller skipping states of the callee for R6 - R9, - * since R6 - R9 are callee saved by implicit function prologue and - * caller's R6 != callee's R6, so when we propagate liveness up to - * parent states we need to skip callee states for R6 - R9. - * - * stack slot marking is different, since stacks of caller and callee are - * accessible in both (since caller can pass a pointer to caller's stack to - * callee which can pass it to another function), hence mark_stack_slot_read() - * has to propagate the stack liveness to all parent states at given frame number. - * Consider code: - * f1() { - * ptr = fp - 8; - * *ptr = ctx; - * call f2 { - * .. = *ptr; - * } - * .. = *ptr; - * } - * First *ptr is reading from f1's stack and mark_stack_slot_read() has - * to mark liveness at the f1's frame and not f2's frame. - * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has - * to propagate liveness to f2 states at f1's frame level and further into - * f1 states at f1's frame level until write into that stack slot - */ -static void mark_stack_slot_read(struct bpf_verifier_env *env, - const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent, - int slot, int frameno) -{ - bool writes = parent == state->parent; /* Observe write marks */ - - while (parent) { - if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE) - /* since LIVE_WRITTEN mark is only done for full 8-byte - * write the read marks are conservative and parent - * state may not even have the stack allocated. In such case - * end the propagation, since the loop reached beginning - * of the function - */ - break; - /* if read wasn't screened by an earlier write ... */ - if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) - break; - /* ... then we depend on parent's value */ - parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ; - state = parent; - parent = state->parent; - writes = true; - } -} - static int check_stack_read(struct bpf_verifier_env *env, struct bpf_func_state *reg_state /* func where register points to */, int off, int size, int value_regno) @@ -1198,8 +1093,8 @@ static int check_stack_read(struct bpf_verifier_env *env, */ state->regs[value_regno].live |= REG_LIVE_WRITTEN; } - mark_stack_slot_read(env, vstate, vstate->parent, spi, - reg_state->frameno); + mark_reg_read(env, ®_state->stack[spi].spilled_ptr, + reg_state->stack[spi].spilled_ptr.parent); return 0; } else { int zeros = 0; @@ -1215,8 +1110,8 @@ static int check_stack_read(struct bpf_verifier_env *env, off, i, size); return -EACCES; } - mark_stack_slot_read(env, vstate, vstate->parent, spi, - reg_state->frameno); + mark_reg_read(env, ®_state->stack[spi].spilled_ptr, + reg_state->stack[spi].spilled_ptr.parent); if (value_regno >= 0) { if (zeros == size) { /* any size read into register is zero extended, @@ -1908,8 +1803,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, /* reading any byte out of 8-byte 'spill_slot' will cause * the whole slot to be marked as 'read' */ - mark_stack_slot_read(env, env->cur_state, env->cur_state->parent, - spi, state->frameno); + mark_reg_read(env, &state->stack[spi].spilled_ptr, + state->stack[spi].spilled_ptr.parent); } return update_stack_depth(env, state, off); } @@ -2366,11 +2261,13 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, state->curframe + 1 /* frameno within this callchain */, subprog /* subprog number within this prog */); - /* copy r1 - r5 args that callee can access */ + /* copy r1 - r5 args that callee can access. The copy includes parent + * pointers, which connects us up to the liveness chain + */ for (i = BPF_REG_1; i <= BPF_REG_5; i++) callee->regs[i] = caller->regs[i]; - /* after the call regsiters r0 - r5 were scratched */ + /* after the call registers r0 - r5 were scratched */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, caller->regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); @@ -4370,7 +4267,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, /* explored state didn't use this */ return true; - equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0; + equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0; if (rold->type == PTR_TO_STACK) /* two stack pointers are equal only if they're pointing to @@ -4603,7 +4500,7 @@ static bool states_equal(struct bpf_verifier_env *env, * equivalent state (jump target or such) we didn't arrive by the straight-line * code, so read marks in the state must propagate to the parent regardless * of the state's write marks. That's what 'parent == state->parent' comparison - * in mark_reg_read() and mark_stack_slot_read() is for. + * in mark_reg_read() is for. */ static int propagate_liveness(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, @@ -4624,7 +4521,8 @@ static int propagate_liveness(struct bpf_verifier_env *env, if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ) continue; if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) { - err = mark_reg_read(env, vstate, vparent, i); + err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i], + &vparent->frame[vstate->curframe]->regs[i]); if (err) return err; } @@ -4639,7 +4537,8 @@ static int propagate_liveness(struct bpf_verifier_env *env, if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) continue; if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) - mark_stack_slot_read(env, vstate, vparent, i, frame); + mark_reg_read(env, &state->stack[i].spilled_ptr, + &parent->stack[i].spilled_ptr); } } return err; @@ -4649,7 +4548,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; - struct bpf_verifier_state *cur = env->cur_state; + struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err; sl = env->explored_states[insn_idx]; @@ -4691,16 +4590,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return -ENOMEM; /* add new state to the head of linked list */ - err = copy_verifier_state(&new_sl->state, cur); + new = &new_sl->state; + err = copy_verifier_state(new, cur); if (err) { - free_verifier_state(&new_sl->state, false); + free_verifier_state(new, false); kfree(new_sl); return err; } new_sl->next = env->explored_states[insn_idx]; env->explored_states[insn_idx] = new_sl; /* connect new state to parentage chain */ - cur->parent = &new_sl->state; + for (i = 0; i < BPF_REG_FP; i++) + cur_regs(env)[i].parent = &new->frame[new->curframe]->regs[i]; /* clear write marks in current state: the writes we did are not writes * our child did, so they don't screen off its reads from us. * (There are no read marks in current state, because reads always mark @@ -4713,9 +4614,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) /* all stack frames are accessible from callee, clear them all */ for (j = 0; j <= cur->curframe; j++) { struct bpf_func_state *frame = cur->frame[j]; + struct bpf_func_state *newframe = new->frame[j]; - for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) + for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) { frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; + frame->stack[i].spilled_ptr.parent = + &newframe->stack[i].spilled_ptr; + } } return 0; } @@ -4734,7 +4639,6 @@ static int do_check(struct bpf_verifier_env *env) if (!state) return -ENOMEM; state->curframe = 0; - state->parent = NULL; state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); if (!state->frame[0]) { kfree(state); From 8efea21d333d21e1f9177579ffdc69556314f603 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Wed, 22 Aug 2018 20:02:44 +0100 Subject: [PATCH 16/28] bpf/verifier: display non-spill stack slot types in print_verifier_state If a stack slot does not hold a spilled register (STACK_SPILL), then each of its eight bytes could potentially have a different slot_type. This information can be important for debugging, and previously we either did not print anything for the stack slot, or just printed fp-X=0 in the case where its first byte was STACK_ZERO. Instead, print eight characters with either 0 (STACK_ZERO), m (STACK_MISC) or ? (STACK_INVALID) for any stack slot which is neither STACK_SPILL nor entirely STACK_INVALID. Signed-off-by: Edward Cree Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 68568d22d6bd0..f4ff0c569e545 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -263,6 +263,13 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; +static char slot_type_char[] = { + [STACK_INVALID] = '?', + [STACK_SPILL] = 'r', + [STACK_MISC] = 'm', + [STACK_ZERO] = '0', +}; + static void print_liveness(struct bpf_verifier_env *env, enum bpf_reg_liveness live) { @@ -349,15 +356,26 @@ static void print_verifier_state(struct bpf_verifier_env *env, } } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] == STACK_SPILL) { - verbose(env, " fp%d", - (-i - 1) * BPF_REG_SIZE); - print_liveness(env, state->stack[i].spilled_ptr.live); + char types_buf[BPF_REG_SIZE + 1]; + bool valid = false; + int j; + + for (j = 0; j < BPF_REG_SIZE; j++) { + if (state->stack[i].slot_type[j] != STACK_INVALID) + valid = true; + types_buf[j] = slot_type_char[ + state->stack[i].slot_type[j]]; + } + types_buf[BPF_REG_SIZE] = 0; + if (!valid) + continue; + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, state->stack[i].spilled_ptr.live); + if (state->stack[i].slot_type[0] == STACK_SPILL) verbose(env, "=%s", reg_type_str[state->stack[i].spilled_ptr.type]); - } - if (state->stack[i].slot_type[0] == STACK_ZERO) - verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE); + else + verbose(env, "=%s", types_buf); } verbose(env, "\n"); } From c7b27c37af3da5a63f32b0bc99569e3069e4d9c1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 29 Aug 2018 14:43:13 -0700 Subject: [PATCH 17/28] bpf: add bpffs pretty print for percpu arraymap/hash/lru_hash Added bpffs pretty print for percpu arraymap, percpu hashmap and percpu lru hashmap. For each map pair, the format is: : { cpu0: cpu1: ... cpun: } For example, on my VM, there are 4 cpus, and for test_btf test in the next patch: cat /sys/fs/bpf/pprint_test_percpu_hash You may get: ... 43602: { cpu0: {43602,0,-43602,0x3,0xaa52,0x3,{43602|[82,170,0,0,0,0,0,0]},ENUM_TWO} cpu1: {43602,0,-43602,0x3,0xaa52,0x3,{43602|[82,170,0,0,0,0,0,0]},ENUM_TWO} cpu2: {43602,0,-43602,0x3,0xaa52,0x3,{43602|[82,170,0,0,0,0,0,0]},ENUM_TWO} cpu3: {43602,0,-43602,0x3,0xaa52,0x3,{43602|[82,170,0,0,0,0,0,0]},ENUM_TWO} } 72847: { cpu0: {72847,0,-72847,0x3,0x11c8f,0x3,{72847|[143,28,1,0,0,0,0,0]},ENUM_THREE} cpu1: {72847,0,-72847,0x3,0x11c8f,0x3,{72847|[143,28,1,0,0,0,0,0]},ENUM_THREE} cpu2: {72847,0,-72847,0x3,0x11c8f,0x3,{72847|[143,28,1,0,0,0,0,0]},ENUM_THREE} cpu3: {72847,0,-72847,0x3,0x11c8f,0x3,{72847|[143,28,1,0,0,0,0,0]},ENUM_THREE} } ... Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/arraymap.c | 24 ++++++++++++++++++++++++ kernel/bpf/hashtab.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 0c17aab3ce5f8..f9d24121be991 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -358,6 +358,29 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu; + + rcu_read_lock(); + + seq_printf(m, "%u: {\n", *(u32 *)key); + pptr = array->pptrs[index & array->index_mask]; + for_each_possible_cpu(cpu) { + seq_printf(m, "\tcpu%d: ", cpu); + btf_type_seq_show(map->btf, map->btf_value_type_id, + per_cpu_ptr(pptr, cpu), m); + seq_puts(m, "\n"); + } + seq_puts(m, "}\n"); + + rcu_read_unlock(); +} + static int array_map_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) @@ -398,6 +421,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_lookup_elem = percpu_array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, + .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, }; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 03cc59ee9c953..2c17902881387 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1285,6 +1285,35 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, return ret; } +static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + struct htab_elem *l; + void __percpu *pptr; + int cpu; + + rcu_read_lock(); + + l = __htab_map_lookup_elem(map, key); + if (!l) { + rcu_read_unlock(); + return; + } + + btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); + seq_puts(m, ": {\n"); + pptr = htab_elem_get_ptr(l, map->key_size); + for_each_possible_cpu(cpu) { + seq_printf(m, "\tcpu%d: ", cpu); + btf_type_seq_show(map->btf, map->btf_value_type_id, + per_cpu_ptr(pptr, cpu), m); + seq_puts(m, "\n"); + } + seq_puts(m, "}\n"); + + rcu_read_unlock(); +} + const struct bpf_map_ops htab_percpu_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1293,6 +1322,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_lookup_elem = htab_percpu_map_lookup_elem, .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, + .map_seq_show_elem = htab_percpu_map_seq_show_elem, }; const struct bpf_map_ops htab_lru_percpu_map_ops = { @@ -1303,6 +1333,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_lookup_elem = htab_lru_percpu_map_lookup_elem, .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, + .map_seq_show_elem = htab_percpu_map_seq_show_elem, }; static int fd_htab_map_alloc_check(union bpf_attr *attr) From 6493ebf7242d9b9a2c50db91b9baeb0543990736 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 29 Aug 2018 14:43:14 -0700 Subject: [PATCH 18/28] tools/bpf: add bpffs percpu map pretty print tests in test_btf The bpf selftest test_btf is extended to test bpffs percpu map pretty print for percpu array, percpu hash and percpu lru hash. Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_btf.c | 179 ++++++++++++++++++++----- 1 file changed, 144 insertions(+), 35 deletions(-) diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index 6b5cfeb7a9cc3..f42b3396d6226 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -45,7 +46,6 @@ static int count_result(int err) return err; } -#define min(a, b) ((a) < (b) ? (a) : (b)) #define __printf(a, b) __attribute__((format(printf, a, b))) __printf(1, 2) @@ -130,6 +130,7 @@ struct btf_raw_test { bool map_create_err; bool ordered_map; bool lossless_map; + bool percpu_map; int hdr_len_delta; int type_off_delta; int str_off_delta; @@ -2157,6 +2158,7 @@ static struct btf_pprint_test_meta { const char *map_name; bool ordered_map; bool lossless_map; + bool percpu_map; } pprint_tests_meta[] = { { .descr = "BTF pretty print array", @@ -2164,6 +2166,7 @@ static struct btf_pprint_test_meta { .map_name = "pprint_test_array", .ordered_map = true, .lossless_map = true, + .percpu_map = false, }, { @@ -2172,6 +2175,7 @@ static struct btf_pprint_test_meta { .map_name = "pprint_test_hash", .ordered_map = false, .lossless_map = true, + .percpu_map = false, }, { @@ -2180,30 +2184,83 @@ static struct btf_pprint_test_meta { .map_name = "pprint_test_lru_hash", .ordered_map = false, .lossless_map = false, + .percpu_map = false, +}, + +{ + .descr = "BTF pretty print percpu array", + .map_type = BPF_MAP_TYPE_PERCPU_ARRAY, + .map_name = "pprint_test_percpu_array", + .ordered_map = true, + .lossless_map = true, + .percpu_map = true, +}, + +{ + .descr = "BTF pretty print percpu hash", + .map_type = BPF_MAP_TYPE_PERCPU_HASH, + .map_name = "pprint_test_percpu_hash", + .ordered_map = false, + .lossless_map = true, + .percpu_map = true, +}, + +{ + .descr = "BTF pretty print lru percpu hash", + .map_type = BPF_MAP_TYPE_LRU_PERCPU_HASH, + .map_name = "pprint_test_lru_percpu_hash", + .ordered_map = false, + .lossless_map = false, + .percpu_map = true, }, }; -static void set_pprint_mapv(struct pprint_mapv *v, uint32_t i) +static void set_pprint_mapv(struct pprint_mapv *v, uint32_t i, + int num_cpus, int rounded_value_size) { - v->ui32 = i; - v->si32 = -i; - v->unused_bits2a = 3; - v->bits28 = i; - v->unused_bits2b = 3; - v->ui64 = i; - v->aenum = i & 0x03; + int cpu; + + for (cpu = 0; cpu < num_cpus; cpu++) { + v->ui32 = i + cpu; + v->si32 = -i; + v->unused_bits2a = 3; + v->bits28 = i; + v->unused_bits2b = 3; + v->ui64 = i; + v->aenum = i & 0x03; + v = (void *)v + rounded_value_size; + } } +static int check_line(const char *expected_line, int nexpected_line, + int expected_line_len, const char *line) +{ + if (CHECK(nexpected_line == expected_line_len, + "expected_line is too long")) + return -1; + + if (strcmp(expected_line, line)) { + fprintf(stderr, "unexpected pprint output\n"); + fprintf(stderr, "expected: %s", expected_line); + fprintf(stderr, " read: %s", line); + return -1; + } + + return 0; +} + + static int do_test_pprint(void) { const struct btf_raw_test *test = &pprint_test_template; struct bpf_create_map_attr create_attr = {}; + bool ordered_map, lossless_map, percpu_map; + int err, ret, num_cpus, rounded_value_size; + struct pprint_mapv *mapv = NULL; unsigned int key, nr_read_elems; - bool ordered_map, lossless_map; int map_fd = -1, btf_fd = -1; - struct pprint_mapv mapv = {}; unsigned int raw_btf_size; char expected_line[255]; FILE *pin_file = NULL; @@ -2212,7 +2269,6 @@ static int do_test_pprint(void) char *line = NULL; uint8_t *raw_btf; ssize_t nread; - int err, ret; fprintf(stderr, "%s......", test->descr); raw_btf = btf_raw_create(&hdr_tmpl, test->raw_types, @@ -2261,9 +2317,18 @@ static int do_test_pprint(void) if (CHECK(err, "bpf_obj_pin(%s): errno:%d.", pin_path, errno)) goto done; + percpu_map = test->percpu_map; + num_cpus = percpu_map ? bpf_num_possible_cpus() : 1; + rounded_value_size = round_up(sizeof(struct pprint_mapv), 8); + mapv = calloc(num_cpus, rounded_value_size); + if (CHECK(!mapv, "mapv allocation failure")) { + err = -1; + goto done; + } + for (key = 0; key < test->max_entries; key++) { - set_pprint_mapv(&mapv, key); - bpf_map_update_elem(map_fd, &key, &mapv, 0); + set_pprint_mapv(mapv, key, num_cpus, rounded_value_size); + bpf_map_update_elem(map_fd, &key, mapv, 0); } pin_file = fopen(pin_path, "r"); @@ -2286,33 +2351,74 @@ static int do_test_pprint(void) ordered_map = test->ordered_map; lossless_map = test->lossless_map; do { + struct pprint_mapv *cmapv; ssize_t nexpected_line; unsigned int next_key; + int cpu; next_key = ordered_map ? nr_read_elems : atoi(line); - set_pprint_mapv(&mapv, next_key); - nexpected_line = snprintf(expected_line, sizeof(expected_line), - "%u: {%u,0,%d,0x%x,0x%x,0x%x,{%lu|[%u,%u,%u,%u,%u,%u,%u,%u]},%s}\n", - next_key, - mapv.ui32, mapv.si32, - mapv.unused_bits2a, mapv.bits28, mapv.unused_bits2b, - mapv.ui64, - mapv.ui8a[0], mapv.ui8a[1], mapv.ui8a[2], mapv.ui8a[3], - mapv.ui8a[4], mapv.ui8a[5], mapv.ui8a[6], mapv.ui8a[7], - pprint_enum_str[mapv.aenum]); - - if (CHECK(nexpected_line == sizeof(expected_line), - "expected_line is too long")) { - err = -1; - goto done; + set_pprint_mapv(mapv, next_key, num_cpus, rounded_value_size); + cmapv = mapv; + + for (cpu = 0; cpu < num_cpus; cpu++) { + if (percpu_map) { + /* for percpu map, the format looks like: + * : { + * cpu0: + * cpu1: + * ... + * cpun: + * } + * + * let us verify the line containing the key here. + */ + if (cpu == 0) { + nexpected_line = snprintf(expected_line, + sizeof(expected_line), + "%u: {\n", + next_key); + + err = check_line(expected_line, nexpected_line, + sizeof(expected_line), line); + if (err == -1) + goto done; + } + + /* read value@cpu */ + nread = getline(&line, &line_len, pin_file); + if (nread < 0) + break; + } + + nexpected_line = snprintf(expected_line, sizeof(expected_line), + "%s%u: {%u,0,%d,0x%x,0x%x,0x%x," + "{%lu|[%u,%u,%u,%u,%u,%u,%u,%u]},%s}\n", + percpu_map ? "\tcpu" : "", + percpu_map ? cpu : next_key, + cmapv->ui32, cmapv->si32, + cmapv->unused_bits2a, + cmapv->bits28, + cmapv->unused_bits2b, + cmapv->ui64, + cmapv->ui8a[0], cmapv->ui8a[1], + cmapv->ui8a[2], cmapv->ui8a[3], + cmapv->ui8a[4], cmapv->ui8a[5], + cmapv->ui8a[6], cmapv->ui8a[7], + pprint_enum_str[cmapv->aenum]); + + err = check_line(expected_line, nexpected_line, + sizeof(expected_line), line); + if (err == -1) + goto done; + + cmapv = (void *)cmapv + rounded_value_size; } - if (strcmp(expected_line, line)) { - err = -1; - fprintf(stderr, "unexpected pprint output\n"); - fprintf(stderr, "expected: %s", expected_line); - fprintf(stderr, " read: %s", line); - goto done; + if (percpu_map) { + /* skip the last bracket for the percpu map */ + nread = getline(&line, &line_len, pin_file); + if (nread < 0) + break; } nread = getline(&line, &line_len, pin_file); @@ -2334,6 +2440,8 @@ static int do_test_pprint(void) err = 0; done: + if (mapv) + free(mapv); if (!err) fprintf(stderr, "OK"); if (*btf_log_buf && (err || args.always_log)) @@ -2361,6 +2469,7 @@ static int test_pprint(void) pprint_test_template.map_name = pprint_tests_meta[i].map_name; pprint_test_template.ordered_map = pprint_tests_meta[i].ordered_map; pprint_test_template.lossless_map = pprint_tests_meta[i].lossless_map; + pprint_test_template.percpu_map = pprint_tests_meta[i].percpu_map; err |= count_result(do_test_pprint()); } From 1a86ad89da1c06b2a326953309dcb99f0d079a32 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 29 Aug 2018 14:43:15 -0700 Subject: [PATCH 19/28] tools/bpf: bpftool: add btf percpu map formated dump The btf pretty print is added to percpu arraymap, percpu hashmap and percpu lru hashmap. For each pair, the following will be added to plain/json output: { "key": , "values": [{ "cpu": 0, "value": },{ "cpu": 1, "value": },{ .... },{ "cpu": n, "value": } ] } For example, the following could be part of plain or json formatted output: { "key": 0, "values": [{ "cpu": 0, "value": { "ui32": 0, "ui16": 0, } },{ "cpu": 1, "value": { "ui32": 1, "ui16": 0, } },{ "cpu": 2, "value": { "ui32": 2, "ui16": 0, } },{ "cpu": 3, "value": { "ui32": 3, "ui16": 0, } } ] } Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/map.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index b2ec20e562bd5..df175bc33c5d6 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -169,9 +169,28 @@ static int do_dump_btf(const struct btf_dumper *d, if (ret) goto err_end_obj; - jsonw_name(d->jw, "value"); + if (!map_is_per_cpu(map_info->type)) { + jsonw_name(d->jw, "value"); + ret = btf_dumper_type(d, map_info->btf_value_type_id, value); + } else { + unsigned int i, n, step; - ret = btf_dumper_type(d, map_info->btf_value_type_id, value); + jsonw_name(d->jw, "values"); + jsonw_start_array(d->jw); + n = get_possible_cpus(); + step = round_up(map_info->value_size, 8); + for (i = 0; i < n; i++) { + jsonw_start_object(d->jw); + jsonw_int_field(d->jw, "cpu", i); + jsonw_name(d->jw, "value"); + ret = btf_dumper_type(d, map_info->btf_value_type_id, + value + i * step); + jsonw_end_object(d->jw); + if (ret) + break; + } + jsonw_end_array(d->jw); + } err_end_obj: /* end of key-value pair */ @@ -298,6 +317,16 @@ static void print_entry_json(struct bpf_map_info *info, unsigned char *key, jsonw_end_object(json_wtr); } jsonw_end_array(json_wtr); + if (btf) { + struct btf_dumper d = { + .btf = btf, + .jw = json_wtr, + .is_plain_text = false, + }; + + jsonw_name(json_wtr, "formatted"); + do_dump_btf(&d, info, key, value); + } } jsonw_end_object(json_wtr); From 18baed2684b02f752b5d800bb113e99730e1d24b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 30 Aug 2018 15:12:48 +0200 Subject: [PATCH 20/28] xsk: include XDP meta data in AF_XDP frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, the AF_XDP (XDP_DRV/XDP_SKB copy-mode) ingress logic did not include XDP meta data in the data buffers copied out to the user application. In this commit, we check if meta data is available, and if so, it is prepended to the frame. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 4e937cd7c17dc..569048e299dfb 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -55,20 +55,30 @@ EXPORT_SYMBOL(xsk_umem_discard_addr); static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - void *buffer; + void *to_buf, *from_buf; + u32 metalen; u64 addr; int err; if (!xskq_peek_addr(xs->umem->fq, &addr) || - len > xs->umem->chunk_size_nohr) { + len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { xs->rx_dropped++; return -ENOSPC; } addr += xs->umem->headroom; - buffer = xdp_umem_get_data(xs->umem, addr); - memcpy(buffer, xdp->data, len); + if (unlikely(xdp_data_meta_unsupported(xdp))) { + from_buf = xdp->data; + metalen = 0; + } else { + from_buf = xdp->data_meta; + metalen = xdp->data - xdp->data_meta; + } + + to_buf = xdp_umem_get_data(xs->umem, addr); + memcpy(to_buf, from_buf, len + metalen); + addr += metalen; err = xskq_produce_batch_desc(xs->rx, addr, len); if (!err) { xskq_discard_addr(xs->umem->fq); @@ -111,6 +121,7 @@ void xsk_flush(struct xdp_sock *xs) int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { + u32 metalen = xdp->data - xdp->data_meta; u32 len = xdp->data_end - xdp->data; void *buffer; u64 addr; @@ -120,7 +131,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) return -EINVAL; if (!xskq_peek_addr(xs->umem->fq, &addr) || - len > xs->umem->chunk_size_nohr) { + len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { xs->rx_dropped++; return -ENOSPC; } @@ -128,7 +139,8 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) addr += xs->umem->headroom; buffer = xdp_umem_get_data(xs->umem, addr); - memcpy(buffer, xdp->data, len); + memcpy(buffer, xdp->data_meta, len + metalen); + addr += metalen; err = xskq_produce_batch_desc(xs->rx, addr, len); if (!err) { xskq_discard_addr(xs->umem->fq); From 7296216776dbbe5094e218035debe1c0a5a3674a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 30 Aug 2018 15:27:18 +0100 Subject: [PATCH 21/28] xdp: remove redundant variable 'headroom' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Variable 'headroom' is being assigned but is never used hence it is redundant and can be removed. Cleans up clang warning: variable ‘headroom’ set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Acked-by: Björn Töpel Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- net/core/xdp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/core/xdp.c b/net/core/xdp.c index 654dbb19707e7..4b2b194f4f1f3 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -412,7 +412,7 @@ EXPORT_SYMBOL_GPL(xdp_attachment_setup); struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) { - unsigned int metasize, headroom, totsize; + unsigned int metasize, totsize; void *addr, *data_to_copy; struct xdp_frame *xdpf; struct page *page; @@ -420,7 +420,6 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) /* Clone into a MEM_TYPE_PAGE_ORDER0 xdp_frame. */ metasize = xdp_data_meta_unsupported(xdp) ? 0 : xdp->data - xdp->data_meta; - headroom = xdp->data - xdp->data_hard_start; totsize = xdp->data_end - xdp->data + metasize; if (sizeof(*xdpf) + totsize > PAGE_SIZE) From 1e215300f1384072c62459a1c194a08dbe05ff2c Mon Sep 17 00:00:00 2001 From: "Nikita V. Shirokov" Date: Thu, 30 Aug 2018 07:51:53 -0700 Subject: [PATCH 22/28] bpf: add TCP_SAVE_SYN/TCP_SAVED_SYN options for bpf_(set|get)sockopt Adding support for two new bpf get/set sockopts: TCP_SAVE_SYN (set) and TCP_SAVED_SYN (get). This would allow for bpf program to build logic based on data from ingress SYN packet (e.g. doing tcp's tos/ tclass reflection (see sample prog)) and do it transparently from userspace program point of view. Signed-off-by: Nikita V. Shirokov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- net/core/filter.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index c25eb36f13204..feb578506009f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4007,6 +4007,12 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, tp->snd_ssthresh = val; } break; + case TCP_SAVE_SYN: + if (val < 0 || val > 1) + ret = -EINVAL; + else + tp->save_syn = val; + break; default: ret = -EINVAL; } @@ -4032,21 +4038,32 @@ static const struct bpf_func_proto bpf_setsockopt_proto = { BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { + struct inet_connection_sock *icsk; struct sock *sk = bpf_sock->sk; + struct tcp_sock *tp; if (!sk_fullsock(sk)) goto err_clear; - #ifdef CONFIG_INET if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { - if (optname == TCP_CONGESTION) { - struct inet_connection_sock *icsk = inet_csk(sk); + switch (optname) { + case TCP_CONGESTION: + icsk = inet_csk(sk); if (!icsk->icsk_ca_ops || optlen <= 1) goto err_clear; strncpy(optval, icsk->icsk_ca_ops->name, optlen); optval[optlen - 1] = 0; - } else { + break; + case TCP_SAVED_SYN: + tp = tcp_sk(sk); + + if (optlen <= 0 || !tp->saved_syn || + optlen > tp->saved_syn[0]) + goto err_clear; + memcpy(optval, tp->saved_syn + 1, optlen); + break; + default: goto err_clear; } } else if (level == SOL_IP) { From acb4ea9564152bba8b4c2d66f8c1ba21e7a94d90 Mon Sep 17 00:00:00 2001 From: "Nikita V. Shirokov" Date: Thu, 30 Aug 2018 07:51:54 -0700 Subject: [PATCH 23/28] bpf: add TCP_SAVE_SYN/TCP_SAVED_SYN sample program Sample program which shows TCP_SAVE_SYN/TCP_SAVED_SYN usage example: bpf program which is doing TOS/TCLASS reflection (server would reply with a same TOS/TCLASS as client). Signed-off-by: Nikita V. Shirokov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- samples/bpf/Makefile | 1 + samples/bpf/tcp_tos_reflect_kern.c | 87 ++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 samples/bpf/tcp_tos_reflect_kern.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 36f9f41d094b2..be0a961450bc2 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -153,6 +153,7 @@ always += tcp_cong_kern.o always += tcp_iw_kern.o always += tcp_clamp_kern.o always += tcp_basertt_kern.o +always += tcp_tos_reflect_kern.o always += xdp_redirect_kern.o always += xdp_redirect_map_kern.o always += xdp_redirect_cpu_kern.o diff --git a/samples/bpf/tcp_tos_reflect_kern.c b/samples/bpf/tcp_tos_reflect_kern.c new file mode 100644 index 0000000000000..d51dab19eca62 --- /dev/null +++ b/samples/bpf/tcp_tos_reflect_kern.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Facebook + * + * BPF program to automatically reflect TOS option from received syn packet + * + * Use load_sock_ops to load this BPF program. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "bpf_helpers.h" +#include "bpf_endian.h" + +#define DEBUG 1 + +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) + +SEC("sockops") +int bpf_basertt(struct bpf_sock_ops *skops) +{ + char header[sizeof(struct ipv6hdr)]; + struct ipv6hdr *hdr6; + struct iphdr *hdr; + int hdr_size = 0; + int save_syn = 1; + int tos = 0; + int rv = 0; + int op; + + op = (int) skops->op; + +#ifdef DEBUG + bpf_printk("BPF command: %d\n", op); +#endif + switch (op) { + case BPF_SOCK_OPS_TCP_LISTEN_CB: + rv = bpf_setsockopt(skops, SOL_TCP, TCP_SAVE_SYN, + &save_syn, sizeof(save_syn)); + break; + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + if (skops->family == AF_INET) + hdr_size = sizeof(struct iphdr); + else + hdr_size = sizeof(struct ipv6hdr); + rv = bpf_getsockopt(skops, SOL_TCP, TCP_SAVED_SYN, + header, hdr_size); + if (!rv) { + if (skops->family == AF_INET) { + hdr = (struct iphdr *) header; + tos = hdr->tos; + if (tos != 0) + bpf_setsockopt(skops, SOL_IP, IP_TOS, + &tos, sizeof(tos)); + } else { + hdr6 = (struct ipv6hdr *) header; + tos = ((hdr6->priority) << 4 | + (hdr6->flow_lbl[0]) >> 4); + if (tos) + bpf_setsockopt(skops, SOL_IPV6, + IPV6_TCLASS, + &tos, sizeof(tos)); + } + rv = 0; + } + break; + default: + rv = -1; + } +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; From a29c8bb640ca13f5731510ceaf65a6e1fe6055c8 Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Fri, 31 Aug 2018 09:59:16 +0900 Subject: [PATCH 24/28] xsk: remove unnecessary assignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since xdp_umem_query() was added one assignment of bpf.command was missed from cleanup. Removing the assignment statement. Fixes: 84c6b86875e01a0 ("xsk: don't allow umem replace at stack level") Signed-off-by: Prashant Bhole Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index bfe2dbea480ba..d179732617dc8 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -76,8 +76,6 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_async_xmit) return force_zc ? -EOPNOTSUPP : 0; /* fail or fallback */ - bpf.command = XDP_QUERY_XSK_UMEM; - rtnl_lock(); err = xdp_umem_query(dev, queue_id); if (err) { From 11c3f5113600c40cc8789f9a7023e02b73cb0ca7 Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Fri, 31 Aug 2018 10:00:49 +0900 Subject: [PATCH 25/28] samples/bpf: xdpsock, minor fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - xsks_map size was fixed to 4, changed it MAX_SOCKS - Remove redundant definition of MAX_SOCKS in xdpsock_user.c - In dump_stats(), add NULL check for xsks[i] Signed-off-by: Prashant Bhole Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- samples/bpf/xdpsock_kern.c | 2 +- samples/bpf/xdpsock_user.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/samples/bpf/xdpsock_kern.c b/samples/bpf/xdpsock_kern.c index d8806c41362e2..b8ccd0802b3fa 100644 --- a/samples/bpf/xdpsock_kern.c +++ b/samples/bpf/xdpsock_kern.c @@ -16,7 +16,7 @@ struct bpf_map_def SEC("maps") xsks_map = { .type = BPF_MAP_TYPE_XSKMAP, .key_size = sizeof(int), .value_size = sizeof(int), - .max_entries = 4, + .max_entries = MAX_SOCKS, }; struct bpf_map_def SEC("maps") rr_map = { diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index b3906111bedb6..57ecadc584039 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -118,7 +118,6 @@ struct xdpsock { unsigned long prev_tx_npkts; }; -#define MAX_SOCKS 4 static int num_socks; struct xdpsock *xsks[MAX_SOCKS]; @@ -596,7 +595,7 @@ static void dump_stats(void) prev_time = now; - for (i = 0; i < num_socks; i++) { + for (i = 0; i < num_socks && xsks[i]; i++) { char *fmt = "%-15s %'-11.0f %'-11lu\n"; double rx_pps, tx_pps; From 9746b1ee2edcabfb8f40d5311f23ac2f01f152ed Mon Sep 17 00:00:00 2001 From: "Nikita V. Shirokov" Date: Fri, 31 Aug 2018 09:43:47 -0700 Subject: [PATCH 26/28] bpf: add selftest for bpf's (set|get)_sockopt for SAVE_SYN adding selftest for feature, introduced in commit 9452048c79404 ("bpf: add TCP_SAVE_SYN/TCP_SAVED_SYN options for bpf_(set|get)sockopt"). Signed-off-by: Nikita V. Shirokov Signed-off-by: Daniel Borkmann --- .../testing/selftests/bpf/test_tcpbpf_kern.c | 38 +++++++++++++++++-- .../testing/selftests/bpf/test_tcpbpf_user.c | 31 ++++++++++++++- 2 files changed, 65 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/test_tcpbpf_kern.c index 4b7fd540cea9d..74f73b33a7b07 100644 --- a/tools/testing/selftests/bpf/test_tcpbpf_kern.c +++ b/tools/testing/selftests/bpf/test_tcpbpf_kern.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -17,6 +18,13 @@ struct bpf_map_def SEC("maps") global_map = { .type = BPF_MAP_TYPE_ARRAY, .key_size = sizeof(__u32), .value_size = sizeof(struct tcpbpf_globals), + .max_entries = 4, +}; + +struct bpf_map_def SEC("maps") sockopt_results = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(int), .max_entries = 2, }; @@ -45,11 +53,14 @@ int _version SEC("version") = 1; SEC("sockops") int bpf_testcb(struct bpf_sock_ops *skops) { - int rv = -1; - int bad_call_rv = 0; + char header[sizeof(struct ipv6hdr) + sizeof(struct tcphdr)]; + struct tcphdr *thdr; int good_call_rv = 0; - int op; + int bad_call_rv = 0; + int save_syn = 1; + int rv = -1; int v = 0; + int op; op = (int) skops->op; @@ -82,6 +93,21 @@ int bpf_testcb(struct bpf_sock_ops *skops) v = 0xff; rv = bpf_setsockopt(skops, SOL_IPV6, IPV6_TCLASS, &v, sizeof(v)); + if (skops->family == AF_INET6) { + v = bpf_getsockopt(skops, IPPROTO_TCP, TCP_SAVED_SYN, + header, (sizeof(struct ipv6hdr) + + sizeof(struct tcphdr))); + if (!v) { + int offset = sizeof(struct ipv6hdr); + + thdr = (struct tcphdr *)(header + offset); + v = thdr->syn; + __u32 key = 1; + + bpf_map_update_elem(&sockopt_results, &key, &v, + BPF_ANY); + } + } break; case BPF_SOCK_OPS_RTO_CB: break; @@ -111,6 +137,12 @@ int bpf_testcb(struct bpf_sock_ops *skops) break; case BPF_SOCK_OPS_TCP_LISTEN_CB: bpf_sock_ops_cb_flags_set(skops, BPF_SOCK_OPS_STATE_CB_FLAG); + v = bpf_setsockopt(skops, IPPROTO_TCP, TCP_SAVE_SYN, + &save_syn, sizeof(save_syn)); + /* Update global map w/ result of setsock opt */ + __u32 key = 0; + + bpf_map_update_elem(&sockopt_results, &key, &v, BPF_ANY); break; default: rv = -1; diff --git a/tools/testing/selftests/bpf/test_tcpbpf_user.c b/tools/testing/selftests/bpf/test_tcpbpf_user.c index a275c29713760..e6eebda7d112b 100644 --- a/tools/testing/selftests/bpf/test_tcpbpf_user.c +++ b/tools/testing/selftests/bpf/test_tcpbpf_user.c @@ -54,6 +54,26 @@ int verify_result(const struct tcpbpf_globals *result) return -1; } +int verify_sockopt_result(int sock_map_fd) +{ + __u32 key = 0; + int res; + int rv; + + /* check setsockopt for SAVE_SYN */ + rv = bpf_map_lookup_elem(sock_map_fd, &key, &res); + EXPECT_EQ(0, rv, "d"); + EXPECT_EQ(0, res, "d"); + key = 1; + /* check getsockopt for SAVED_SYN */ + rv = bpf_map_lookup_elem(sock_map_fd, &key, &res); + EXPECT_EQ(0, rv, "d"); + EXPECT_EQ(1, res, "d"); + return 0; +err: + return -1; +} + static int bpf_find_map(const char *test, struct bpf_object *obj, const char *name) { @@ -70,11 +90,11 @@ static int bpf_find_map(const char *test, struct bpf_object *obj, int main(int argc, char **argv) { const char *file = "test_tcpbpf_kern.o"; + int prog_fd, map_fd, sock_map_fd; struct tcpbpf_globals g = {0}; const char *cg_path = "/foo"; int error = EXIT_FAILURE; struct bpf_object *obj; - int prog_fd, map_fd; int cg_fd = -1; __u32 key = 0; int rv; @@ -110,6 +130,10 @@ int main(int argc, char **argv) if (map_fd < 0) goto err; + sock_map_fd = bpf_find_map(__func__, obj, "sockopt_results"); + if (sock_map_fd < 0) + goto err; + rv = bpf_map_lookup_elem(map_fd, &key, &g); if (rv != 0) { printf("FAILED: bpf_map_lookup_elem returns %d\n", rv); @@ -121,6 +145,11 @@ int main(int argc, char **argv) goto err; } + if (verify_sockopt_result(sock_map_fd)) { + printf("FAILED: Wrong sockopt stats\n"); + goto err; + } + printf("PASSED!\n"); error = 0; err: From cf484f9f91f702e61eb47f7b142de6edc33a4a88 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 31 Aug 2018 13:40:01 +0200 Subject: [PATCH 27/28] i40e: fix possible compiler warning in xsk TX path With certain gcc versions, it was possible to get the warning "'tx_desc' may be used uninitialized in this function" for the i40e_xmit_zc. This was not possible, however this commit simplifies the code path so that this warning is no longer emitted. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 94947a826bc3a..41ca7e1310bc0 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -668,9 +668,8 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) **/ static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget) { - unsigned int total_packets = 0; + struct i40e_tx_desc *tx_desc = NULL; struct i40e_tx_buffer *tx_bi; - struct i40e_tx_desc *tx_desc; bool work_done = true; dma_addr_t dma; u32 len; @@ -697,14 +696,13 @@ static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget) build_ctob(I40E_TX_DESC_CMD_ICRC | I40E_TX_DESC_CMD_EOP, 0, len, 0); - total_packets++; xdp_ring->next_to_use++; if (xdp_ring->next_to_use == xdp_ring->count) xdp_ring->next_to_use = 0; } - if (total_packets > 0) { + if (tx_desc) { /* Request an interrupt for the last frame and bump tail ptr. */ tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS << I40E_TXD_QW1_CMD_SHIFT); From 93ee30f3e8b412c5fc2d2f7d9d002529d9a209ad Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 31 Aug 2018 13:40:02 +0200 Subject: [PATCH 28/28] xsk: i40e: get rid of useless struct xdp_umem_props This commit gets rid of the structure xdp_umem_props. It was there to be able to break a dependency at one point, but this is no longer needed. The values in the struct are instead stored directly in the xdp_umem structure. This simplifies the xsk code as well as af_xdp zero-copy drivers and as a bonus gets rid of one internal header file. The i40e driver is also adapted to the new interface in this commit. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 4 ++-- include/net/xdp_sock.h | 8 ++------ net/xdp/xdp_umem.c | 4 ++-- net/xdp/xdp_umem_props.h | 14 -------------- net/xdp/xsk.c | 10 ++++++---- net/xdp/xsk_queue.c | 5 +++-- net/xdp/xsk_queue.h | 13 +++++++------ 7 files changed, 22 insertions(+), 36 deletions(-) delete mode 100644 net/xdp/xdp_umem_props.h diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 41ca7e1310bc0..2ebfc78bbd098 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -442,7 +442,7 @@ static void i40e_reuse_rx_buffer_zc(struct i40e_ring *rx_ring, struct i40e_rx_buffer *old_bi) { struct i40e_rx_buffer *new_bi = &rx_ring->rx_bi[rx_ring->next_to_alloc]; - unsigned long mask = (unsigned long)rx_ring->xsk_umem->props.chunk_mask; + unsigned long mask = (unsigned long)rx_ring->xsk_umem->chunk_mask; u64 hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM; u16 nta = rx_ring->next_to_alloc; @@ -477,7 +477,7 @@ void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle) rx_ring = container_of(alloc, struct i40e_ring, zca); hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM; - mask = rx_ring->xsk_umem->props.chunk_mask; + mask = rx_ring->xsk_umem->chunk_mask; nta = rx_ring->next_to_alloc; bi = &rx_ring->rx_bi[nta]; diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 56994ad1ab409..932ca0dad6f30 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -16,11 +16,6 @@ struct net_device; struct xsk_queue; -struct xdp_umem_props { - u64 chunk_mask; - u64 size; -}; - struct xdp_umem_page { void *addr; dma_addr_t dma; @@ -30,7 +25,8 @@ struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; struct xdp_umem_page *pages; - struct xdp_umem_props props; + u64 chunk_mask; + u64 size; u32 headroom; u32 chunk_size_nohr; struct user_struct *user; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index d179732617dc8..b3b632c5aeaee 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -312,8 +312,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->pid = get_task_pid(current, PIDTYPE_PID); umem->address = (unsigned long)addr; - umem->props.chunk_mask = ~((u64)chunk_size - 1); - umem->props.size = size; + umem->chunk_mask = ~((u64)chunk_size - 1); + umem->size = size; umem->headroom = headroom; umem->chunk_size_nohr = chunk_size - headroom; umem->npgs = size / PAGE_SIZE; diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h deleted file mode 100644 index 40eab10dfc49d..0000000000000 --- a/net/xdp/xdp_umem_props.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* XDP user-space packet buffer - * Copyright(c) 2018 Intel Corporation. - */ - -#ifndef XDP_UMEM_PROPS_H_ -#define XDP_UMEM_PROPS_H_ - -struct xdp_umem_props { - u64 chunk_mask; - u64 size; -}; - -#endif /* XDP_UMEM_PROPS_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 569048e299dfb..5a432dfee4eee 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -470,8 +470,10 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_unlock; } else { /* This xsk has its own umem. */ - xskq_set_umem(xs->umem->fq, &xs->umem->props); - xskq_set_umem(xs->umem->cq, &xs->umem->props); + xskq_set_umem(xs->umem->fq, xs->umem->size, + xs->umem->chunk_mask); + xskq_set_umem(xs->umem->cq, xs->umem->size, + xs->umem->chunk_mask); err = xdp_umem_assign_dev(xs->umem, dev, qid, flags); if (err) @@ -481,8 +483,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xs->dev = dev; xs->zc = xs->umem->zc; xs->queue_id = qid; - xskq_set_umem(xs->rx, &xs->umem->props); - xskq_set_umem(xs->tx, &xs->umem->props); + xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask); + xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask); xdp_add_sk_umem(xs->umem, xs); out_unlock: diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 6c32e92e98fcb..2dc1384d9f275 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -7,12 +7,13 @@ #include "xsk_queue.h" -void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props) +void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask) { if (!q) return; - q->umem_props = *umem_props; + q->size = size; + q->chunk_mask = chunk_mask; } static u32 xskq_umem_get_ring_size(struct xsk_queue *q) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 8a64b150be546..82252cccb4e00 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -31,7 +31,8 @@ struct xdp_umem_ring { }; struct xsk_queue { - struct xdp_umem_props umem_props; + u64 chunk_mask; + u64 size; u32 ring_mask; u32 nentries; u32 prod_head; @@ -78,7 +79,7 @@ static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr) { - if (addr >= q->umem_props.size) { + if (addr >= q->size) { q->invalid_descs++; return false; } @@ -92,7 +93,7 @@ static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr) struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; unsigned int idx = q->cons_tail & q->ring_mask; - *addr = READ_ONCE(ring->desc[idx]) & q->umem_props.chunk_mask; + *addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask; if (xskq_is_valid_addr(q, *addr)) return addr; @@ -173,8 +174,8 @@ static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d) if (!xskq_is_valid_addr(q, d->addr)) return false; - if (((d->addr + d->len) & q->umem_props.chunk_mask) != - (d->addr & q->umem_props.chunk_mask)) { + if (((d->addr + d->len) & q->chunk_mask) != + (d->addr & q->chunk_mask)) { q->invalid_descs++; return false; } @@ -253,7 +254,7 @@ static inline bool xskq_empty_desc(struct xsk_queue *q) return xskq_nb_free(q, q->prod_tail, q->nentries) == q->nentries; } -void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props); +void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops);