diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index b99e7ffef7a15..e41015310a6e3 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -62,6 +62,12 @@ attribute-sets: type: u64 enum: xdp-act enum-as-flags: true + - + name: xdp_zc_max_segs + doc: max fragment count supported by ZC driver + type: u32 + checks: + min: 1 operations: list: diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst index 1cc35de336a41..dceeb0d763aa2 100644 --- a/Documentation/networking/af_xdp.rst +++ b/Documentation/networking/af_xdp.rst @@ -462,8 +462,92 @@ XDP_OPTIONS getsockopt Gets options from an XDP socket. The only one supported so far is XDP_OPTIONS_ZEROCOPY which tells you if zero-copy is on or not. +Multi-Buffer Support +==================== + +With multi-buffer support, programs using AF_XDP sockets can receive +and transmit packets consisting of multiple buffers both in copy and +zero-copy mode. For example, a packet can consist of two +frames/buffers, one with the header and the other one with the data, +or a 9K Ethernet jumbo frame can be constructed by chaining together +three 4K frames. + +Some definitions: + +* A packet consists of one or more frames + +* A descriptor in one of the AF_XDP rings always refers to a single + frame. In the case the packet consists of a single frame, the + descriptor refers to the whole packet. + +To enable multi-buffer support for an AF_XDP socket, use the new bind +flag XDP_USE_SG. If this is not provided, all multi-buffer packets +will be dropped just as before. Note that the XDP program loaded also +needs to be in multi-buffer mode. This can be accomplished by using +"xdp.frags" as the section name of the XDP program used. + +To represent a packet consisting of multiple frames, a new flag called +XDP_PKT_CONTD is introduced in the options field of the Rx and Tx +descriptors. If it is true (1) the packet continues with the next +descriptor and if it is false (0) it means this is the last descriptor +of the packet. Why the reverse logic of end-of-packet (eop) flag found +in many NICs? Just to preserve compatibility with non-multi-buffer +applications that have this bit set to false for all packets on Rx, +and the apps set the options field to zero for Tx, as anything else +will be treated as an invalid descriptor. + +These are the semantics for producing packets onto AF_XDP Tx ring +consisting of multiple frames: + +* When an invalid descriptor is found, all the other + descriptors/frames of this packet are marked as invalid and not + completed. The next descriptor is treated as the start of a new + packet, even if this was not the intent (because we cannot guess + the intent). As before, if your program is producing invalid + descriptors you have a bug that must be fixed. + +* Zero length descriptors are treated as invalid descriptors. + +* For copy mode, the maximum supported number of frames in a packet is + equal to CONFIG_MAX_SKB_FRAGS + 1. If it is exceeded, all + descriptors accumulated so far are dropped and treated as + invalid. To produce an application that will work on any system + regardless of this config setting, limit the number of frags to 18, + as the minimum value of the config is 17. + +* For zero-copy mode, the limit is up to what the NIC HW + supports. Usually at least five on the NICs we have checked. We + consciously chose to not enforce a rigid limit (such as + CONFIG_MAX_SKB_FRAGS + 1) for zero-copy mode, as it would have + resulted in copy actions under the hood to fit into what limit the + NIC supports. Kind of defeats the purpose of zero-copy mode. How to + probe for this limit is explained in the "probe for multi-buffer + support" section. + +On the Rx path in copy-mode, the xsk core copies the XDP data into +multiple descriptors, if needed, and sets the XDP_PKT_CONTD flag as +detailed before. Zero-copy mode works the same, though the data is not +copied. When the application gets a descriptor with the XDP_PKT_CONTD +flag set to one, it means that the packet consists of multiple buffers +and it continues with the next buffer in the following +descriptor. When a descriptor with XDP_PKT_CONTD == 0 is received, it +means that this is the last buffer of the packet. AF_XDP guarantees +that only a complete packet (all frames in the packet) is sent to the +application. If there is not enough space in the AF_XDP Rx ring, all +frames of the packet will be dropped. + +If application reads a batch of descriptors, using for example the libxdp +interfaces, it is not guaranteed that the batch will end with a full +packet. It might end in the middle of a packet and the rest of the +buffers of that packet will arrive at the beginning of the next batch, +since the libxdp interface does not read the whole ring (unless you +have an enormous batch size or a very small ring size). + +An example program each for Rx and Tx multi-buffer support can be found +later in this document. + Usage -===== +----- In order to use AF_XDP sockets two parts are needed. The user-space application and the XDP program. For a complete setup and @@ -541,6 +625,131 @@ like this: But please use the libbpf functions as they are optimized and ready to use. Will make your life easier. +Usage Multi-Buffer Rx +--------------------- + +Here is a simple Rx path pseudo-code example (using libxdp interfaces +for simplicity). Error paths have been excluded to keep it short: + +.. code-block:: c + + void rx_packets(struct xsk_socket_info *xsk) + { + static bool new_packet = true; + u32 idx_rx = 0, idx_fq = 0; + static char *pkt; + + int rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); + + xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); + + for (int i = 0; i < rcvd; i++) { + struct xdp_desc *desc = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++); + char *frag = xsk_umem__get_data(xsk->umem->buffer, desc->addr); + bool eop = !(desc->options & XDP_PKT_CONTD); + + if (new_packet) + pkt = frag; + else + add_frag_to_pkt(pkt, frag); + + if (eop) + process_pkt(pkt); + + new_packet = eop; + + *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = desc->addr; + } + + xsk_ring_prod__submit(&xsk->umem->fq, rcvd); + xsk_ring_cons__release(&xsk->rx, rcvd); + } + +Usage Multi-Buffer Tx +--------------------- + +Here is an example Tx path pseudo-code (using libxdp interfaces for +simplicity) ignoring that the umem is finite in size, and that we +eventually will run out of packets to send. Also assumes pkts.addr +points to a valid location in the umem. + +.. code-block:: c + + void tx_packets(struct xsk_socket_info *xsk, struct pkt *pkts, + int batch_size) + { + u32 idx, i, pkt_nb = 0; + + xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx); + + for (i = 0; i < batch_size;) { + u64 addr = pkts[pkt_nb].addr; + u32 len = pkts[pkt_nb].size; + + do { + struct xdp_desc *tx_desc; + + tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i++); + tx_desc->addr = addr; + + if (len > xsk_frame_size) { + tx_desc->len = xsk_frame_size; + tx_desc->options = XDP_PKT_CONTD; + } else { + tx_desc->len = len; + tx_desc->options = 0; + pkt_nb++; + } + len -= tx_desc->len; + addr += xsk_frame_size; + + if (i == batch_size) { + /* Remember len, addr, pkt_nb for next iteration. + * Skipped for simplicity. + */ + break; + } + } while (len); + } + + xsk_ring_prod__submit(&xsk->tx, i); + } + +Probing for Multi-Buffer Support +-------------------------------- + +To discover if a driver supports multi-buffer AF_XDP in SKB or DRV +mode, use the XDP_FEATURES feature of netlink in linux/netdev.h to +query for NETDEV_XDP_ACT_RX_SG support. This is the same flag as for +querying for XDP multi-buffer support. If XDP supports multi-buffer in +a driver, then AF_XDP will also support that in SKB and DRV mode. + +To discover if a driver supports multi-buffer AF_XDP in zero-copy +mode, use XDP_FEATURES and first check the NETDEV_XDP_ACT_XSK_ZEROCOPY +flag. If it is set, it means that at least zero-copy is supported and +you should go and check the netlink attribute +NETDEV_A_DEV_XDP_ZC_MAX_SEGS in linux/netdev.h. An unsigned integer +value will be returned stating the max number of frags that are +supported by this device in zero-copy mode. These are the possible +return values: + +1: Multi-buffer for zero-copy is not supported by this device, as max + one fragment supported means that multi-buffer is not possible. + +>=2: Multi-buffer is supported in zero-copy mode for this device. The + returned number signifies the max number of frags supported. + +For an example on how these are used through libbpf, please take a +look at tools/testing/selftests/bpf/xskxceiver.c. + +Multi-Buffer Support for Zero-Copy Drivers +------------------------------------------ + +Zero-copy drivers usually use the batched APIs for Rx and Tx +processing. Note that the Tx batch API guarantees that it will provide +a batch of Tx descriptors that ends with full packet at the end. This +to facilitate extending a zero-copy driver with multi-buffer support. + Sample application ================== diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 29ad1797adcea..982ae70c51e8c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -3585,11 +3585,6 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) if (ring->xsk_pool) { ring->rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool); - /* For AF_XDP ZC, we disallow packets to span on - * multiple buffers, thus letting us skip that - * handling in the fast-path. - */ - chain_len = 1; ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, MEM_TYPE_XSK_BUFF_POOL, NULL); @@ -13822,6 +13817,7 @@ static int i40e_config_netdev(struct i40e_vsi *vsi) NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_XSK_ZEROCOPY | NETDEV_XDP_ACT_RX_SG; + netdev->xdp_zc_max_segs = I40E_MAX_BUFFER_TXD; } else { /* Relate the VSI_VMDQ name to the VSI_MAIN name. Note that we * are still limited by IFNAMSIZ, but we're adding 'v%d\0' to diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 8b8bf4880faa6..0b3a27f118fb9 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2284,8 +2284,8 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring, * If the buffer is an EOP buffer, this function exits returning false, * otherwise return true indicating that this is in fact a non-EOP buffer. */ -static bool i40e_is_non_eop(struct i40e_ring *rx_ring, - union i40e_rx_desc *rx_desc) +bool i40e_is_non_eop(struct i40e_ring *rx_ring, + union i40e_rx_desc *rx_desc) { /* if we are the last buffer then there is nothing else to do */ #define I40E_RXD_EOF BIT(I40E_RX_DESC_STATUS_EOF_SHIFT) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h index 8c3d24012c54c..900b0d9ede9f5 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h @@ -473,6 +473,8 @@ int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size); bool __i40e_chk_linearize(struct sk_buff *skb); int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags); +bool i40e_is_non_eop(struct i40e_ring *rx_ring, + union i40e_rx_desc *rx_desc); /** * i40e_get_head - Retrieve head from head writeback diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 05ec1181471ea..37f41c8a682fb 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -294,8 +294,14 @@ static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring, { unsigned int totalsize = xdp->data_end - xdp->data_meta; unsigned int metasize = xdp->data - xdp->data_meta; + struct skb_shared_info *sinfo = NULL; struct sk_buff *skb; + u32 nr_frags = 0; + if (unlikely(xdp_buff_has_frags(xdp))) { + sinfo = xdp_get_shared_info_from_buff(xdp); + nr_frags = sinfo->nr_frags; + } net_prefetch(xdp->data_meta); /* allocate a skb to store the frags */ @@ -312,6 +318,28 @@ static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring, __skb_pull(skb, metasize); } + if (likely(!xdp_buff_has_frags(xdp))) + goto out; + + for (int i = 0; i < nr_frags; i++) { + struct skb_shared_info *skinfo = skb_shinfo(skb); + skb_frag_t *frag = &sinfo->frags[i]; + struct page *page; + void *addr; + + page = dev_alloc_page(); + if (!page) { + dev_kfree_skb(skb); + return NULL; + } + addr = page_to_virt(page); + + memcpy(addr, skb_frag_page(frag), skb_frag_size(frag)); + + __skb_fill_page_desc_noacc(skinfo, skinfo->nr_frags++, + addr, 0, skb_frag_size(frag)); + } + out: xsk_buff_free(xdp); return skb; @@ -322,14 +350,13 @@ static void i40e_handle_xdp_result_zc(struct i40e_ring *rx_ring, union i40e_rx_desc *rx_desc, unsigned int *rx_packets, unsigned int *rx_bytes, - unsigned int size, unsigned int xdp_res, bool *failure) { struct sk_buff *skb; *rx_packets = 1; - *rx_bytes = size; + *rx_bytes = xdp_get_buff_len(xdp_buff); if (likely(xdp_res == I40E_XDP_REDIR) || xdp_res == I40E_XDP_TX) return; @@ -363,7 +390,6 @@ static void i40e_handle_xdp_result_zc(struct i40e_ring *rx_ring, return; } - *rx_bytes = skb->len; i40e_process_skb_fields(rx_ring, rx_desc, skb); napi_gro_receive(&rx_ring->q_vector->napi, skb); return; @@ -374,6 +400,31 @@ static void i40e_handle_xdp_result_zc(struct i40e_ring *rx_ring, WARN_ON_ONCE(1); } +static int +i40e_add_xsk_frag(struct i40e_ring *rx_ring, struct xdp_buff *first, + struct xdp_buff *xdp, const unsigned int size) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(first); + + if (!xdp_buff_has_frags(first)) { + sinfo->nr_frags = 0; + sinfo->xdp_frags_size = 0; + xdp_buff_set_frags_flag(first); + } + + if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) { + xsk_buff_free(first); + return -ENOMEM; + } + + __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, + virt_to_page(xdp->data_hard_start), 0, size); + sinfo->xdp_frags_size += size; + xsk_buff_add_frag(xdp); + + return 0; +} + /** * i40e_clean_rx_irq_zc - Consumes Rx packets from the hardware ring * @rx_ring: Rx ring @@ -384,13 +435,18 @@ static void i40e_handle_xdp_result_zc(struct i40e_ring *rx_ring, int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) { unsigned int total_rx_bytes = 0, total_rx_packets = 0; + u16 next_to_process = rx_ring->next_to_process; u16 next_to_clean = rx_ring->next_to_clean; u16 count_mask = rx_ring->count - 1; unsigned int xdp_res, xdp_xmit = 0; + struct xdp_buff *first = NULL; struct bpf_prog *xdp_prog; bool failure = false; u16 cleaned_count; + if (next_to_process != next_to_clean) + first = *i40e_rx_bi(rx_ring, next_to_clean); + /* NB! xdp_prog will always be !NULL, due to the fact that * this path is enabled by setting an XDP program. */ @@ -404,7 +460,7 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) unsigned int size; u64 qword; - rx_desc = I40E_RX_DESC(rx_ring, next_to_clean); + rx_desc = I40E_RX_DESC(rx_ring, next_to_process); qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); /* This memory barrier is needed to keep us from reading @@ -417,9 +473,9 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) i40e_clean_programming_status(rx_ring, rx_desc->raw.qword[0], qword); - bi = *i40e_rx_bi(rx_ring, next_to_clean); + bi = *i40e_rx_bi(rx_ring, next_to_process); xsk_buff_free(bi); - next_to_clean = (next_to_clean + 1) & count_mask; + next_to_process = (next_to_process + 1) & count_mask; continue; } @@ -428,22 +484,35 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) if (!size) break; - bi = *i40e_rx_bi(rx_ring, next_to_clean); + bi = *i40e_rx_bi(rx_ring, next_to_process); xsk_buff_set_size(bi, size); xsk_buff_dma_sync_for_cpu(bi, rx_ring->xsk_pool); - xdp_res = i40e_run_xdp_zc(rx_ring, bi, xdp_prog); - i40e_handle_xdp_result_zc(rx_ring, bi, rx_desc, &rx_packets, - &rx_bytes, size, xdp_res, &failure); + if (!first) + first = bi; + else if (i40e_add_xsk_frag(rx_ring, first, bi, size)) + break; + + next_to_process = (next_to_process + 1) & count_mask; + + if (i40e_is_non_eop(rx_ring, rx_desc)) + continue; + + xdp_res = i40e_run_xdp_zc(rx_ring, first, xdp_prog); + i40e_handle_xdp_result_zc(rx_ring, first, rx_desc, &rx_packets, + &rx_bytes, xdp_res, &failure); + first->flags = 0; + next_to_clean = next_to_process; if (failure) break; total_rx_packets += rx_packets; total_rx_bytes += rx_bytes; xdp_xmit |= xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR); - next_to_clean = (next_to_clean + 1) & count_mask; + first = NULL; } rx_ring->next_to_clean = next_to_clean; + rx_ring->next_to_process = next_to_process; cleaned_count = (next_to_clean - rx_ring->next_to_use - 1) & count_mask; if (cleaned_count >= I40E_RX_BUFFER_WRITE) @@ -466,6 +535,7 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) static void i40e_xmit_pkt(struct i40e_ring *xdp_ring, struct xdp_desc *desc, unsigned int *total_bytes) { + u32 cmd = I40E_TX_DESC_CMD_ICRC | xsk_is_eop_desc(desc); struct i40e_tx_desc *tx_desc; dma_addr_t dma; @@ -474,8 +544,7 @@ static void i40e_xmit_pkt(struct i40e_ring *xdp_ring, struct xdp_desc *desc, tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use++); tx_desc->buffer_addr = cpu_to_le64(dma); - tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC | I40E_TX_DESC_CMD_EOP, - 0, desc->len, 0); + tx_desc->cmd_type_offset_bsz = build_ctob(cmd, 0, desc->len, 0); *total_bytes += desc->len; } @@ -489,14 +558,14 @@ static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *des u32 i; loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) { + u32 cmd = I40E_TX_DESC_CMD_ICRC | xsk_is_eop_desc(&desc[i]); + dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr); xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc[i].len); tx_desc = I40E_TX_DESC(xdp_ring, ntu++); tx_desc->buffer_addr = cpu_to_le64(dma); - tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC | - I40E_TX_DESC_CMD_EOP, - 0, desc[i].len, 0); + tx_desc->cmd_type_offset_bsz = build_ctob(cmd, 0, desc[i].len, 0); *total_bytes += desc[i].len; } diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 4a12316f7b464..3367b8ba98518 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -408,7 +408,6 @@ static unsigned int ice_rx_offset(struct ice_rx_ring *rx_ring) */ static int ice_setup_rx_ctx(struct ice_rx_ring *ring) { - int chain_len = ICE_MAX_CHAINED_RX_BUFS; struct ice_vsi *vsi = ring->vsi; u32 rxdid = ICE_RXDID_FLEX_NIC; struct ice_rlan_ctx rlan_ctx; @@ -472,17 +471,11 @@ static int ice_setup_rx_ctx(struct ice_rx_ring *ring) */ rlan_ctx.showiv = 0; - /* For AF_XDP ZC, we disallow packets to span on - * multiple buffers, thus letting us skip that - * handling in the fast-path. - */ - if (ring->xsk_pool) - chain_len = 1; /* Max packet size for this queue - must not be set to a larger value * than 5 x DBUF */ rlan_ctx.rxmax = min_t(u32, vsi->max_frame, - chain_len * ring->rx_buf_len); + ICE_MAX_CHAINED_RX_BUFS * ring->rx_buf_len); /* Rx queue threshold in units of 64 */ rlan_ctx.lrxqthresh = 1; diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 19a5e7f3a075e..ca83379b2de12 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -3392,6 +3392,7 @@ static void ice_set_ops(struct ice_vsi *vsi) netdev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_XSK_ZEROCOPY | NETDEV_XDP_ACT_RX_SG; + netdev->xdp_zc_max_segs = ICE_MAX_BUF_TXD; } /** diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index a7fe2b4ce6552..2a3f0834e1391 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -545,19 +545,6 @@ bool ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count) return __ice_alloc_rx_bufs_zc(rx_ring, leftover); } -/** - * ice_bump_ntc - Bump the next_to_clean counter of an Rx ring - * @rx_ring: Rx ring - */ -static void ice_bump_ntc(struct ice_rx_ring *rx_ring) -{ - int ntc = rx_ring->next_to_clean + 1; - - ntc = (ntc < rx_ring->count) ? ntc : 0; - rx_ring->next_to_clean = ntc; - prefetch(ICE_RX_DESC(rx_ring, ntc)); -} - /** * ice_construct_skb_zc - Create an sk_buff from zero-copy buffer * @rx_ring: Rx ring @@ -572,8 +559,14 @@ ice_construct_skb_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp) { unsigned int totalsize = xdp->data_end - xdp->data_meta; unsigned int metasize = xdp->data - xdp->data_meta; + struct skb_shared_info *sinfo = NULL; struct sk_buff *skb; + u32 nr_frags = 0; + if (unlikely(xdp_buff_has_frags(xdp))) { + sinfo = xdp_get_shared_info_from_buff(xdp); + nr_frags = sinfo->nr_frags; + } net_prefetch(xdp->data_meta); skb = __napi_alloc_skb(&rx_ring->q_vector->napi, totalsize, @@ -589,6 +582,29 @@ ice_construct_skb_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp) __skb_pull(skb, metasize); } + if (likely(!xdp_buff_has_frags(xdp))) + goto out; + + for (int i = 0; i < nr_frags; i++) { + struct skb_shared_info *skinfo = skb_shinfo(skb); + skb_frag_t *frag = &sinfo->frags[i]; + struct page *page; + void *addr; + + page = dev_alloc_page(); + if (!page) { + dev_kfree_skb(skb); + return NULL; + } + addr = page_to_virt(page); + + memcpy(addr, skb_frag_page(frag), skb_frag_size(frag)); + + __skb_fill_page_desc_noacc(skinfo, skinfo->nr_frags++, + addr, 0, skb_frag_size(frag)); + } + +out: xsk_buff_free(xdp); return skb; } @@ -597,7 +613,7 @@ ice_construct_skb_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp) * ice_clean_xdp_irq_zc - produce AF_XDP descriptors to CQ * @xdp_ring: XDP Tx ring */ -static void ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring) +static u32 ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring) { u16 ntc = xdp_ring->next_to_clean; struct ice_tx_desc *tx_desc; @@ -619,7 +635,7 @@ static void ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring) } if (!completed_frames) - return; + return 0; if (likely(!xdp_ring->xdp_tx_active)) { xsk_frames = completed_frames; @@ -649,6 +665,8 @@ static void ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring) xdp_ring->next_to_clean -= cnt; if (xsk_frames) xsk_tx_completed(xdp_ring->xsk_pool, xsk_frames); + + return completed_frames; } /** @@ -666,37 +684,72 @@ static void ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring) static int ice_xmit_xdp_tx_zc(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring) { + struct skb_shared_info *sinfo = NULL; u32 size = xdp->data_end - xdp->data; u32 ntu = xdp_ring->next_to_use; struct ice_tx_desc *tx_desc; struct ice_tx_buf *tx_buf; - dma_addr_t dma; - - if (ICE_DESC_UNUSED(xdp_ring) < ICE_RING_QUARTER(xdp_ring)) { - ice_clean_xdp_irq_zc(xdp_ring); - if (!ICE_DESC_UNUSED(xdp_ring)) { - xdp_ring->ring_stats->tx_stats.tx_busy++; - return ICE_XDP_CONSUMED; - } + struct xdp_buff *head; + u32 nr_frags = 0; + u32 free_space; + u32 frag = 0; + + free_space = ICE_DESC_UNUSED(xdp_ring); + if (free_space < ICE_RING_QUARTER(xdp_ring)) + free_space += ice_clean_xdp_irq_zc(xdp_ring); + + if (unlikely(!free_space)) + goto busy; + + if (unlikely(xdp_buff_has_frags(xdp))) { + sinfo = xdp_get_shared_info_from_buff(xdp); + nr_frags = sinfo->nr_frags; + if (free_space < nr_frags + 1) + goto busy; } - dma = xsk_buff_xdp_get_dma(xdp); - xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, size); - - tx_buf = &xdp_ring->tx_buf[ntu]; - tx_buf->xdp = xdp; - tx_buf->type = ICE_TX_BUF_XSK_TX; tx_desc = ICE_TX_DESC(xdp_ring, ntu); - tx_desc->buf_addr = cpu_to_le64(dma); - tx_desc->cmd_type_offset_bsz = ice_build_ctob(ICE_TX_DESC_CMD_EOP, - 0, size, 0); - xdp_ring->xdp_tx_active++; + tx_buf = &xdp_ring->tx_buf[ntu]; + head = xdp; + + for (;;) { + dma_addr_t dma; + + dma = xsk_buff_xdp_get_dma(xdp); + xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, size); + + tx_buf->xdp = xdp; + tx_buf->type = ICE_TX_BUF_XSK_TX; + tx_desc->buf_addr = cpu_to_le64(dma); + tx_desc->cmd_type_offset_bsz = ice_build_ctob(0, 0, size, 0); + /* account for each xdp_buff from xsk_buff_pool */ + xdp_ring->xdp_tx_active++; + + if (++ntu == xdp_ring->count) + ntu = 0; + + if (frag == nr_frags) + break; + + tx_desc = ICE_TX_DESC(xdp_ring, ntu); + tx_buf = &xdp_ring->tx_buf[ntu]; + + xdp = xsk_buff_get_frag(head); + size = skb_frag_size(&sinfo->frags[frag]); + frag++; + } - if (++ntu == xdp_ring->count) - ntu = 0; xdp_ring->next_to_use = ntu; + /* update last descriptor from a frame with EOP */ + tx_desc->cmd_type_offset_bsz |= + cpu_to_le64(ICE_TX_DESC_CMD_EOP << ICE_TXD_QW1_CMD_S); return ICE_XDP_TX; + +busy: + xdp_ring->ring_stats->tx_stats.tx_busy++; + + return ICE_XDP_CONSUMED; } /** @@ -752,6 +805,34 @@ ice_run_xdp_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, return result; } +static int +ice_add_xsk_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *first, + struct xdp_buff *xdp, const unsigned int size) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(first); + + if (!size) + return 0; + + if (!xdp_buff_has_frags(first)) { + sinfo->nr_frags = 0; + sinfo->xdp_frags_size = 0; + xdp_buff_set_frags_flag(first); + } + + if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) { + xsk_buff_free(first); + return -ENOMEM; + } + + __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, + virt_to_page(xdp->data_hard_start), 0, size); + sinfo->xdp_frags_size += size; + xsk_buff_add_frag(xdp); + + return 0; +} + /** * ice_clean_rx_irq_zc - consumes packets from the hardware ring * @rx_ring: AF_XDP Rx ring @@ -762,9 +843,14 @@ ice_run_xdp_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget) { unsigned int total_rx_bytes = 0, total_rx_packets = 0; + struct xsk_buff_pool *xsk_pool = rx_ring->xsk_pool; + u32 ntc = rx_ring->next_to_clean; + u32 ntu = rx_ring->next_to_use; + struct xdp_buff *first = NULL; struct ice_tx_ring *xdp_ring; unsigned int xdp_xmit = 0; struct bpf_prog *xdp_prog; + u32 cnt = rx_ring->count; bool failure = false; int entries_to_alloc; @@ -774,6 +860,9 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget) xdp_prog = READ_ONCE(rx_ring->xdp_prog); xdp_ring = rx_ring->xdp_ring; + if (ntc != rx_ring->first_desc) + first = *ice_xdp_buf(rx_ring, rx_ring->first_desc); + while (likely(total_rx_packets < (unsigned int)budget)) { union ice_32b_rx_flex_desc *rx_desc; unsigned int size, xdp_res = 0; @@ -783,7 +872,7 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget) u16 vlan_tag = 0; u16 rx_ptype; - rx_desc = ICE_RX_DESC(rx_ring, rx_ring->next_to_clean); + rx_desc = ICE_RX_DESC(rx_ring, ntc); stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_DD_S); if (!ice_test_staterr(rx_desc->wb.status_error0, stat_err_bits)) @@ -795,51 +884,61 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget) */ dma_rmb(); - if (unlikely(rx_ring->next_to_clean == rx_ring->next_to_use)) + if (unlikely(ntc == ntu)) break; - xdp = *ice_xdp_buf(rx_ring, rx_ring->next_to_clean); + xdp = *ice_xdp_buf(rx_ring, ntc); size = le16_to_cpu(rx_desc->wb.pkt_len) & ICE_RX_FLX_DESC_PKT_LEN_M; - if (!size) { - xdp->data = NULL; - xdp->data_end = NULL; - xdp->data_hard_start = NULL; - xdp->data_meta = NULL; - goto construct_skb; - } xsk_buff_set_size(xdp, size); - xsk_buff_dma_sync_for_cpu(xdp, rx_ring->xsk_pool); + xsk_buff_dma_sync_for_cpu(xdp, xsk_pool); + + if (!first) { + first = xdp; + xdp_buff_clear_frags_flag(first); + } else if (ice_add_xsk_frag(rx_ring, first, xdp, size)) { + break; + } + + if (++ntc == cnt) + ntc = 0; - xdp_res = ice_run_xdp_zc(rx_ring, xdp, xdp_prog, xdp_ring); + if (ice_is_non_eop(rx_ring, rx_desc)) + continue; + + xdp_res = ice_run_xdp_zc(rx_ring, first, xdp_prog, xdp_ring); if (likely(xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR))) { xdp_xmit |= xdp_res; } else if (xdp_res == ICE_XDP_EXIT) { failure = true; + first = NULL; + rx_ring->first_desc = ntc; break; } else if (xdp_res == ICE_XDP_CONSUMED) { - xsk_buff_free(xdp); + xsk_buff_free(first); } else if (xdp_res == ICE_XDP_PASS) { goto construct_skb; } - total_rx_bytes += size; + total_rx_bytes += xdp_get_buff_len(first); total_rx_packets++; - ice_bump_ntc(rx_ring); + first = NULL; + rx_ring->first_desc = ntc; continue; construct_skb: /* XDP_PASS path */ - skb = ice_construct_skb_zc(rx_ring, xdp); + skb = ice_construct_skb_zc(rx_ring, first); if (!skb) { rx_ring->ring_stats->rx_stats.alloc_buf_failed++; break; } - ice_bump_ntc(rx_ring); + first = NULL; + rx_ring->first_desc = ntc; if (eth_skb_pad(skb)) { skb = NULL; @@ -858,18 +957,22 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget) ice_receive_skb(rx_ring, skb, vlan_tag); } - entries_to_alloc = ICE_DESC_UNUSED(rx_ring); + rx_ring->next_to_clean = ntc; + entries_to_alloc = ICE_RX_DESC_UNUSED(rx_ring); if (entries_to_alloc > ICE_RING_QUARTER(rx_ring)) failure |= !ice_alloc_rx_bufs_zc(rx_ring, entries_to_alloc); ice_finalize_xdp_rx(xdp_ring, xdp_xmit, 0); ice_update_rx_ring_stats(rx_ring, total_rx_packets, total_rx_bytes); - if (xsk_uses_need_wakeup(rx_ring->xsk_pool)) { - if (failure || rx_ring->next_to_clean == rx_ring->next_to_use) - xsk_set_rx_need_wakeup(rx_ring->xsk_pool); + if (xsk_uses_need_wakeup(xsk_pool)) { + /* ntu could have changed when allocating entries above, so + * use rx_ring value instead of stack based one + */ + if (failure || ntc == rx_ring->next_to_use) + xsk_set_rx_need_wakeup(xsk_pool); else - xsk_clear_rx_need_wakeup(rx_ring->xsk_pool); + xsk_clear_rx_need_wakeup(xsk_pool); return (int)total_rx_packets; } @@ -894,7 +997,7 @@ static void ice_xmit_pkt(struct ice_tx_ring *xdp_ring, struct xdp_desc *desc, tx_desc = ICE_TX_DESC(xdp_ring, xdp_ring->next_to_use++); tx_desc->buf_addr = cpu_to_le64(dma); - tx_desc->cmd_type_offset_bsz = ice_build_ctob(ICE_TX_DESC_CMD_EOP, + tx_desc->cmd_type_offset_bsz = ice_build_ctob(xsk_is_eop_desc(desc), 0, desc->len, 0); *total_bytes += desc->len; @@ -921,7 +1024,7 @@ static void ice_xmit_pkt_batch(struct ice_tx_ring *xdp_ring, struct xdp_desc *de tx_desc = ICE_TX_DESC(xdp_ring, ntu++); tx_desc->buf_addr = cpu_to_le64(dma); - tx_desc->cmd_type_offset_bsz = ice_build_ctob(ICE_TX_DESC_CMD_EOP, + tx_desc->cmd_type_offset_bsz = ice_build_ctob(xsk_is_eop_desc(&descs[i]), 0, descs[i].len, 0); *total_bytes += descs[i].len; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b828c7a75be20..b12477ea4032b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2250,6 +2250,7 @@ struct net_device { #define GRO_MAX_SIZE (8 * 65535u) unsigned int gro_max_size; unsigned int gro_ipv4_max_size; + unsigned int xdp_zc_max_segs; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index e96a1151ec759..1617af3801620 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -52,6 +52,7 @@ struct xdp_sock { struct xsk_buff_pool *pool; u16 queue_id; bool zc; + bool sg; enum { XSK_READY = 0, XSK_BOUND, @@ -67,6 +68,12 @@ struct xdp_sock { u64 rx_dropped; u64 rx_queue_full; + /* When __xsk_generic_xmit() must return before it sees the EOP descriptor for the current + * packet, the partially built skb is saved here so that packet building can resume in next + * call of __xsk_generic_xmit(). + */ + struct sk_buff *skb; + struct list_head map_list; /* Protects map_list */ spinlock_t map_list_lock; diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index c243f906ebed4..1f6fc8c7a84c6 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -89,6 +89,11 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) return xp_alloc(pool); } +static inline bool xsk_is_eop_desc(struct xdp_desc *desc) +{ + return !xp_mb_desc(desc); +} + /* Returns as many entries as possible up to max. 0 <= N <= max. */ static inline u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) { @@ -103,10 +108,45 @@ static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) static inline void xsk_buff_free(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + struct list_head *xskb_list = &xskb->pool->xskb_list; + struct xdp_buff_xsk *pos, *tmp; + + if (likely(!xdp_buff_has_frags(xdp))) + goto out; + list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { + list_del(&pos->xskb_list_node); + xp_free(pos); + } + + xdp_get_shared_info_from_buff(xdp)->nr_frags = 0; +out: xp_free(xskb); } +static inline void xsk_buff_add_frag(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *frag = container_of(xdp, struct xdp_buff_xsk, xdp); + + list_add_tail(&frag->xskb_list_node, &frag->pool->xskb_list); +} + +static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) +{ + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); + struct xdp_buff *ret = NULL; + struct xdp_buff_xsk *frag; + + frag = list_first_entry_or_null(&xskb->pool->xskb_list, + struct xdp_buff_xsk, xskb_list_node); + if (frag) { + list_del(&frag->xskb_list_node); + ret = &frag->xdp; + } + + return ret; +} + static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; @@ -241,6 +281,11 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) return NULL; } +static inline bool xsk_is_eop_desc(struct xdp_desc *desc) +{ + return false; +} + static inline u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) { return 0; @@ -255,6 +300,15 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) { } +static inline void xsk_buff_add_frag(struct xdp_buff *xdp) +{ +} + +static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) +{ + return NULL; +} + static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index a8d7b8a3688a6..b0bdff26fc882 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -29,6 +29,7 @@ struct xdp_buff_xsk { struct xsk_buff_pool *pool; u64 orig_addr; struct list_head free_list_node; + struct list_head xskb_list_node; }; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) @@ -54,6 +55,7 @@ struct xsk_buff_pool { struct xdp_umem *umem; struct work_struct work; struct list_head free_list; + struct list_head xskb_list; u32 heads_cnt; u16 queue_id; @@ -184,6 +186,11 @@ static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, !(pool->dma_pages[addr >> PAGE_SHIFT] & XSK_NEXT_PG_CONTIG_MASK); } +static inline bool xp_mb_desc(struct xdp_desc *desc) +{ + return desc->options & XDP_PKT_CONTD; +} + static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr) { return addr & pool->chunk_mask; diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index a78a8096f4ce6..8d48863472b92 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -25,6 +25,12 @@ * application. */ #define XDP_USE_NEED_WAKEUP (1 << 3) +/* By setting this option, userspace application indicates that it can + * handle multiple descriptors per packet thus enabling AF_XDP to split + * multi-buffer XDP frames into multiple Rx descriptors. Without this set + * such frames will be dropped. + */ +#define XDP_USE_SG (1 << 4) /* Flags for xsk_umem_config flags */ #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) @@ -108,4 +114,11 @@ struct xdp_desc { /* UMEM descriptor is __u64 */ +/* Flag indicating that the packet continues with the buffer pointed out by the + * next frame in the ring. The end of the packet is signalled by setting this + * bit to zero. For single buffer packets, every descriptor has 'options' set + * to 0 and this maintains backward compatibility. + */ +#define XDP_PKT_CONTD (1 << 0) + #endif /* _LINUX_IF_XDP_H */ diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 639524b59930b..bf71698a1e82a 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -41,6 +41,7 @@ enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, NETDEV_A_DEV_XDP_FEATURES, + NETDEV_A_DEV_XDP_ZC_MAX_SEGS, __NETDEV_A_DEV_MAX, NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) diff --git a/net/core/dev.c b/net/core/dev.c index d6e1b786c5c52..dd4f114a7cbfe 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10613,6 +10613,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); dev->gso_max_size = GSO_LEGACY_MAX_SIZE; + dev->xdp_zc_max_segs = 1; dev->gso_max_segs = GSO_MAX_SEGS; dev->gro_max_size = GRO_LEGACY_MAX_SIZE; dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE; diff --git a/net/core/filter.c b/net/core/filter.c index 06ba0e56e3693..b4410dc841a0e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4345,13 +4345,8 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); enum bpf_map_type map_type = ri->map_type; - if (map_type == BPF_MAP_TYPE_XSKMAP) { - /* XDP_REDIRECT is not supported AF_XDP yet. */ - if (unlikely(xdp_buff_has_frags(xdp))) - return -EOPNOTSUPP; - + if (map_type == BPF_MAP_TYPE_XSKMAP) return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); - } return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp), xdp_prog); diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index a4270fafdf119..65ef4867fc495 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -25,6 +25,14 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, return -EINVAL; } + if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) { + if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, + netdev->xdp_zc_max_segs)) { + genlmsg_cancel(rsp, hdr); + return -EINVAL; + } + } + genlmsg_end(rsp, hdr); return 0; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 31dca4ecb2c53..4f1e0599146e2 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -135,14 +135,14 @@ int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, return 0; } -static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, + u32 flags) { - struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); u64 addr; int err; addr = xp_get_handle(xskb); - err = xskq_prod_reserve_desc(xs->rx, addr, len); + err = xskq_prod_reserve_desc(xs->rx, addr, len, flags); if (err) { xs->rx_queue_full++; return err; @@ -152,48 +152,138 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) return 0; } -static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len) +static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - void *from_buf, *to_buf; - u32 metalen; + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + u32 frags = xdp_buff_has_frags(xdp); + struct xdp_buff_xsk *pos, *tmp; + struct list_head *xskb_list; + u32 contd = 0; + int err; - if (unlikely(xdp_data_meta_unsupported(from))) { - from_buf = from->data; - to_buf = to->data; - metalen = 0; - } else { - from_buf = from->data_meta; - metalen = from->data - from->data_meta; - to_buf = to->data - metalen; + if (frags) + contd = XDP_PKT_CONTD; + + err = __xsk_rcv_zc(xs, xskb, len, contd); + if (err || likely(!frags)) + goto out; + + xskb_list = &xskb->pool->xskb_list; + list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { + if (list_is_singular(xskb_list)) + contd = 0; + len = pos->xdp.data_end - pos->xdp.data; + err = __xsk_rcv_zc(xs, pos, len, contd); + if (err) + return err; + list_del(&pos->xskb_list_node); } - memcpy(to_buf, from_buf, len + metalen); +out: + return err; } -static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +static void *xsk_copy_xdp_start(struct xdp_buff *from) { + if (unlikely(xdp_data_meta_unsupported(from))) + return from->data; + else + return from->data_meta; +} + +static u32 xsk_copy_xdp(void *to, void **from, u32 to_len, + u32 *from_len, skb_frag_t **frag, u32 rem) +{ + u32 copied = 0; + + while (1) { + u32 copy_len = min_t(u32, *from_len, to_len); + + memcpy(to, *from, copy_len); + copied += copy_len; + if (rem == copied) + return copied; + + if (*from_len == copy_len) { + *from = skb_frag_address(*frag); + *from_len = skb_frag_size((*frag)++); + } else { + *from += copy_len; + *from_len -= copy_len; + } + if (to_len == copy_len) + return copied; + + to_len -= copy_len; + to += copy_len; + } +} + +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +{ + u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool); + void *copy_from = xsk_copy_xdp_start(xdp), *copy_to; + u32 from_len, meta_len, rem, num_desc; + struct xdp_buff_xsk *xskb; struct xdp_buff *xsk_xdp; - int err; - u32 len; + skb_frag_t *frag; - len = xdp->data_end - xdp->data; - if (len > xsk_pool_get_rx_frame_size(xs->pool)) { - xs->rx_dropped++; - return -ENOSPC; + from_len = xdp->data_end - copy_from; + meta_len = xdp->data - copy_from; + rem = len + meta_len; + + if (len <= frame_size && !xdp_buff_has_frags(xdp)) { + int err; + + xsk_xdp = xsk_buff_alloc(xs->pool); + if (!xsk_xdp) { + xs->rx_dropped++; + return -ENOMEM; + } + memcpy(xsk_xdp->data - meta_len, copy_from, rem); + xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); + err = __xsk_rcv_zc(xs, xskb, len, 0); + if (err) { + xsk_buff_free(xsk_xdp); + return err; + } + + return 0; } - xsk_xdp = xsk_buff_alloc(xs->pool); - if (!xsk_xdp) { + num_desc = (len - 1) / frame_size + 1; + + if (!xsk_buff_can_alloc(xs->pool, num_desc)) { xs->rx_dropped++; return -ENOMEM; } + if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { + xs->rx_queue_full++; + return -ENOBUFS; + } - xsk_copy_xdp(xsk_xdp, xdp, len); - err = __xsk_rcv_zc(xs, xsk_xdp, len); - if (err) { - xsk_buff_free(xsk_xdp); - return err; + if (xdp_buff_has_frags(xdp)) { + struct skb_shared_info *sinfo; + + sinfo = xdp_get_shared_info_from_buff(xdp); + frag = &sinfo->frags[0]; } + + do { + u32 to_len = frame_size + meta_len; + u32 copied; + + xsk_xdp = xsk_buff_alloc(xs->pool); + copy_to = xsk_xdp->data - meta_len; + + copied = xsk_copy_xdp(copy_to, ©_from, to_len, &from_len, &frag, rem); + rem -= copied; + + xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); + __xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0); + meta_len = 0; + } while (rem); + return 0; } @@ -215,7 +305,7 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } -static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp) +static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { if (!xsk_is_bound(xs)) return -ENXIO; @@ -223,6 +313,11 @@ static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp) if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; + if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { + xs->rx_dropped++; + return -ENOSPC; + } + sk_mark_napi_id_once_xdp(&xs->sk, xdp); return 0; } @@ -236,12 +331,13 @@ static void xsk_flush(struct xdp_sock *xs) int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { + u32 len = xdp_get_buff_len(xdp); int err; spin_lock_bh(&xs->rx_lock); - err = xsk_rcv_check(xs, xdp); + err = xsk_rcv_check(xs, xdp, len); if (!err) { - err = __xsk_rcv(xs, xdp); + err = __xsk_rcv(xs, xdp, len); xsk_flush(xs); } spin_unlock_bh(&xs->rx_lock); @@ -250,19 +346,19 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { + u32 len = xdp_get_buff_len(xdp); int err; - u32 len; - err = xsk_rcv_check(xs, xdp); + err = xsk_rcv_check(xs, xdp, len); if (err) return err; if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { len = xdp->data_end - xdp->data; - return __xsk_rcv_zc(xs, xdp, len); + return xsk_rcv_zc(xs, xdp, len); } - err = __xsk_rcv(xs, xdp); + err = __xsk_rcv(xs, xdp, len); if (!err) xdp_return_buff(xdp); return err; @@ -321,7 +417,8 @@ bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) rcu_read_lock(); list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { if (!xskq_cons_peek_desc(xs->tx, desc, pool)) { - xs->tx->queue_empty_descs++; + if (xskq_has_descs(xs->tx)) + xskq_cons_release(xs->tx); continue; } @@ -408,37 +505,91 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags) return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); } -static void xsk_destruct_skb(struct sk_buff *skb) +static int xsk_cq_reserve_addr_locked(struct xdp_sock *xs, u64 addr) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&xs->pool->cq_lock, flags); + ret = xskq_prod_reserve_addr(xs->pool->cq, addr); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + + return ret; +} + +static void xsk_cq_submit_locked(struct xdp_sock *xs, u32 n) { - u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg; - struct xdp_sock *xs = xdp_sk(skb->sk); unsigned long flags; spin_lock_irqsave(&xs->pool->cq_lock, flags); - xskq_prod_submit_addr(xs->pool->cq, addr); + xskq_prod_submit_n(xs->pool->cq, n); spin_unlock_irqrestore(&xs->pool->cq_lock, flags); +} + +static void xsk_cq_cancel_locked(struct xdp_sock *xs, u32 n) +{ + unsigned long flags; + spin_lock_irqsave(&xs->pool->cq_lock, flags); + xskq_prod_cancel_n(xs->pool->cq, n); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); +} + +static u32 xsk_get_num_desc(struct sk_buff *skb) +{ + return skb ? (long)skb_shinfo(skb)->destructor_arg : 0; +} + +static void xsk_destruct_skb(struct sk_buff *skb) +{ + xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb)); sock_wfree(skb); } +static void xsk_set_destructor_arg(struct sk_buff *skb) +{ + long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1; + + skb_shinfo(skb)->destructor_arg = (void *)num; +} + +static void xsk_consume_skb(struct sk_buff *skb) +{ + struct xdp_sock *xs = xdp_sk(skb->sk); + + skb->destructor = sock_wfree; + xsk_cq_cancel_locked(xs, xsk_get_num_desc(skb)); + /* Free skb without triggering the perf drop trace */ + consume_skb(skb); + xs->skb = NULL; +} + +static void xsk_drop_skb(struct sk_buff *skb) +{ + xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb); + xsk_consume_skb(skb); +} + static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, struct xdp_desc *desc) { struct xsk_buff_pool *pool = xs->pool; u32 hr, len, ts, offset, copy, copied; - struct sk_buff *skb; + struct sk_buff *skb = xs->skb; struct page *page; void *buffer; int err, i; u64 addr; - hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); + if (!skb) { + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); - skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err); - if (unlikely(!skb)) - return ERR_PTR(err); + skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err); + if (unlikely(!skb)) + return ERR_PTR(err); - skb_reserve(skb, hr); + skb_reserve(skb, hr); + } addr = desc->addr; len = desc->len; @@ -448,7 +599,10 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, offset = offset_in_page(buffer); addr = buffer - pool->addrs; - for (copied = 0, i = 0; copied < len; i++) { + for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) { + if (unlikely(i >= MAX_SKB_FRAGS)) + return ERR_PTR(-EFAULT); + page = pool->umem->pgs[addr >> PAGE_SHIFT]; get_page(page); @@ -473,43 +627,77 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, struct xdp_desc *desc) { struct net_device *dev = xs->dev; - struct sk_buff *skb; + struct sk_buff *skb = xs->skb; + int err; if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { skb = xsk_build_skb_zerocopy(xs, desc); - if (IS_ERR(skb)) - return skb; + if (IS_ERR(skb)) { + err = PTR_ERR(skb); + goto free_err; + } } else { u32 hr, tr, len; void *buffer; - int err; - hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); - tr = dev->needed_tailroom; + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); len = desc->len; - skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); - if (unlikely(!skb)) - return ERR_PTR(err); + if (!skb) { + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); + tr = dev->needed_tailroom; + skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); + if (unlikely(!skb)) + goto free_err; - skb_reserve(skb, hr); - skb_put(skb, len); + skb_reserve(skb, hr); + skb_put(skb, len); - buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); - err = skb_store_bits(skb, 0, buffer, len); - if (unlikely(err)) { - kfree_skb(skb); - return ERR_PTR(err); + err = skb_store_bits(skb, 0, buffer, len); + if (unlikely(err)) + goto free_err; + } else { + int nr_frags = skb_shinfo(skb)->nr_frags; + struct page *page; + u8 *vaddr; + + if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) { + err = -EFAULT; + goto free_err; + } + + page = alloc_page(xs->sk.sk_allocation); + if (unlikely(!page)) { + err = -EAGAIN; + goto free_err; + } + + vaddr = kmap_local_page(page); + memcpy(vaddr, buffer, len); + kunmap_local(vaddr); + + skb_add_rx_frag(skb, nr_frags, page, 0, len, 0); } } skb->dev = dev; skb->priority = xs->sk.sk_priority; skb->mark = xs->sk.sk_mark; - skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr; skb->destructor = xsk_destruct_skb; + xsk_set_destructor_arg(skb); return skb; + +free_err: + if (err == -EAGAIN) { + xsk_cq_cancel_locked(xs, 1); + } else { + xsk_set_destructor_arg(skb); + xsk_drop_skb(skb); + xskq_cons_release(xs->tx); + } + + return ERR_PTR(err); } static int __xsk_generic_xmit(struct sock *sk) @@ -519,7 +707,6 @@ static int __xsk_generic_xmit(struct sock *sk) bool sent_frame = false; struct xdp_desc desc; struct sk_buff *skb; - unsigned long flags; int err = 0; mutex_lock(&xs->mutex); @@ -544,47 +731,51 @@ static int __xsk_generic_xmit(struct sock *sk) * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ - spin_lock_irqsave(&xs->pool->cq_lock, flags); - if (xskq_prod_reserve(xs->pool->cq)) { - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + if (xsk_cq_reserve_addr_locked(xs, desc.addr)) goto out; - } - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); skb = xsk_build_skb(xs, &desc); if (IS_ERR(skb)) { err = PTR_ERR(skb); - spin_lock_irqsave(&xs->pool->cq_lock, flags); - xskq_prod_cancel(xs->pool->cq); - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); - goto out; + if (err == -EAGAIN) + goto out; + err = 0; + continue; + } + + xskq_cons_release(xs->tx); + + if (xp_mb_desc(&desc)) { + xs->skb = skb; + continue; } err = __dev_direct_xmit(skb, xs->queue_id); if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ - skb->destructor = sock_wfree; - spin_lock_irqsave(&xs->pool->cq_lock, flags); - xskq_prod_cancel(xs->pool->cq); - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); - /* Free skb without triggering the perf drop trace */ - consume_skb(skb); + xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb)); + xsk_consume_skb(skb); err = -EAGAIN; goto out; } - xskq_cons_release(xs->tx); /* Ignore NET_XMIT_CN as packet might have been sent */ if (err == NET_XMIT_DROP) { /* SKB completed but not sent */ err = -EBUSY; + xs->skb = NULL; goto out; } sent_frame = true; + xs->skb = NULL; } - xs->tx->queue_empty_descs++; + if (xskq_has_descs(xs->tx)) { + if (xs->skb) + xsk_drop_skb(xs->skb); + xskq_cons_release(xs->tx); + } out: if (sent_frame) @@ -834,6 +1025,9 @@ static int xsk_release(struct socket *sock) net = sock_net(sk); + if (xs->skb) + xsk_drop_skb(xs->skb); + mutex_lock(&net->xdp.lock); sk_del_node_init_rcu(sk); mutex_unlock(&net->xdp.lock); @@ -897,7 +1091,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) flags = sxdp->sxdp_flags; if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | - XDP_USE_NEED_WAKEUP)) + XDP_USE_NEED_WAKEUP | XDP_USE_SG)) return -EINVAL; bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); @@ -929,7 +1123,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) struct socket *sock; if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || - (flags & XDP_USE_NEED_WAKEUP)) { + (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) { /* Cannot specify flags for shared sockets. */ err = -EINVAL; goto out_unlock; @@ -1028,6 +1222,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xs->dev = dev; xs->zc = xs->umem->zc; + xs->sg = !!(flags & XDP_USE_SG); xs->queue_id = qid; xp_add_xsk(xs->pool, xs); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 26f6d304451e9..b3f7b310811ed 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -86,6 +86,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, pool->umem = umem; pool->addrs = umem->addrs; INIT_LIST_HEAD(&pool->free_list); + INIT_LIST_HEAD(&pool->xskb_list); INIT_LIST_HEAD(&pool->xsk_tx_list); spin_lock_init(&pool->xsk_tx_list_lock); spin_lock_init(&pool->cq_lock); @@ -99,6 +100,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, xskb->pool = pool; xskb->xdp.frame_sz = umem->chunk_size - umem->headroom; INIT_LIST_HEAD(&xskb->free_list_node); + INIT_LIST_HEAD(&xskb->xskb_list_node); if (pool->unaligned) pool->free_heads[i] = xskb; else @@ -187,6 +189,11 @@ int xp_assign_dev(struct xsk_buff_pool *pool, goto err_unreg_pool; } + if (netdev->xdp_zc_max_segs == 1 && (flags & XDP_USE_SG)) { + err = -EOPNOTSUPP; + goto err_unreg_pool; + } + bpf.command = XDP_SETUP_XSK_POOL; bpf.xsk.pool = pool; bpf.xsk.queue_id = queue_id; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 6d40a77fccbe9..13354a1e42801 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -48,6 +48,11 @@ struct xsk_queue { size_t ring_vmalloc_size; }; +struct parsed_desc { + u32 mb; + u32 valid; +}; + /* The structure of the shared state of the rings are a simple * circular buffer, as outlined in * Documentation/core-api/circular-buffers.rst. For the Rx and @@ -130,18 +135,26 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) return false; } +static inline bool xp_unused_options_set(u32 options) +{ + return options & ~XDP_PKT_CONTD; +} + static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { u64 offset = desc->addr & (pool->chunk_size - 1); + if (!desc->len) + return false; + if (offset + desc->len > pool->chunk_size) return false; if (desc->addr >= pool->addrs_cnt) return false; - if (desc->options) + if (xp_unused_options_set(desc->options)) return false; return true; } @@ -151,6 +164,9 @@ static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, { u64 addr = xp_unaligned_add_offset_to_addr(desc->addr); + if (!desc->len) + return false; + if (desc->len > pool->chunk_size) return false; @@ -158,7 +174,7 @@ static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) return false; - if (desc->options) + if (xp_unused_options_set(desc->options)) return false; return true; } @@ -170,6 +186,11 @@ static inline bool xp_validate_desc(struct xsk_buff_pool *pool, xp_aligned_validate_desc(pool, desc); } +static inline bool xskq_has_descs(struct xsk_queue *q) +{ + return q->cached_cons != q->cached_prod; +} + static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, struct xsk_buff_pool *pool) @@ -185,17 +206,15 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xsk_buff_pool *pool) { - while (q->cached_cons != q->cached_prod) { + if (q->cached_cons != q->cached_prod) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx = q->cached_cons & q->ring_mask; *desc = ring->desc[idx]; - if (xskq_cons_is_valid_desc(q, desc, pool)) - return true; - - q->cached_cons++; + return xskq_cons_is_valid_desc(q, desc, pool); } + q->queue_empty_descs++; return false; } @@ -204,30 +223,52 @@ static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt) q->cached_cons += cnt; } -static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool, - u32 max) +static inline void parse_desc(struct xsk_queue *q, struct xsk_buff_pool *pool, + struct xdp_desc *desc, struct parsed_desc *parsed) +{ + parsed->valid = xskq_cons_is_valid_desc(q, desc, pool); + parsed->mb = xp_mb_desc(desc); +} + +static inline +u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool, + u32 max) { u32 cached_cons = q->cached_cons, nb_entries = 0; struct xdp_desc *descs = pool->tx_descs; + u32 total_descs = 0, nr_frags = 0; + /* track first entry, if stumble upon *any* invalid descriptor, rewind + * current packet that consists of frags and stop the processing + */ while (cached_cons != q->cached_prod && nb_entries < max) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx = cached_cons & q->ring_mask; + struct parsed_desc parsed; descs[nb_entries] = ring->desc[idx]; - if (unlikely(!xskq_cons_is_valid_desc(q, &descs[nb_entries], pool))) { - /* Skip the entry */ - cached_cons++; - continue; + cached_cons++; + parse_desc(q, pool, &descs[nb_entries], &parsed); + if (unlikely(!parsed.valid)) + break; + + if (likely(!parsed.mb)) { + total_descs += (nr_frags + 1); + nr_frags = 0; + } else { + nr_frags++; + if (nr_frags == pool->netdev->xdp_zc_max_segs) { + nr_frags = 0; + break; + } } - nb_entries++; - cached_cons++; } + cached_cons -= nr_frags; /* Release valid plus any invalid entries */ xskq_cons_release_n(q, cached_cons - q->cached_cons); - return nb_entries; + return total_descs; } /* Functions for consumers */ @@ -292,6 +333,11 @@ static inline void xskq_cons_release(struct xsk_queue *q) q->cached_cons++; } +static inline void xskq_cons_cancel_n(struct xsk_queue *q, u32 cnt) +{ + q->cached_cons -= cnt; +} + static inline u32 xskq_cons_present_entries(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ @@ -319,9 +365,9 @@ static inline bool xskq_prod_is_full(struct xsk_queue *q) return xskq_prod_nb_free(q, 1) ? false : true; } -static inline void xskq_prod_cancel(struct xsk_queue *q) +static inline void xskq_prod_cancel_n(struct xsk_queue *q, u32 cnt) { - q->cached_prod--; + q->cached_prod -= cnt; } static inline int xskq_prod_reserve(struct xsk_queue *q) @@ -360,7 +406,7 @@ static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_de } static inline int xskq_prod_reserve_desc(struct xsk_queue *q, - u64 addr, u32 len) + u64 addr, u32 len, u32 flags) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx; @@ -372,6 +418,7 @@ static inline int xskq_prod_reserve_desc(struct xsk_queue *q, idx = q->cached_prod++ & q->ring_mask; ring->desc[idx].addr = addr; ring->desc[idx].len = len; + ring->desc[idx].options = flags; return 0; } @@ -386,16 +433,6 @@ static inline void xskq_prod_submit(struct xsk_queue *q) __xskq_prod_submit(q, q->cached_prod); } -static inline void xskq_prod_submit_addr(struct xsk_queue *q, u64 addr) -{ - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - u32 idx = q->ring->producer; - - ring->desc[idx++ & q->ring_mask] = addr; - - __xskq_prod_submit(q, idx); -} - static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries) { __xskq_prod_submit(q, q->ring->producer + nb_entries); diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h index a78a8096f4ce6..73a47da885dc1 100644 --- a/tools/include/uapi/linux/if_xdp.h +++ b/tools/include/uapi/linux/if_xdp.h @@ -25,6 +25,12 @@ * application. */ #define XDP_USE_NEED_WAKEUP (1 << 3) +/* By setting this option, userspace application indicates that it can + * handle multiple descriptors per packet thus enabling xsk core to split + * multi-buffer XDP frames into multiple Rx descriptors. Without this set + * such frames will be dropped by xsk. + */ +#define XDP_USE_SG (1 << 4) /* Flags for xsk_umem_config flags */ #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) @@ -106,6 +112,9 @@ struct xdp_desc { __u32 options; }; +/* Flag indicating packet constitutes of multiple buffers*/ +#define XDP_PKT_CONTD (1 << 0) + /* UMEM descriptor is __u64 */ #endif /* _LINUX_IF_XDP_H */ diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 639524b59930b..bf71698a1e82a 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -41,6 +41,7 @@ enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, NETDEV_A_DEV_XDP_FEATURES, + NETDEV_A_DEV_XDP_ZC_MAX_SEGS, __NETDEV_A_DEV_MAX, NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 10642ad69d763..674e5788eb108 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1105,9 +1105,10 @@ struct bpf_xdp_query_opts { __u32 skb_prog_id; /* output */ __u8 attach_mode; /* output */ __u64 feature_flags; /* output */ + __u32 xdp_zc_max_segs; /* output */ size_t :0; }; -#define bpf_xdp_query_opts__last_field feature_flags +#define bpf_xdp_query_opts__last_field xdp_zc_max_segs LIBBPF_API int bpf_xdp_attach(int ifindex, int prog_fd, __u32 flags, const struct bpf_xdp_attach_opts *opts); diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index 84dd5fa149058..090bcf6e3b3d5 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -45,6 +45,7 @@ struct xdp_id_md { struct xdp_features_md { int ifindex; + __u32 xdp_zc_max_segs; __u64 flags; }; @@ -421,6 +422,9 @@ static int parse_xdp_features(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, return NL_CONT; md->flags = libbpf_nla_getattr_u64(tb[NETDEV_A_DEV_XDP_FEATURES]); + if (tb[NETDEV_A_DEV_XDP_ZC_MAX_SEGS]) + md->xdp_zc_max_segs = + libbpf_nla_getattr_u32(tb[NETDEV_A_DEV_XDP_ZC_MAX_SEGS]); return NL_DONE; } @@ -493,6 +497,7 @@ int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts) return libbpf_err(err); opts->feature_flags = md.flags; + opts->xdp_zc_max_segs = md.xdp_zc_max_segs; skip_feature_flags: return 0; diff --git a/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c b/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c index a630c95c74716..24369f2428537 100644 --- a/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c +++ b/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c @@ -15,12 +15,12 @@ struct { static unsigned int idx; int count = 0; -SEC("xdp") int xsk_def_prog(struct xdp_md *xdp) +SEC("xdp.frags") int xsk_def_prog(struct xdp_md *xdp) { return bpf_redirect_map(&xsk, 0, XDP_DROP); } -SEC("xdp") int xsk_xdp_drop(struct xdp_md *xdp) +SEC("xdp.frags") int xsk_xdp_drop(struct xdp_md *xdp) { /* Drop every other packet */ if (idx++ % 2) @@ -29,7 +29,7 @@ SEC("xdp") int xsk_xdp_drop(struct xdp_md *xdp) return bpf_redirect_map(&xsk, 0, XDP_DROP); } -SEC("xdp") int xsk_xdp_populate_metadata(struct xdp_md *xdp) +SEC("xdp.frags") int xsk_xdp_populate_metadata(struct xdp_md *xdp) { void *data, *data_meta; struct xdp_info *meta; diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh index c2ad50f26b636..2aa5a3445056a 100755 --- a/tools/testing/selftests/bpf/test_xsk.sh +++ b/tools/testing/selftests/bpf/test_xsk.sh @@ -171,7 +171,10 @@ exec_xskxceiver if [ -z $ETH ]; then cleanup_exit ${VETH0} ${VETH1} +else + cleanup_iface ${ETH} ${MTU} fi + TEST_NAME="XSK_SELFTESTS_${VETH0}_BUSY_POLL" busy_poll=1 @@ -184,6 +187,8 @@ exec_xskxceiver if [ -z $ETH ]; then cleanup_exit ${VETH0} ${VETH1} +else + cleanup_iface ${ETH} ${MTU} fi failures=0 diff --git a/tools/testing/selftests/bpf/xsk.c b/tools/testing/selftests/bpf/xsk.c index 687d83e707f83..d9fb2b730a2cd 100644 --- a/tools/testing/selftests/bpf/xsk.c +++ b/tools/testing/selftests/bpf/xsk.c @@ -18,17 +18,19 @@ #include #include #include +#include #include #include #include #include +#include +#include #include #include #include #include #include #include -#include #include #include @@ -81,6 +83,12 @@ struct xsk_socket { int fd; }; +struct nl_mtu_req { + struct nlmsghdr nh; + struct ifinfomsg msg; + char buf[512]; +}; + int xsk_umem__fd(const struct xsk_umem *umem) { return umem ? umem->fd : -EINVAL; @@ -286,6 +294,132 @@ bool xsk_is_in_mode(u32 ifindex, int mode) return false; } +/* Lifted from netlink.c in tools/lib/bpf */ +static int netlink_recvmsg(int sock, struct msghdr *mhdr, int flags) +{ + int len; + + do { + len = recvmsg(sock, mhdr, flags); + } while (len < 0 && (errno == EINTR || errno == EAGAIN)); + + if (len < 0) + return -errno; + return len; +} + +/* Lifted from netlink.c in tools/lib/bpf */ +static int alloc_iov(struct iovec *iov, int len) +{ + void *nbuf; + + nbuf = realloc(iov->iov_base, len); + if (!nbuf) + return -ENOMEM; + + iov->iov_base = nbuf; + iov->iov_len = len; + return 0; +} + +/* Original version lifted from netlink.c in tools/lib/bpf */ +static int netlink_recv(int sock) +{ + struct iovec iov = {}; + struct msghdr mhdr = { + .msg_iov = &iov, + .msg_iovlen = 1, + }; + bool multipart = true; + struct nlmsgerr *err; + struct nlmsghdr *nh; + int len, ret; + + ret = alloc_iov(&iov, 4096); + if (ret) + goto done; + + while (multipart) { + multipart = false; + len = netlink_recvmsg(sock, &mhdr, MSG_PEEK | MSG_TRUNC); + if (len < 0) { + ret = len; + goto done; + } + + if (len > iov.iov_len) { + ret = alloc_iov(&iov, len); + if (ret) + goto done; + } + + len = netlink_recvmsg(sock, &mhdr, 0); + if (len < 0) { + ret = len; + goto done; + } + + if (len == 0) + break; + + for (nh = (struct nlmsghdr *)iov.iov_base; NLMSG_OK(nh, len); + nh = NLMSG_NEXT(nh, len)) { + if (nh->nlmsg_flags & NLM_F_MULTI) + multipart = true; + switch (nh->nlmsg_type) { + case NLMSG_ERROR: + err = (struct nlmsgerr *)NLMSG_DATA(nh); + if (!err->error) + continue; + ret = err->error; + goto done; + case NLMSG_DONE: + ret = 0; + goto done; + default: + break; + } + } + } + ret = 0; +done: + free(iov.iov_base); + return ret; +} + +int xsk_set_mtu(int ifindex, int mtu) +{ + struct nl_mtu_req req; + struct rtattr *rta; + int fd, ret; + + fd = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE); + if (fd < 0) + return fd; + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + req.nh.nlmsg_type = RTM_NEWLINK; + req.msg.ifi_family = AF_UNSPEC; + req.msg.ifi_index = ifindex; + rta = (struct rtattr *)(((char *)&req) + NLMSG_ALIGN(req.nh.nlmsg_len)); + rta->rta_type = IFLA_MTU; + rta->rta_len = RTA_LENGTH(sizeof(unsigned int)); + req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + RTA_LENGTH(sizeof(mtu)); + memcpy(RTA_DATA(rta), &mtu, sizeof(mtu)); + + ret = send(fd, &req, req.nh.nlmsg_len, 0); + if (ret < 0) { + close(fd); + return errno; + } + + ret = netlink_recv(fd); + close(fd); + return ret; +} + int xsk_attach_xdp_program(struct bpf_program *prog, int ifindex, u32 xdp_flags) { int prog_fd; diff --git a/tools/testing/selftests/bpf/xsk.h b/tools/testing/selftests/bpf/xsk.h index 8da8d557768b4..d93200fdaa8d8 100644 --- a/tools/testing/selftests/bpf/xsk.h +++ b/tools/testing/selftests/bpf/xsk.h @@ -239,6 +239,8 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, int xsk_umem__delete(struct xsk_umem *umem); void xsk_socket__delete(struct xsk_socket *xsk); +int xsk_set_mtu(int ifindex, int mtu); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/tools/testing/selftests/bpf/xsk_prereqs.sh b/tools/testing/selftests/bpf/xsk_prereqs.sh index ae697a10a0569..29175682c44d6 100755 --- a/tools/testing/selftests/bpf/xsk_prereqs.sh +++ b/tools/testing/selftests/bpf/xsk_prereqs.sh @@ -53,6 +53,13 @@ test_exit() exit 1 } +cleanup_iface() +{ + ip link set $1 mtu $2 + ip link set $1 xdp off + ip link set $1 xdpgeneric off +} + clear_configs() { [ $(ip link show $1 &>/dev/null; echo $?;) == 0 ] && diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index 218d7f694e5cf..3ff4367066401 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -49,8 +49,11 @@ * h. tests for invalid and corner case Tx descriptors so that the correct ones * are discarded and let through, respectively. * i. 2K frame size tests - * - * Total tests: 12 + * j. If multi-buffer is supported, send 9k packets divided into 3 frames + * k. If multi-buffer and huge pages are supported, send 9k packets in a single frame + * using unaligned mode + * l. If multi-buffer is supported, try various nasty combinations of descriptors to + * check if they pass the validation or not * * Flow: * ----- @@ -73,10 +76,10 @@ #include #include #include -#include #include #include #include +#include #include #include #include @@ -91,7 +94,6 @@ #include #include #include -#include #include #include "xsk_xdp_progs.skel.h" @@ -253,6 +255,8 @@ static int __xsk_configure_socket(struct xsk_socket_info *xsk, struct xsk_umem_i cfg.bind_flags = ifobject->bind_flags; if (shared) cfg.bind_flags |= XDP_SHARED_UMEM; + if (ifobject->pkt_stream && ifobject->mtu > MAX_ETH_PKT_SIZE) + cfg.bind_flags |= XDP_USE_SG; txr = ifobject->tx_on ? &xsk->tx : NULL; rxr = ifobject->rx_on ? &xsk->rx : NULL; @@ -415,6 +419,7 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, test->total_steps = 1; test->nb_sockets = 1; test->fail = false; + test->mtu = MAX_ETH_PKT_SIZE; test->xdp_prog_rx = ifobj_rx->xdp_progs->progs.xsk_def_prog; test->xskmap_rx = ifobj_rx->xdp_progs->maps.xsk; test->xdp_prog_tx = ifobj_tx->xdp_progs->progs.xsk_def_prog; @@ -468,6 +473,26 @@ static void test_spec_set_xdp_prog(struct test_spec *test, struct bpf_program *x test->xskmap_tx = xskmap_tx; } +static int test_spec_set_mtu(struct test_spec *test, int mtu) +{ + int err; + + if (test->ifobj_rx->mtu != mtu) { + err = xsk_set_mtu(test->ifobj_rx->ifindex, mtu); + if (err) + return err; + test->ifobj_rx->mtu = mtu; + } + if (test->ifobj_tx->mtu != mtu) { + err = xsk_set_mtu(test->ifobj_tx->ifindex, mtu); + if (err) + return err; + test->ifobj_tx->mtu = mtu; + } + + return 0; +} + static void pkt_stream_reset(struct pkt_stream *pkt_stream) { if (pkt_stream) @@ -533,23 +558,49 @@ static struct pkt_stream *__pkt_stream_alloc(u32 nb_pkts) return pkt_stream; } +static bool pkt_continues(u32 options) +{ + return options & XDP_PKT_CONTD; +} + static u32 ceil_u32(u32 a, u32 b) { return (a + b - 1) / b; } -static u32 pkt_nb_frags(u32 frame_size, struct pkt *pkt) +static u32 pkt_nb_frags(u32 frame_size, struct pkt_stream *pkt_stream, struct pkt *pkt) { - if (!pkt || !pkt->valid) + u32 nb_frags = 1, next_frag; + + if (!pkt) return 1; - return ceil_u32(pkt->len, frame_size); + + if (!pkt_stream->verbatim) { + if (!pkt->valid || !pkt->len) + return 1; + return ceil_u32(pkt->len, frame_size); + } + + /* Search for the end of the packet in verbatim mode */ + if (!pkt_continues(pkt->options)) + return nb_frags; + + next_frag = pkt_stream->current_pkt_nb; + pkt++; + while (next_frag++ < pkt_stream->nb_pkts) { + nb_frags++; + if (!pkt_continues(pkt->options) || !pkt->valid) + break; + pkt++; + } + return nb_frags; } static void pkt_set(struct xsk_umem_info *umem, struct pkt *pkt, int offset, u32 len) { pkt->offset = offset; pkt->len = len; - if (len > umem->frame_size - XDP_PACKET_HEADROOM - MIN_PKT_SIZE * 2 - umem->frame_headroom) + if (len > MAX_ETH_JUMBO_SIZE) pkt->valid = false; else pkt->valid = true; @@ -637,6 +688,11 @@ static u64 pkt_get_addr(struct pkt *pkt, struct xsk_umem_info *umem) return pkt->offset + umem_alloc_buffer(umem); } +static void pkt_stream_cancel(struct pkt_stream *pkt_stream) +{ + pkt_stream->current_pkt_nb--; +} + static void pkt_generate(struct ifobject *ifobject, u64 addr, u32 len, u32 pkt_nb, u32 bytes_written) { @@ -657,34 +713,59 @@ static void pkt_generate(struct ifobject *ifobject, u64 addr, u32 len, u32 pkt_n write_payload(data, pkt_nb, bytes_written, len); } -static void __pkt_stream_generate_custom(struct ifobject *ifobj, - struct pkt *pkts, u32 nb_pkts) +static struct pkt_stream *__pkt_stream_generate_custom(struct ifobject *ifobj, struct pkt *frames, + u32 nb_frames, bool verbatim) { + u32 i, len = 0, pkt_nb = 0, payload = 0; struct pkt_stream *pkt_stream; - u32 i; - pkt_stream = __pkt_stream_alloc(nb_pkts); + pkt_stream = __pkt_stream_alloc(nb_frames); if (!pkt_stream) exit_with_error(ENOMEM); - for (i = 0; i < nb_pkts; i++) { - struct pkt *pkt = &pkt_stream->pkts[i]; + for (i = 0; i < nb_frames; i++) { + struct pkt *pkt = &pkt_stream->pkts[pkt_nb]; + struct pkt *frame = &frames[i]; - pkt->offset = pkts[i].offset; - pkt->len = pkts[i].len; - pkt->pkt_nb = i; - pkt->valid = pkts[i].valid; - if (pkt->len > pkt_stream->max_pkt_len) + pkt->offset = frame->offset; + if (verbatim) { + *pkt = *frame; + pkt->pkt_nb = payload; + if (!frame->valid || !pkt_continues(frame->options)) + payload++; + } else { + if (frame->valid) + len += frame->len; + if (frame->valid && pkt_continues(frame->options)) + continue; + + pkt->pkt_nb = pkt_nb; + pkt->len = len; + pkt->valid = frame->valid; + pkt->options = 0; + + len = 0; + } + + if (pkt->valid && pkt->len > pkt_stream->max_pkt_len) pkt_stream->max_pkt_len = pkt->len; + pkt_nb++; } - ifobj->pkt_stream = pkt_stream; + pkt_stream->nb_pkts = pkt_nb; + pkt_stream->verbatim = verbatim; + return pkt_stream; } static void pkt_stream_generate_custom(struct test_spec *test, struct pkt *pkts, u32 nb_pkts) { - __pkt_stream_generate_custom(test->ifobj_tx, pkts, nb_pkts); - __pkt_stream_generate_custom(test->ifobj_rx, pkts, nb_pkts); + struct pkt_stream *pkt_stream; + + pkt_stream = __pkt_stream_generate_custom(test->ifobj_tx, pkts, nb_pkts, true); + test->ifobj_tx->pkt_stream = pkt_stream; + + pkt_stream = __pkt_stream_generate_custom(test->ifobj_rx, pkts, nb_pkts, false); + test->ifobj_rx->pkt_stream = pkt_stream; } static void pkt_print_data(u32 *data, u32 cnt) @@ -765,43 +846,76 @@ static bool is_metadata_correct(struct pkt *pkt, void *buffer, u64 addr) return true; } -static bool is_pkt_valid(struct pkt *pkt, void *buffer, u64 addr, u32 len) +static bool is_frag_valid(struct xsk_umem_info *umem, u64 addr, u32 len, u32 expected_pkt_nb, + u32 bytes_processed) { - void *data = xsk_umem__get_data(buffer, addr); - u32 seqnum, pkt_data; + u32 seqnum, pkt_nb, *pkt_data, words_to_end, expected_seqnum; + void *data = xsk_umem__get_data(umem->buffer, addr); - if (!pkt) { - ksft_print_msg("[%s] too many packets received\n", __func__); - goto error; + addr -= umem->base_addr; + + if (addr >= umem->num_frames * umem->frame_size || + addr + len > umem->num_frames * umem->frame_size) { + ksft_print_msg("Frag invalid addr: %llx len: %u\n", addr, len); + return false; + } + if (!umem->unaligned_mode && addr % umem->frame_size + len > umem->frame_size) { + ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", addr, len); + return false; } - if (len < MIN_PKT_SIZE || pkt->len < MIN_PKT_SIZE) { - /* Do not try to verify packets that are smaller than minimum size. */ - return true; + pkt_data = data; + if (!bytes_processed) { + pkt_data += PKT_HDR_SIZE / sizeof(*pkt_data); + len -= PKT_HDR_SIZE; + } else { + bytes_processed -= PKT_HDR_SIZE; } - if (pkt->len != len) { - ksft_print_msg("[%s] expected length [%d], got length [%d]\n", - __func__, pkt->len, len); + expected_seqnum = bytes_processed / sizeof(*pkt_data); + seqnum = ntohl(*pkt_data) & 0xffff; + pkt_nb = ntohl(*pkt_data) >> 16; + + if (expected_pkt_nb != pkt_nb) { + ksft_print_msg("[%s] expected pkt_nb [%u], got pkt_nb [%u]\n", + __func__, expected_pkt_nb, pkt_nb); + goto error; + } + if (expected_seqnum != seqnum) { + ksft_print_msg("[%s] expected seqnum at start [%u], got seqnum [%u]\n", + __func__, expected_seqnum, seqnum); goto error; } - pkt_data = ntohl(*((u32 *)(data + PKT_HDR_SIZE))); - seqnum = pkt_data >> 16; - - if (pkt->pkt_nb != seqnum) { - ksft_print_msg("[%s] expected seqnum [%d], got seqnum [%d]\n", - __func__, pkt->pkt_nb, seqnum); + words_to_end = len / sizeof(*pkt_data) - 1; + pkt_data += words_to_end; + seqnum = ntohl(*pkt_data) & 0xffff; + expected_seqnum += words_to_end; + if (expected_seqnum != seqnum) { + ksft_print_msg("[%s] expected seqnum at end [%u], got seqnum [%u]\n", + __func__, expected_seqnum, seqnum); goto error; } return true; error: - pkt_dump(data, len, true); + pkt_dump(data, len, !bytes_processed); return false; } +static bool is_pkt_valid(struct pkt *pkt, void *buffer, u64 addr, u32 len) +{ + if (pkt->len != len) { + ksft_print_msg("[%s] expected packet length [%d], got length [%d]\n", + __func__, pkt->len, len); + pkt_dump(xsk_umem__get_data(buffer, addr), len, true); + return false; + } + + return true; +} + static void kick_tx(struct xsk_socket_info *xsk) { int ret; @@ -854,8 +968,8 @@ static int receive_pkts(struct test_spec *test, struct pollfd *fds) { struct timeval tv_end, tv_now, tv_timeout = {THREAD_TMOUT, 0}; struct pkt_stream *pkt_stream = test->ifobj_rx->pkt_stream; - u32 idx_rx = 0, idx_fq = 0, rcvd, i, pkts_sent = 0; struct xsk_socket_info *xsk = test->ifobj_rx->xsk; + u32 idx_rx = 0, idx_fq = 0, rcvd, pkts_sent = 0; struct ifobject *ifobj = test->ifobj_rx; struct xsk_umem_info *umem = xsk->umem; struct pkt *pkt; @@ -868,6 +982,9 @@ static int receive_pkts(struct test_spec *test, struct pollfd *fds) pkt = pkt_stream_get_next_rx_pkt(pkt_stream, &pkts_sent); while (pkt) { + u32 frags_processed = 0, nb_frags = 0, pkt_len = 0; + u64 first_addr; + ret = gettimeofday(&tv_now, NULL); if (ret) exit_with_error(errno); @@ -888,7 +1005,6 @@ static int receive_pkts(struct test_spec *test, struct pollfd *fds) ksft_print_msg("ERROR: [%s] Poll timed out\n", __func__); return TEST_FAILURE; - } if (!(fds->revents & POLLIN)) @@ -913,27 +1029,59 @@ static int receive_pkts(struct test_spec *test, struct pollfd *fds) } } - for (i = 0; i < rcvd; i++) { + while (frags_processed < rcvd) { const struct xdp_desc *desc = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++); u64 addr = desc->addr, orig; orig = xsk_umem__extract_addr(addr); addr = xsk_umem__add_offset_to_addr(addr); - if (!is_pkt_valid(pkt, umem->buffer, addr, desc->len) || + if (!pkt) { + ksft_print_msg("[%s] received too many packets addr: %lx len %u\n", + __func__, addr, desc->len); + return TEST_FAILURE; + } + + if (!is_frag_valid(umem, addr, desc->len, pkt->pkt_nb, pkt_len) || !is_offset_correct(umem, pkt, addr) || (ifobj->use_metadata && !is_metadata_correct(pkt, umem->buffer, addr))) return TEST_FAILURE; + if (!nb_frags++) + first_addr = addr; + frags_processed++; + pkt_len += desc->len; if (ifobj->use_fill_ring) *xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) = orig; + + if (pkt_continues(desc->options)) + continue; + + /* The complete packet has been received */ + if (!is_pkt_valid(pkt, umem->buffer, first_addr, pkt_len) || + !is_offset_correct(umem, pkt, addr)) + return TEST_FAILURE; + pkt = pkt_stream_get_next_rx_pkt(pkt_stream, &pkts_sent); + nb_frags = 0; + pkt_len = 0; + } + + if (nb_frags) { + /* In the middle of a packet. Start over from beginning of packet. */ + idx_rx -= nb_frags; + xsk_ring_cons__cancel(&xsk->rx, nb_frags); + if (ifobj->use_fill_ring) { + idx_fq -= nb_frags; + xsk_ring_prod__cancel(&umem->fq, nb_frags); + } + frags_processed -= nb_frags; } if (ifobj->use_fill_ring) - xsk_ring_prod__submit(&umem->fq, rcvd); + xsk_ring_prod__submit(&umem->fq, frags_processed); if (ifobj->release_rx) - xsk_ring_cons__release(&xsk->rx, rcvd); + xsk_ring_cons__release(&xsk->rx, frags_processed); pthread_mutex_lock(&pacing_mutex); pkts_in_flight -= pkts_sent; @@ -946,13 +1094,14 @@ static int receive_pkts(struct test_spec *test, struct pollfd *fds) static int __send_pkts(struct ifobject *ifobject, struct pollfd *fds, bool timeout) { + u32 i, idx = 0, valid_pkts = 0, valid_frags = 0, buffer_len; + struct pkt_stream *pkt_stream = ifobject->pkt_stream; struct xsk_socket_info *xsk = ifobject->xsk; struct xsk_umem_info *umem = ifobject->umem; - u32 i, idx = 0, valid_pkts = 0, buffer_len; bool use_poll = ifobject->use_poll; int ret; - buffer_len = pkt_get_buffer_len(umem, ifobject->pkt_stream->max_pkt_len); + buffer_len = pkt_get_buffer_len(umem, pkt_stream->max_pkt_len); /* pkts_in_flight might be negative if many invalid packets are sent */ if (pkts_in_flight >= (int)((umem_size(umem) - BATCH_SIZE * buffer_len) / buffer_len)) { kick_tx(xsk); @@ -983,17 +1132,49 @@ static int __send_pkts(struct ifobject *ifobject, struct pollfd *fds, bool timeo } for (i = 0; i < BATCH_SIZE; i++) { - struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); - struct pkt *pkt = pkt_stream_get_next_tx_pkt(ifobject->pkt_stream); + struct pkt *pkt = pkt_stream_get_next_tx_pkt(pkt_stream); + u32 nb_frags_left, nb_frags, bytes_written = 0; if (!pkt) break; - tx_desc->addr = pkt_get_addr(pkt, umem); - tx_desc->len = pkt->len; - if (pkt->valid) { + nb_frags = pkt_nb_frags(umem->frame_size, pkt_stream, pkt); + if (nb_frags > BATCH_SIZE - i) { + pkt_stream_cancel(pkt_stream); + xsk_ring_prod__cancel(&xsk->tx, BATCH_SIZE - i); + break; + } + nb_frags_left = nb_frags; + + while (nb_frags_left--) { + struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); + + tx_desc->addr = pkt_get_addr(pkt, ifobject->umem); + if (pkt_stream->verbatim) { + tx_desc->len = pkt->len; + tx_desc->options = pkt->options; + } else if (nb_frags_left) { + tx_desc->len = umem->frame_size; + tx_desc->options = XDP_PKT_CONTD; + } else { + tx_desc->len = pkt->len - bytes_written; + tx_desc->options = 0; + } + if (pkt->valid) + pkt_generate(ifobject, tx_desc->addr, tx_desc->len, pkt->pkt_nb, + bytes_written); + bytes_written += tx_desc->len; + + if (nb_frags_left) { + i++; + if (pkt_stream->verbatim) + pkt = pkt_stream_get_next_tx_pkt(pkt_stream); + } + } + + if (pkt && pkt->valid) { valid_pkts++; - pkt_generate(ifobject, tx_desc->addr, tx_desc->len, pkt->pkt_nb, 0); + valid_frags += nb_frags; } } @@ -1002,7 +1183,7 @@ static int __send_pkts(struct ifobject *ifobject, struct pollfd *fds, bool timeo pthread_mutex_unlock(&pacing_mutex); xsk_ring_prod__submit(&xsk->tx, i); - xsk->outstanding_tx += valid_pkts; + xsk->outstanding_tx += valid_frags; if (use_poll) { ret = poll(fds, 1, POLL_TMOUT); @@ -1222,7 +1403,7 @@ static void xsk_populate_fill_ring(struct xsk_umem_info *umem, struct pkt_stream u64 addr; u32 i; - for (i = 0; i < pkt_nb_frags(rx_frame_size, pkt); i++) { + for (i = 0; i < pkt_nb_frags(rx_frame_size, pkt_stream, pkt); i++) { if (!pkt) { if (!fill_up) break; @@ -1415,6 +1596,25 @@ static int __testapp_validate_traffic(struct test_spec *test, struct ifobject *i struct ifobject *ifobj2) { pthread_t t0, t1; + int err; + + if (test->mtu > MAX_ETH_PKT_SIZE) { + if (test->mode == TEST_MODE_ZC && (!ifobj1->multi_buff_zc_supp || + (ifobj2 && !ifobj2->multi_buff_zc_supp))) { + ksft_test_result_skip("Multi buffer for zero-copy not supported.\n"); + return TEST_SKIP; + } + if (test->mode != TEST_MODE_ZC && (!ifobj1->multi_buff_supp || + (ifobj2 && !ifobj2->multi_buff_supp))) { + ksft_test_result_skip("Multi buffer not supported.\n"); + return TEST_SKIP; + } + } + err = test_spec_set_mtu(test, test->mtu); + if (err) { + ksft_print_msg("Error, could not set mtu.\n"); + exit_with_error(err); + } if (ifobj2) { if (pthread_barrier_init(&barr, NULL, 2)) @@ -1616,6 +1816,16 @@ static int testapp_unaligned(struct test_spec *test) return testapp_validate_traffic(test); } +static int testapp_unaligned_mb(struct test_spec *test) +{ + test_spec_set_name(test, "UNALIGNED_MODE_9K"); + test->mtu = MAX_ETH_JUMBO_SIZE; + test->ifobj_tx->umem->unaligned_mode = true; + test->ifobj_rx->umem->unaligned_mode = true; + pkt_stream_replace(test, DEFAULT_PKT_CNT, MAX_ETH_JUMBO_SIZE); + return testapp_validate_traffic(test); +} + static int testapp_single_pkt(struct test_spec *test) { struct pkt pkts[] = {{0, MIN_PKT_SIZE, 0, true}}; @@ -1624,6 +1834,55 @@ static int testapp_single_pkt(struct test_spec *test) return testapp_validate_traffic(test); } +static int testapp_multi_buffer(struct test_spec *test) +{ + test_spec_set_name(test, "RUN_TO_COMPLETION_9K_PACKETS"); + test->mtu = MAX_ETH_JUMBO_SIZE; + pkt_stream_replace(test, DEFAULT_PKT_CNT, MAX_ETH_JUMBO_SIZE); + + return testapp_validate_traffic(test); +} + +static int testapp_invalid_desc_mb(struct test_spec *test) +{ + struct xsk_umem_info *umem = test->ifobj_tx->umem; + u64 umem_size = umem->num_frames * umem->frame_size; + struct pkt pkts[] = { + /* Valid packet for synch to start with */ + {0, MIN_PKT_SIZE, 0, true, 0}, + /* Zero frame len is not legal */ + {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + {0, 0, 0, false, 0}, + /* Invalid address in the second frame */ + {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + {umem_size, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + /* Invalid len in the middle */ + {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + {0, XSK_UMEM__INVALID_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + /* Invalid options in the middle */ + {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XSK_DESC__INVALID_OPTION}, + /* Transmit 2 frags, receive 3 */ + {0, XSK_UMEM__MAX_FRAME_SIZE, 0, true, XDP_PKT_CONTD}, + {0, XSK_UMEM__MAX_FRAME_SIZE, 0, true, 0}, + /* Middle frame crosses chunk boundary with small length */ + {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + {-MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false, 0}, + /* Valid packet for synch so that something is received */ + {0, MIN_PKT_SIZE, 0, true, 0}}; + + if (umem->unaligned_mode) { + /* Crossing a chunk boundary allowed */ + pkts[12].valid = true; + pkts[13].valid = true; + } + + test->mtu = MAX_ETH_JUMBO_SIZE; + pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts)); + return testapp_validate_traffic(test); +} + static int testapp_invalid_desc(struct test_spec *test) { struct xsk_umem_info *umem = test->ifobj_tx->umem; @@ -1690,7 +1949,6 @@ static int testapp_xdp_metadata_count(struct test_spec *test) int count = 0; int key = 0; - test_spec_set_name(test, "XDP_METADATA_COUNT"); test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_populate_metadata, skel_tx->progs.xsk_xdp_populate_metadata, skel_rx->maps.xsk, skel_tx->maps.xsk); @@ -1724,6 +1982,48 @@ static int testapp_poll_rxq_tmout(struct test_spec *test) return testapp_validate_traffic_single_thread(test, test->ifobj_rx); } +static int testapp_too_many_frags(struct test_spec *test) +{ + struct pkt pkts[2 * XSK_DESC__MAX_SKB_FRAGS + 2] = {}; + u32 max_frags, i; + + test_spec_set_name(test, "TOO_MANY_FRAGS"); + if (test->mode == TEST_MODE_ZC) + max_frags = test->ifobj_tx->xdp_zc_max_segs; + else + max_frags = XSK_DESC__MAX_SKB_FRAGS; + + test->mtu = MAX_ETH_JUMBO_SIZE; + + /* Valid packet for synch */ + pkts[0].len = MIN_PKT_SIZE; + pkts[0].valid = true; + + /* One valid packet with the max amount of frags */ + for (i = 1; i < max_frags + 1; i++) { + pkts[i].len = MIN_PKT_SIZE; + pkts[i].options = XDP_PKT_CONTD; + pkts[i].valid = true; + } + pkts[max_frags].options = 0; + + /* An invalid packet with the max amount of frags but signals packet + * continues on the last frag + */ + for (i = max_frags + 1; i < 2 * max_frags + 1; i++) { + pkts[i].len = MIN_PKT_SIZE; + pkts[i].options = XDP_PKT_CONTD; + pkts[i].valid = false; + } + + /* Valid packet for synch */ + pkts[2 * max_frags + 1].len = MIN_PKT_SIZE; + pkts[2 * max_frags + 1].valid = true; + + pkt_stream_generate_custom(test, pkts, 2 * max_frags + 2); + return testapp_validate_traffic(test); +} + static int xsk_load_xdp_programs(struct ifobject *ifobj) { ifobj->xdp_progs = xsk_xdp_progs__open_and_load(); @@ -1757,6 +2057,7 @@ static bool hugepages_present(void) static void init_iface(struct ifobject *ifobj, const char *dst_mac, const char *src_mac, thread_func_t func_ptr) { + LIBBPF_OPTS(bpf_xdp_query_opts, query_opts); int err; memcpy(ifobj->dst_mac, dst_mac, ETH_ALEN); @@ -1772,6 +2073,22 @@ static void init_iface(struct ifobject *ifobj, const char *dst_mac, const char * if (hugepages_present()) ifobj->unaligned_supp = true; + + err = bpf_xdp_query(ifobj->ifindex, XDP_FLAGS_DRV_MODE, &query_opts); + if (err) { + ksft_print_msg("Error querrying XDP capabilities\n"); + exit_with_error(-err); + } + if (query_opts.feature_flags & NETDEV_XDP_ACT_RX_SG) + ifobj->multi_buff_supp = true; + if (query_opts.feature_flags & NETDEV_XDP_ACT_XSK_ZEROCOPY) { + if (query_opts.xdp_zc_max_segs > 1) { + ifobj->multi_buff_zc_supp = true; + ifobj->xdp_zc_max_segs = query_opts.xdp_zc_max_segs; + } else { + ifobj->xdp_zc_max_segs = 0; + } + } } static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_type type) @@ -1804,6 +2121,9 @@ static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_ test_spec_set_name(test, "RUN_TO_COMPLETION"); ret = testapp_validate_traffic(test); break; + case TEST_TYPE_RUN_TO_COMPLETION_MB: + ret = testapp_multi_buffer(test); + break; case TEST_TYPE_RUN_TO_COMPLETION_SINGLE_PKT: test_spec_set_name(test, "RUN_TO_COMPLETION_SINGLE_PKT"); ret = testapp_single_pkt(test); @@ -1866,9 +2186,22 @@ static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_ ret = testapp_invalid_desc(test); break; } + case TEST_TYPE_ALIGNED_INV_DESC_MB: + test_spec_set_name(test, "ALIGNED_INV_DESC_MULTI_BUFF"); + ret = testapp_invalid_desc_mb(test); + break; + case TEST_TYPE_UNALIGNED_INV_DESC_MB: + test_spec_set_name(test, "UNALIGNED_INV_DESC_MULTI_BUFF"); + test->ifobj_tx->umem->unaligned_mode = true; + test->ifobj_rx->umem->unaligned_mode = true; + ret = testapp_invalid_desc_mb(test); + break; case TEST_TYPE_UNALIGNED: ret = testapp_unaligned(test); break; + case TEST_TYPE_UNALIGNED_MB: + ret = testapp_unaligned_mb(test); + break; case TEST_TYPE_HEADROOM: ret = testapp_headroom(test); break; @@ -1876,8 +2209,17 @@ static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_ ret = testapp_xdp_drop(test); break; case TEST_TYPE_XDP_METADATA_COUNT: + test_spec_set_name(test, "XDP_METADATA_COUNT"); + ret = testapp_xdp_metadata_count(test); + break; + case TEST_TYPE_XDP_METADATA_COUNT_MB: + test_spec_set_name(test, "XDP_METADATA_COUNT_MULTI_BUFF"); + test->mtu = MAX_ETH_JUMBO_SIZE; ret = testapp_xdp_metadata_count(test); break; + case TEST_TYPE_TOO_MANY_FRAGS: + ret = testapp_too_many_frags(test); + break; default: break; } diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h index aaf27e0676406..233b66cef64a5 100644 --- a/tools/testing/selftests/bpf/xskxceiver.h +++ b/tools/testing/selftests/bpf/xskxceiver.h @@ -38,6 +38,8 @@ #define MAX_TEARDOWN_ITER 10 #define PKT_HDR_SIZE (sizeof(struct ethhdr) + 2) /* Just to align the data in the packet */ #define MIN_PKT_SIZE 64 +#define MAX_ETH_PKT_SIZE 1518 +#define MAX_ETH_JUMBO_SIZE 9000 #define USLEEP_MAX 10000 #define SOCK_RECONF_CTR 10 #define BATCH_SIZE 64 @@ -47,7 +49,11 @@ #define DEFAULT_UMEM_BUFFERS (DEFAULT_PKT_CNT / 4) #define RX_FULL_RXQSIZE 32 #define UMEM_HEADROOM_TEST_SIZE 128 -#define XSK_UMEM__INVALID_FRAME_SIZE (XSK_UMEM__DEFAULT_FRAME_SIZE + 1) +#define XSK_UMEM__INVALID_FRAME_SIZE (MAX_ETH_JUMBO_SIZE + 1) +#define XSK_UMEM__LARGE_FRAME_SIZE (3 * 1024) +#define XSK_UMEM__MAX_FRAME_SIZE (4 * 1024) +#define XSK_DESC__INVALID_OPTION (0xffff) +#define XSK_DESC__MAX_SKB_FRAGS 18 #define HUGEPAGE_SIZE (2 * 1024 * 1024) #define PKT_DUMP_NB_TO_PRINT 16 @@ -83,6 +89,12 @@ enum test_type { TEST_TYPE_BPF_RES, TEST_TYPE_XDP_DROP_HALF, TEST_TYPE_XDP_METADATA_COUNT, + TEST_TYPE_XDP_METADATA_COUNT_MB, + TEST_TYPE_RUN_TO_COMPLETION_MB, + TEST_TYPE_UNALIGNED_MB, + TEST_TYPE_ALIGNED_INV_DESC_MB, + TEST_TYPE_UNALIGNED_INV_DESC_MB, + TEST_TYPE_TOO_MANY_FRAGS, TEST_TYPE_MAX }; @@ -115,6 +127,7 @@ struct pkt { u32 len; u32 pkt_nb; bool valid; + u16 options; }; struct pkt_stream { @@ -122,6 +135,7 @@ struct pkt_stream { u32 current_pkt_nb; struct pkt *pkts; u32 max_pkt_len; + bool verbatim; }; struct ifobject; @@ -141,7 +155,9 @@ struct ifobject { struct bpf_program *xdp_prog; enum test_mode mode; int ifindex; + int mtu; u32 bind_flags; + u32 xdp_zc_max_segs; bool tx_on; bool rx_on; bool use_poll; @@ -151,6 +167,8 @@ struct ifobject { bool shared_umem; bool use_metadata; bool unaligned_supp; + bool multi_buff_supp; + bool multi_buff_zc_supp; u8 dst_mac[ETH_ALEN]; u8 src_mac[ETH_ALEN]; }; @@ -164,6 +182,7 @@ struct test_spec { struct bpf_program *xdp_prog_tx; struct bpf_map *xskmap_rx; struct bpf_map *xskmap_tx; + int mtu; u16 total_steps; u16 current_step; u16 nb_sockets;