Skip to content

Commit

Permalink
xsk: Add TX timestamp and TX checksum offload support
Browse files Browse the repository at this point in the history
This change actually defines the (initial) metadata layout
that should be used by AF_XDP userspace (xsk_tx_metadata).
The first field is flags which requests appropriate offloads,
followed by the offload-specific fields. The supported per-device
offloads are exported via netlink (new xsk-flags).

The offloads themselves are still implemented in a bit of a
framework-y fashion that's left from my initial kfunc attempt.
I'm introducing new xsk_tx_metadata_ops which drivers are
supposed to implement. The drivers are also supposed
to call xsk_tx_metadata_request/xsk_tx_metadata_complete in
the right places. Since xsk_tx_metadata_{request,complete}
are static inline, we don't incur any extra overhead doing
indirect calls.

The benefit of this scheme is as follows:
- keeps all metadata layout parsing away from driver code
- makes it easy to grep and see which drivers implement what
- no extra flags have to be maintained to keep track of which
  offloads are implemented; if the callback is implemented, the offload
  is supported (used by the netlink reporting code)

Two offloads are defined right now:
1. XDP_TXMD_FLAGS_CHECKSUM: skb-style csum_start+csum_offset
2. XDP_TXMD_FLAGS_TIMESTAMP: writes TX timestamp back into metadata
   area upon completion (tx_timestamp field)

XDP_TXMD_FLAGS_TIMESTAMP is also implemented for XDP_COPY mode: it writes
SW timestamp from the skb destructor (note I'm reusing hwtstamps to pass
metadata pointer).

The struct is forward-compatible and can be extended in the future
by appending more fields.

Reviewed-by: Song Yoong Siang <yoong.siang.song@intel.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20231127190319.1190813-3-sdf@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
  • Loading branch information
Stanislav Fomichev authored and Alexei Starovoitov committed Nov 29, 2023
1 parent 341ac98 commit 48eb03d
Show file tree
Hide file tree
Showing 15 changed files with 348 additions and 9 deletions.
19 changes: 18 additions & 1 deletion Documentation/netlink/specs/netdev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ definitions:
-
type: flags
name: xdp-rx-metadata
render-max: true
entries:
-
name: timestamp
Expand All @@ -55,6 +54,18 @@ definitions:
name: hash
doc:
Device is capable of exposing receive packet hash via bpf_xdp_metadata_rx_hash().
-
type: flags
name: xsk-flags
entries:
-
name: tx-timestamp
doc:
HW timestamping egress packets is supported by the driver.
-
name: tx-checksum
doc:
L3 checksum HW offload is supported by the driver.

attribute-sets:
-
Expand Down Expand Up @@ -86,6 +97,11 @@ attribute-sets:
See Documentation/networking/xdp-rx-metadata.rst for more details.
type: u64
enum: xdp-rx-metadata
-
name: xsk-features
doc: Bitmask of enabled AF_XDP features.
type: u64
enum: xsk-flags

operations:
list:
Expand All @@ -103,6 +119,7 @@ operations:
- xdp-features
- xdp-zc-max-segs
- xdp-rx-metadata-features
- xsk-features
dump:
reply: *dev-all
-
Expand Down
2 changes: 2 additions & 0 deletions include/linux/netdevice.h
Original file line number Diff line number Diff line change
Expand Up @@ -1865,6 +1865,7 @@ enum netdev_stat_type {
* @netdev_ops: Includes several pointers to callbacks,
* if one wants to override the ndo_*() functions
* @xdp_metadata_ops: Includes pointers to XDP metadata callbacks.
* @xsk_tx_metadata_ops: Includes pointers to AF_XDP TX metadata callbacks.
* @ethtool_ops: Management operations
* @l3mdev_ops: Layer 3 master device operations
* @ndisc_ops: Includes callbacks for different IPv6 neighbour
Expand Down Expand Up @@ -2128,6 +2129,7 @@ struct net_device {
unsigned long long priv_flags;
const struct net_device_ops *netdev_ops;
const struct xdp_metadata_ops *xdp_metadata_ops;
const struct xsk_tx_metadata_ops *xsk_tx_metadata_ops;
int ifindex;
unsigned short gflags;
unsigned short hard_header_len;
Expand Down
14 changes: 13 additions & 1 deletion include/linux/skbuff.h
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,15 @@ struct ubuf_info_msgzc {
int mm_account_pinned_pages(struct mmpin *mmp, size_t size);
void mm_unaccount_pinned_pages(struct mmpin *mmp);

/* Preserve some data across TX submission and completion.
 *
 * Note, this state is stored in the driver. Extending the layout
 * might need some special care.
 */
struct xsk_tx_metadata_compl {
/* Destination for the TX timestamp. xsk_tx_metadata_to_compl() points it
 * at the frame's completion.tx_timestamp metadata field when
 * XDP_TXMD_FLAGS_TIMESTAMP was requested and sets it to NULL otherwise.
 */
__u64 *tx_timestamp;
};

/* This data is invariant across clones and lives at
* the end of the header data, ie. at skb->end.
*/
Expand All @@ -578,7 +587,10 @@ struct skb_shared_info {
/* Warning: this field is not always filled in (UFO)! */
unsigned short gso_segs;
struct sk_buff *frag_list;
struct skb_shared_hwtstamps hwtstamps;
union {
struct skb_shared_hwtstamps hwtstamps;
struct xsk_tx_metadata_compl xsk_meta;
};
unsigned int gso_type;
u32 tskey;

Expand Down
110 changes: 110 additions & 0 deletions include/net/xdp_sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,105 @@ struct xdp_sock {
struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */
};

/*
 * AF_XDP TX metadata hooks for network devices.
 * The following hooks can be defined; unless noted otherwise, they are
 * optional and can be filled with a null pointer.
 *
 * void (*tmo_request_timestamp)(void *priv)
 *     Called when AF_XDP frame requested egress timestamp.
 *     xsk_tx_metadata_request() skips the hook when it is NULL.
 *
 * u64 (*tmo_fill_timestamp)(void *priv)
 *     Called when AF_XDP frame, that had requested egress timestamp,
 *     received a completion. The hook needs to return the actual HW timestamp.
 *     xsk_tx_metadata_complete() invokes it without checking the hook for
 *     NULL, so a driver that calls that helper has to provide it. Its
 *     presence is also what makes the netlink code report
 *     NETDEV_XSK_FLAGS_TX_TIMESTAMP.
 *
 * void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv)
 *     Called when AF_XDP frame requested HW checksum offload. csum_start
 *     indicates position where checksumming should start.
 *     csum_offset indicates position where checksum should be stored.
 *     Both values are taken verbatim from the request part of
 *     struct xsk_tx_metadata. xsk_tx_metadata_request() skips the hook when
 *     it is NULL; its presence is what makes the netlink code report
 *     NETDEV_XSK_FLAGS_TX_CHECKSUM.
 *
 * In every hook, priv is the pointer the driver handed to
 * xsk_tx_metadata_request() or xsk_tx_metadata_complete().
 */
struct xsk_tx_metadata_ops {
void (*tmo_request_timestamp)(void *priv);
u64 (*tmo_fill_timestamp)(void *priv);
void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv);
};

#ifdef CONFIG_XDP_SOCKETS

int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp);
void __xsk_map_flush(void);

/**
 * xsk_tx_metadata_to_compl - Capture what TX completion will need from
 * the AF_XDP metadata area.
 * @meta: pointer to AF_XDP metadata area, may be NULL
 * @compl: pointer to output struct xsk_tx_metadata_compl
 *
 * Network devices call this while preparing an AF_XDP egress packet. The
 * resulting @compl should be kept by the device and handed to
 * xsk_tx_metadata_complete() once the TX completion arrives.
 *
 * When @meta is NULL, @compl is left untouched.
 */
static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta,
					    struct xsk_tx_metadata_compl *compl)
{
	if (!meta)
		return;

	/* Expose the write-back slot only if a timestamp was asked for. */
	compl->tx_timestamp = (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP) ?
			      &meta->completion.tx_timestamp : NULL;
}

/**
 * xsk_tx_metadata_request - Act on the offloads requested in AF_XDP TX
 * metadata at submission time.
 * @meta: pointer to AF_XDP metadata area, may be NULL
 * @ops: pointer to struct xsk_tx_metadata_ops
 * @priv: pointer to driver-private area, passed through to the hooks
 *
 * Network devices call this while preparing an AF_XDP egress packet.
 * Each offload flagged in @meta is forwarded to the matching @ops hook;
 * an offload whose hook is NULL is ignored. Nothing happens when @meta
 * is NULL.
 */
static inline void xsk_tx_metadata_request(const struct xsk_tx_metadata *meta,
					   const struct xsk_tx_metadata_ops *ops,
					   void *priv)
{
	if (!meta)
		return;

	if (ops->tmo_request_timestamp &&
	    (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP))
		ops->tmo_request_timestamp(priv);

	if (ops->tmo_request_checksum &&
	    (meta->flags & XDP_TXMD_FLAGS_CHECKSUM))
		ops->tmo_request_checksum(meta->request.csum_start,
					  meta->request.csum_offset, priv);
}

/**
 * xsk_tx_metadata_complete - Evaluate AF_XDP TX metadata at completion
 * and call appropriate xsk_tx_metadata_ops operation.
 * @compl: pointer to completion metadata produced from
 *         xsk_tx_metadata_to_compl, may be NULL
 * @ops: pointer to struct xsk_tx_metadata_ops
 * @priv: pointer to driver-private area, passed to tmo_fill_timestamp
 *
 * This function should be called by the networking device upon
 * AF_XDP egress completion. If the frame requested a TX timestamp, the
 * value returned by @ops->tmo_fill_timestamp is stored in the frame's
 * metadata area; otherwise the call does nothing.
 */
static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl,
					    const struct xsk_tx_metadata_ops *ops,
					    void *priv)
{
	if (!compl)
		return;

	/* xsk_tx_metadata_to_compl() stores NULL here when the frame did not
	 * request a timestamp, so there is nowhere to write the result.
	 */
	if (!compl->tx_timestamp)
		return;

	*compl->tx_timestamp = ops->tmo_fill_timestamp(priv);
}

#else

static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
Expand All @@ -115,6 +208,23 @@ static inline void __xsk_map_flush(void)
{
}

/* No-op variant of xsk_tx_metadata_to_compl() for builds without
 * CONFIG_XDP_SOCKETS; @compl is not written.
 */
static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta,
struct xsk_tx_metadata_compl *compl)
{
}

/* No-op variant of xsk_tx_metadata_request() for builds without
 * CONFIG_XDP_SOCKETS. The prototype mirrors the real helper, including the
 * const-qualified @meta, so callers compile identically in both
 * configurations.
 */
static inline void xsk_tx_metadata_request(const struct xsk_tx_metadata *meta,
					   const struct xsk_tx_metadata_ops *ops,
					   void *priv)
{
}

/* No-op variant of xsk_tx_metadata_complete() for builds without
 * CONFIG_XDP_SOCKETS; no hook is called and nothing is written.
 */
static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl,
const struct xsk_tx_metadata_ops *ops,
void *priv)
{
}

#endif /* CONFIG_XDP_SOCKETS */

#if defined(CONFIG_XDP_SOCKETS) && defined(CONFIG_DEBUG_NET)
Expand Down
13 changes: 13 additions & 0 deletions include/net/xdp_sock_drv.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,14 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
return xp_raw_get_data(pool, addr);
}

/* Locate the TX metadata that belongs to the buffer at @addr.
 *
 * Returns NULL when @pool has a tx_metadata_len of zero. Otherwise the
 * result is the address resolved by xp_raw_get_data() moved back by
 * tx_metadata_len bytes.
 */
static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr)
{
	void *payload;

	if (pool->tx_metadata_len == 0)
		return NULL;

	payload = xp_raw_get_data(pool, addr);

	return payload - pool->tx_metadata_len;
}

static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool)
{
struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
Expand Down Expand Up @@ -324,6 +332,11 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
return NULL;
}

/* Stub variant: always reports that no TX metadata is available.
 * NOTE(review): presumably the !CONFIG_XDP_SOCKETS branch of this header,
 * next to the other NULL-returning stubs; confirm against the enclosing #ifdef.
 */
static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr)
{
return NULL;
}

static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool)
{
}
Expand Down
6 changes: 6 additions & 0 deletions include/net/xsk_buff_pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ struct xdp_buff_xsk {
};

#define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb))
/* Build-time check that type t is at least as large as
 * struct xsk_tx_metadata_compl, i.e. that t can hold the completion state.
 */
#define XSK_TX_COMPL_FITS(t) BUILD_BUG_ON(sizeof(struct xsk_tx_metadata_compl) > sizeof(t))

struct xsk_dma_map {
dma_addr_t *dma_pages;
Expand Down Expand Up @@ -234,4 +235,9 @@ static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb)
return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
}

/* Return true when @pool has a tx_metadata_len greater than zero, false
 * otherwise.
 */
static inline bool xp_tx_metadata_enabled(const struct xsk_buff_pool *pool)
{
	if (pool->tx_metadata_len > 0)
		return true;

	return false;
}

#endif /* XSK_BUFF_POOL_H_ */
38 changes: 38 additions & 0 deletions include/uapi/linux/if_xdp.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,41 @@ struct xdp_options {
#define XSK_UNALIGNED_BUF_ADDR_MASK \
((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1)

/* Bit for the flags field of struct xsk_tx_metadata.
 *
 * Request transmit timestamp. Upon completion, put it into tx_timestamp
 * field of struct xsk_tx_metadata (completion union member).
 */
#define XDP_TXMD_FLAGS_TIMESTAMP (1 << 0)

/* Bit for the flags field of struct xsk_tx_metadata.
 *
 * Request transmit checksum offload. Checksum start position and offset
 * are communicated via csum_start and csum_offset fields of struct
 * xsk_tx_metadata (request union member).
 */
#define XDP_TXMD_FLAGS_CHECKSUM (1 << 1)

/* AF_XDP offloads request. 'request' union member is consumed by the driver
 * when the packet is being transmitted. 'completion' union member is
 * filled by the driver when the transmit completion arrives.
 *
 * 'request' and 'completion' are members of one anonymous union and share
 * the same storage: writing completion.tx_timestamp overwrites the request
 * fields.
 */
struct xsk_tx_metadata {
/* Bitmask of XDP_TXMD_FLAGS_* values selecting the requested offloads. */
__u64 flags;

union {
struct {
/* XDP_TXMD_FLAGS_CHECKSUM */

/* Offset from desc->addr where checksumming should start. */
__u16 csum_start;
/* Offset from csum_start where checksum should be stored. */
__u16 csum_offset;
} request;

struct {
/* XDP_TXMD_FLAGS_TIMESTAMP */
__u64 tx_timestamp;
} completion;
};
};

/* Rx/Tx descriptor */
struct xdp_desc {
__u64 addr;
Expand All @@ -122,4 +157,7 @@ struct xdp_desc {
*/
#define XDP_PKT_CONTD (1 << 0)

/* TX packet carries valid metadata.
 * NOTE(review): presumably a bit for the options field of struct xdp_desc,
 * like XDP_PKT_CONTD above; confirm against the descriptor definition.
 */
#define XDP_TX_METADATA (1 << 1)

#endif /* _LINUX_IF_XDP_H */
16 changes: 16 additions & 0 deletions include/uapi/linux/netdev.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,28 @@ enum netdev_xdp_rx_metadata {
NETDEV_XDP_RX_METADATA_MASK = 3,
};

/**
 * enum netdev_xsk_flags
 * @NETDEV_XSK_FLAGS_TX_TIMESTAMP: HW timestamping egress packets is supported
 * by the driver.
 * @NETDEV_XSK_FLAGS_TX_CHECKSUM: L3 checksum HW offload is supported by the
 * driver.
 */
enum netdev_xsk_flags {
NETDEV_XSK_FLAGS_TX_TIMESTAMP = 1,
NETDEV_XSK_FLAGS_TX_CHECKSUM = 2,

/* private: */
/* Bitwise OR of every flag value defined above (1 | 2). */
NETDEV_XSK_FLAGS_MASK = 3,
};

enum {
NETDEV_A_DEV_IFINDEX = 1,
NETDEV_A_DEV_PAD,
NETDEV_A_DEV_XDP_FEATURES,
NETDEV_A_DEV_XDP_ZC_MAX_SEGS,
NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
NETDEV_A_DEV_XSK_FEATURES,

__NETDEV_A_DEV_MAX,
NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1)
Expand Down
13 changes: 12 additions & 1 deletion net/core/netdev-genl.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/xdp.h>
#include <net/xdp_sock.h>

#include "netdev-genl-gen.h"

static int
netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
const struct genl_info *info)
{
u64 xsk_features = 0;
u64 xdp_rx_meta = 0;
void *hdr;

Expand All @@ -26,11 +28,20 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
XDP_METADATA_KFUNC_xxx
#undef XDP_METADATA_KFUNC

if (netdev->xsk_tx_metadata_ops) {
if (netdev->xsk_tx_metadata_ops->tmo_fill_timestamp)
xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP;
if (netdev->xsk_tx_metadata_ops->tmo_request_checksum)
xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM;
}

if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) ||
nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES,
netdev->xdp_features, NETDEV_A_DEV_PAD) ||
nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
xdp_rx_meta, NETDEV_A_DEV_PAD)) {
xdp_rx_meta, NETDEV_A_DEV_PAD) ||
nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES,
xsk_features, NETDEV_A_DEV_PAD)) {
genlmsg_cancel(rsp, hdr);
return -EINVAL;
}
Expand Down
Loading

0 comments on commit 48eb03d

Please sign in to comment.