Skip to content

Commit

Permalink
Merge branch 'bpf-metadata-direct-access'
Browse files Browse the repository at this point in the history
Daniel Borkmann says:

====================
BPF metadata for direct access

This work enables generic transfer of metadata from XDP into skb,
meaning the packet has a flexible and programmable room for meta
data, which can later be used by BPF to set various skb members
when passing up the stack. For details, please see second patch.
Support has been implemented and tested with two drivers, and
should be straight forward to add to other drivers as well which
properly support head adjustment already.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Sep 26, 2017
2 parents 3bd3b9e + 366a88f commit 390e96e
Show file tree
Hide file tree
Showing 29 changed files with 759 additions and 104 deletions.
1 change: 1 addition & 0 deletions drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons,

xdp.data_hard_start = *data_ptr - offset;
xdp.data = *data_ptr;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = *data_ptr + *len;
orig_data = xdp.data;
mapping = rx_buf->mapping - bp->rx_dma_offset;
Expand Down
1 change: 1 addition & 0 deletions drivers/net/ethernet/cavium/thunder/nicvf_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,

xdp.data_hard_start = page_address(page);
xdp.data = (void *)cpu_addr;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + len;
orig_data = xdp.data;

Expand Down
1 change: 1 addition & 0 deletions drivers/net/ethernet/intel/i40e/i40e_txrx.c
Original file line number Diff line number Diff line change
Expand Up @@ -2107,6 +2107,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
if (!skb) {
xdp.data = page_address(rx_buffer->page) +
rx_buffer->page_offset;
xdp_set_data_meta_invalid(&xdp);
xdp.data_hard_start = xdp.data -
i40e_rx_offset(rx_ring);
xdp.data_end = xdp.data + size;
Expand Down
29 changes: 26 additions & 3 deletions drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -2133,6 +2133,21 @@ static struct sk_buff *ixgbe_construct_skb(struct ixgbe_ring *rx_ring,
#if L1_CACHE_BYTES < 128
prefetch(xdp->data + L1_CACHE_BYTES);
#endif
/* Note, we get here by enabling legacy-rx via:
*
* ethtool --set-priv-flags <dev> legacy-rx on
*
* In this mode, we currently get 0 extra XDP headroom as
* opposed to having legacy-rx off, where we process XDP
* packets going to stack via ixgbe_build_skb(). The latter
* provides us currently with 192 bytes of headroom.
*
* For ixgbe_construct_skb() mode it means that the
* xdp->data_meta will always point to xdp->data, since
* the helper cannot expand the head. Should this ever
* change in future for legacy-rx mode on, then lets also
* add xdp->data_meta handling here.
*/

/* allocate a skb to store the frags */
skb = napi_alloc_skb(&rx_ring->q_vector->napi, IXGBE_RX_HDR_SIZE);
Expand Down Expand Up @@ -2165,6 +2180,7 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
struct xdp_buff *xdp,
union ixgbe_adv_rx_desc *rx_desc)
{
unsigned int metasize = xdp->data - xdp->data_meta;
#if (PAGE_SIZE < 8192)
unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2;
#else
Expand All @@ -2174,10 +2190,14 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
#endif
struct sk_buff *skb;

/* prefetch first cache line of first page */
prefetch(xdp->data);
/* Prefetch first cache line of first page. If xdp->data_meta
* is unused, this points extactly as xdp->data, otherwise we
* likely have a consumer accessing first few bytes of meta
* data, and then actual data.
*/
prefetch(xdp->data_meta);
#if L1_CACHE_BYTES < 128
prefetch(xdp->data + L1_CACHE_BYTES);
prefetch(xdp->data_meta + L1_CACHE_BYTES);
#endif

/* build an skb to around the page buffer */
Expand All @@ -2188,6 +2208,8 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
/* update pointers within the skb to store the data */
skb_reserve(skb, xdp->data - xdp->data_hard_start);
__skb_put(skb, xdp->data_end - xdp->data);
if (metasize)
skb_metadata_set(skb, metasize);

/* record DMA address if this is the start of a chain of buffers */
if (!ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP))
Expand Down Expand Up @@ -2326,6 +2348,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
if (!skb) {
xdp.data = page_address(rx_buffer->page) +
rx_buffer->page_offset;
xdp.data_meta = xdp.data;
xdp.data_hard_start = xdp.data -
ixgbe_rx_offset(rx_ring);
xdp.data_end = xdp.data + size;
Expand Down
1 change: 1 addition & 0 deletions drivers/net/ethernet/mellanox/mlx4/en_rx.c
Original file line number Diff line number Diff line change
Expand Up @@ -762,6 +762,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud

xdp.data_hard_start = va - frags[0].page_offset;
xdp.data = va;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + length;
orig_data = xdp.data;

Expand Down
1 change: 1 addition & 0 deletions drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
Original file line number Diff line number Diff line change
Expand Up @@ -794,6 +794,7 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
return false;

xdp.data = va + *rx_headroom;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + *len;
xdp.data_hard_start = va;

Expand Down
39 changes: 15 additions & 24 deletions drivers/net/ethernet/netronome/nfp/nfp_net_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -1574,26 +1574,6 @@ nfp_net_tx_xdp_buf(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring,
return true;
}

static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, void *hard_start,
unsigned int *off, unsigned int *len)
{
struct xdp_buff xdp;
void *orig_data;
int ret;

xdp.data_hard_start = hard_start;
xdp.data = data + *off;
xdp.data_end = data + *off + *len;

orig_data = xdp.data;
ret = bpf_prog_run_xdp(prog, &xdp);

*len -= xdp.data - orig_data;
*off += xdp.data - orig_data;

return ret;
}

/**
* nfp_net_rx() - receive up to @budget packets on @rx_ring
* @rx_ring: RX ring to receive from
Expand Down Expand Up @@ -1629,6 +1609,7 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget)
struct nfp_meta_parsed meta;
struct net_device *netdev;
dma_addr_t new_dma_addr;
u32 meta_len_xdp = 0;
void *new_frag;

idx = D_IDX(rx_ring, rx_ring->rd_p);
Expand Down Expand Up @@ -1707,16 +1688,24 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget)

if (xdp_prog && !(rxd->rxd.flags & PCIE_DESC_RX_BPF &&
dp->bpf_offload_xdp) && !meta.portid) {
void *orig_data = rxbuf->frag + pkt_off;
unsigned int dma_off;
void *hard_start;
struct xdp_buff xdp;
int act;

hard_start = rxbuf->frag + NFP_NET_RX_BUF_HEADROOM;
xdp.data_hard_start = rxbuf->frag + NFP_NET_RX_BUF_HEADROOM;
xdp.data = orig_data;
xdp.data_meta = orig_data;
xdp.data_end = orig_data + pkt_len;

act = bpf_prog_run_xdp(xdp_prog, &xdp);

pkt_len -= xdp.data - orig_data;
pkt_off += xdp.data - orig_data;

act = nfp_net_run_xdp(xdp_prog, rxbuf->frag, hard_start,
&pkt_off, &pkt_len);
switch (act) {
case XDP_PASS:
meta_len_xdp = xdp.data - xdp.data_meta;
break;
case XDP_TX:
dma_off = pkt_off - NFP_NET_RX_BUF_HEADROOM;
Expand Down Expand Up @@ -1784,6 +1773,8 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget)
if (rxd->rxd.flags & PCIE_DESC_RX_VLAN)
__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
le16_to_cpu(rxd->rxd.vlan));
if (meta_len_xdp)
skb_metadata_set(skb, meta_len_xdp);

napi_gro_receive(&rx_ring->r_vec->napi, skb);
}
Expand Down
1 change: 1 addition & 0 deletions drivers/net/ethernet/qlogic/qede/qede_fp.c
Original file line number Diff line number Diff line change
Expand Up @@ -1004,6 +1004,7 @@ static bool qede_rx_xdp(struct qede_dev *edev,

xdp.data_hard_start = page_address(bd->data);
xdp.data = xdp.data_hard_start + *data_offset;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + *len;

/* Queues always have a full reset currently, so for the time
Expand Down
1 change: 1 addition & 0 deletions drivers/net/tun.c
Original file line number Diff line number Diff line change
Expand Up @@ -1468,6 +1468,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,

xdp.data_hard_start = buf;
xdp.data = buf + pad;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + len;
orig_data = xdp.data;
act = bpf_prog_run_xdp(xdp_prog, &xdp);
Expand Down
2 changes: 2 additions & 0 deletions drivers/net/virtio_net.c
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,7 @@ static struct sk_buff *receive_small(struct net_device *dev,

xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
xdp.data = xdp.data_hard_start + xdp_headroom;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + len;
orig_data = xdp.data;
act = bpf_prog_run_xdp(xdp_prog, &xdp);
Expand Down Expand Up @@ -686,6 +687,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
data = page_address(xdp_page) + offset;
xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
xdp.data = data + vi->hdr_len;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + (len - vi->hdr_len);
act = bpf_prog_run_xdp(xdp_prog, &xdp);

Expand Down
1 change: 1 addition & 0 deletions include/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ enum bpf_reg_type {
PTR_TO_MAP_VALUE, /* reg points to map element value */
PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
PTR_TO_STACK, /* reg == frame_pointer + offset */
PTR_TO_PACKET_META, /* skb->data - meta_len */
PTR_TO_PACKET, /* reg points to skb->data */
PTR_TO_PACKET_END, /* skb->data + headlen */
};
Expand Down
30 changes: 25 additions & 5 deletions include/linux/filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -487,24 +487,30 @@ struct sk_filter {

struct bpf_skb_data_end {
struct qdisc_skb_cb qdisc_cb;
void *data_meta;
void *data_end;
};

struct xdp_buff {
void *data;
void *data_end;
void *data_meta;
void *data_hard_start;
};

/* compute the linear packet data range [data, data_end) which
* will be accessed by cls_bpf, act_bpf and lwt programs
/* Compute the linear packet data range [data, data_end) which
* will be accessed by various program types (cls_bpf, act_bpf,
* lwt, ...). Subsystems allowing direct data access must (!)
* ensure that cb[] area can be written to when BPF program is
* invoked (otherwise cb[] save/restore is necessary).
*/
static inline void bpf_compute_data_end(struct sk_buff *skb)
static inline void bpf_compute_data_pointers(struct sk_buff *skb)
{
struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;

BUILD_BUG_ON(sizeof(*cb) > FIELD_SIZEOF(struct sk_buff, cb));
cb->data_end = skb->data + skb_headlen(skb);
cb->data_meta = skb->data - skb_metadata_len(skb);
cb->data_end = skb->data + skb_headlen(skb);
}

static inline u8 *bpf_skb_cb(struct sk_buff *skb)
Expand Down Expand Up @@ -725,8 +731,22 @@ int xdp_do_redirect(struct net_device *dev,
struct bpf_prog *prog);
void xdp_do_flush_map(void);

/* Drivers not supporting XDP metadata can use this helper, which
* rejects any room expansion for metadata as a result.
*/
static __always_inline void
xdp_set_data_meta_invalid(struct xdp_buff *xdp)
{
xdp->data_meta = xdp->data + 1;
}

static __always_inline bool
xdp_data_meta_unsupported(const struct xdp_buff *xdp)
{
return unlikely(xdp->data_meta > xdp->data);
}

void bpf_warn_invalid_xdp_action(u32 act);
void bpf_warn_invalid_xdp_redirect(u32 ifindex);

struct sock *do_sk_redirect_map(void);

Expand Down
68 changes: 66 additions & 2 deletions include/linux/skbuff.h
Original file line number Diff line number Diff line change
Expand Up @@ -489,8 +489,9 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
* the end of the header data, ie. at skb->end.
*/
struct skb_shared_info {
unsigned short _unused;
unsigned char nr_frags;
__u8 __unused;
__u8 meta_len;
__u8 nr_frags;
__u8 tx_flags;
unsigned short gso_size;
/* Warning: this field is not always filled in (UFO)! */
Expand Down Expand Up @@ -3400,6 +3401,69 @@ static inline ktime_t net_invalid_timestamp(void)
return 0;
}

static inline u8 skb_metadata_len(const struct sk_buff *skb)
{
return skb_shinfo(skb)->meta_len;
}

static inline void *skb_metadata_end(const struct sk_buff *skb)
{
return skb_mac_header(skb);
}

static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
const struct sk_buff *skb_b,
u8 meta_len)
{
const void *a = skb_metadata_end(skb_a);
const void *b = skb_metadata_end(skb_b);
/* Using more efficient varaiant than plain call to memcmp(). */
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
u64 diffs = 0;

switch (meta_len) {
#define __it(x, op) (x -= sizeof(u##op))
#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op))
case 32: diffs |= __it_diff(a, b, 64);
case 24: diffs |= __it_diff(a, b, 64);
case 16: diffs |= __it_diff(a, b, 64);
case 8: diffs |= __it_diff(a, b, 64);
break;
case 28: diffs |= __it_diff(a, b, 64);
case 20: diffs |= __it_diff(a, b, 64);
case 12: diffs |= __it_diff(a, b, 64);
case 4: diffs |= __it_diff(a, b, 32);
break;
}
return diffs;
#else
return memcmp(a - meta_len, b - meta_len, meta_len);
#endif
}

static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
const struct sk_buff *skb_b)
{
u8 len_a = skb_metadata_len(skb_a);
u8 len_b = skb_metadata_len(skb_b);

if (!(len_a | len_b))
return false;

return len_a != len_b ?
true : __skb_metadata_differs(skb_a, skb_b, len_a);
}

static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
{
skb_shinfo(skb)->meta_len = meta_len;
}

static inline void skb_metadata_clear(struct sk_buff *skb)
{
skb_metadata_set(skb, 0);
}

struct sk_buff *skb_clone_sk(struct sk_buff *skb);

#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
Expand Down
Loading

0 comments on commit 390e96e

Please sign in to comment.