Skip to content

Commit

Permalink
Merge branch 'bpf-array-mmap'
Browse files Browse the repository at this point in the history
Andrii Nakryiko says:

====================
This patch set adds the ability to memory-map BPF array maps (single- and
multi-element). The primary use case is memory-mapping the BPF array maps that
libbpf creates implicitly to back global data variables. This allows for much
better usability and completely avoids syscalls when reading or updating
data.

Due to memory-mapping requirements, a BPF array map that is supposed to be
memory-mapped has to be created with the special BPF_F_MMAPABLE attribute,
which triggers a slightly different memory allocation strategy internally. See
patch 1 for details.

Libbpf is extended to detect kernel support for this flag, and if supported,
will specify it for all global data maps automatically.

Patch #1 refactors bpf_map_inc() and converts bpf_map's refcnt to atomic64_t
so that refcounting never fails. Patch #2 does a similar refactoring for
bpf_prog_add()/bpf_prog_inc().

v5->v6:
- add back uref counting (Daniel);

v4->v5:
- change bpf_prog's refcnt to atomic64_t (Daniel);

v3->v4:
- add mmap's open() callback to fix refcounting (Johannes);
- switch to remap_vmalloc_pages() instead of custom fault handler (Johannes);
- converted bpf_map's refcnt/usercnt into atomic64_t;
- provide default bpf_map_default_vmops handling open/close properly;

v2->v3:
- change allocation strategy to avoid extra pointer dereference (Jakub);

v1->v2:
- fix map lookup code generation for BPF_F_MMAPABLE case;
- prevent BPF_F_MMAPABLE flag for all but plain array map type;
- centralize ref-counting in generic bpf_map_mmap();
- don't use uref counting (Alexei);
- use vfree() directly;
- print flags with %x (Song);
- extend tests to verify bpf_map_{lookup,update}_elem() logic as well.
====================

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
  • Loading branch information
Daniel Borkmann committed Nov 18, 2019
2 parents 2893c99 + 5051b38 commit b97e12e
Show file tree
Hide file tree
Showing 25 changed files with 576 additions and 175 deletions.
9 changes: 2 additions & 7 deletions drivers/net/ethernet/broadcom/bnxt/bnxt.c
Original file line number Diff line number Diff line change
Expand Up @@ -3171,13 +3171,8 @@ static int bnxt_init_one_rx_ring(struct bnxt *bp, int ring_nr)
bnxt_init_rxbd_pages(ring, type);

if (BNXT_RX_PAGE_MODE(bp) && bp->xdp_prog) {
rxr->xdp_prog = bpf_prog_add(bp->xdp_prog, 1);
if (IS_ERR(rxr->xdp_prog)) {
int rc = PTR_ERR(rxr->xdp_prog);

rxr->xdp_prog = NULL;
return rc;
}
bpf_prog_add(bp->xdp_prog, 1);
rxr->xdp_prog = bp->xdp_prog;
}
prod = rxr->rx_prod;
for (i = 0; i < bp->rx_ring_size; i++) {
Expand Down
9 changes: 2 additions & 7 deletions drivers/net/ethernet/cavium/thunder/nicvf_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -1876,13 +1876,8 @@ static int nicvf_xdp_setup(struct nicvf *nic, struct bpf_prog *prog)

if (nic->xdp_prog) {
/* Attach BPF program */
nic->xdp_prog = bpf_prog_add(nic->xdp_prog, nic->rx_queues - 1);
if (!IS_ERR(nic->xdp_prog)) {
bpf_attached = true;
} else {
ret = PTR_ERR(nic->xdp_prog);
nic->xdp_prog = NULL;
}
bpf_prog_add(nic->xdp_prog, nic->rx_queues - 1);
bpf_attached = true;
}

/* Calculate Tx queues needed for XDP and network stack */
Expand Down
7 changes: 2 additions & 5 deletions drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
Original file line number Diff line number Diff line change
Expand Up @@ -1807,11 +1807,8 @@ static int setup_xdp(struct net_device *dev, struct bpf_prog *prog)
if (prog && !xdp_mtu_valid(priv, dev->mtu))
return -EINVAL;

if (prog) {
prog = bpf_prog_add(prog, priv->num_channels);
if (IS_ERR(prog))
return PTR_ERR(prog);
}
if (prog)
bpf_prog_add(prog, priv->num_channels);

up = netif_running(dev);
need_update = (!!priv->xdp_prog != !!prog);
Expand Down
24 changes: 6 additions & 18 deletions drivers/net/ethernet/mellanox/mlx4/en_netdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -2286,11 +2286,7 @@ int mlx4_en_try_alloc_resources(struct mlx4_en_priv *priv,
lockdep_is_held(&priv->mdev->state_lock));

if (xdp_prog && carry_xdp_prog) {
xdp_prog = bpf_prog_add(xdp_prog, tmp->rx_ring_num);
if (IS_ERR(xdp_prog)) {
mlx4_en_free_resources(tmp);
return PTR_ERR(xdp_prog);
}
bpf_prog_add(xdp_prog, tmp->rx_ring_num);
for (i = 0; i < tmp->rx_ring_num; i++)
rcu_assign_pointer(tmp->rx_ring[i]->xdp_prog,
xdp_prog);
Expand Down Expand Up @@ -2782,11 +2778,9 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
* program for a new one.
*/
if (priv->tx_ring_num[TX_XDP] == xdp_ring_num) {
if (prog) {
prog = bpf_prog_add(prog, priv->rx_ring_num - 1);
if (IS_ERR(prog))
return PTR_ERR(prog);
}
if (prog)
bpf_prog_add(prog, priv->rx_ring_num - 1);

mutex_lock(&mdev->state_lock);
for (i = 0; i < priv->rx_ring_num; i++) {
old_prog = rcu_dereference_protected(
Expand All @@ -2807,13 +2801,8 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
if (!tmp)
return -ENOMEM;

if (prog) {
prog = bpf_prog_add(prog, priv->rx_ring_num - 1);
if (IS_ERR(prog)) {
err = PTR_ERR(prog);
goto out;
}
}
if (prog)
bpf_prog_add(prog, priv->rx_ring_num - 1);

mutex_lock(&mdev->state_lock);
memcpy(&new_prof, priv->prof, sizeof(struct mlx4_en_port_profile));
Expand Down Expand Up @@ -2862,7 +2851,6 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)

unlock_out:
mutex_unlock(&mdev->state_lock);
out:
kfree(tmp);
return err;
}
Expand Down
18 changes: 5 additions & 13 deletions drivers/net/ethernet/mellanox/mlx5/core/en_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -408,12 +408,9 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
rq->stats = &c->priv->channel_stats[c->ix].rq;
INIT_WORK(&rq->recover_work, mlx5e_rq_err_cqe_work);

rq->xdp_prog = params->xdp_prog ? bpf_prog_inc(params->xdp_prog) : NULL;
if (IS_ERR(rq->xdp_prog)) {
err = PTR_ERR(rq->xdp_prog);
rq->xdp_prog = NULL;
goto err_rq_wq_destroy;
}
if (params->xdp_prog)
bpf_prog_inc(params->xdp_prog);
rq->xdp_prog = params->xdp_prog;

rq_xdp_ix = rq->ix;
if (xsk)
Expand Down Expand Up @@ -4406,16 +4403,11 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog)
/* no need for full reset when exchanging programs */
reset = (!priv->channels.params.xdp_prog || !prog);

if (was_opened && !reset) {
if (was_opened && !reset)
/* num_channels is invariant here, so we can take the
* batched reference right upfront.
*/
prog = bpf_prog_add(prog, priv->channels.num);
if (IS_ERR(prog)) {
err = PTR_ERR(prog);
goto unlock;
}
}
bpf_prog_add(prog, priv->channels.num);

if (was_opened && reset) {
struct mlx5e_channels new_channels = {};
Expand Down
4 changes: 1 addition & 3 deletions drivers/net/ethernet/netronome/nfp/bpf/offload.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,7 @@ nfp_map_ptr_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog,
/* Grab a single ref to the map for our record. The prog destroy ndo
* happens after free_used_maps().
*/
map = bpf_map_inc(map, false);
if (IS_ERR(map))
return PTR_ERR(map);
bpf_map_inc(map);

record = kmalloc(sizeof(*record), GFP_KERNEL);
if (!record) {
Expand Down
8 changes: 2 additions & 6 deletions drivers/net/ethernet/qlogic/qede/qede_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -2107,12 +2107,8 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
if (rc)
goto out;

fp->rxq->xdp_prog = bpf_prog_add(edev->xdp_prog, 1);
if (IS_ERR(fp->rxq->xdp_prog)) {
rc = PTR_ERR(fp->rxq->xdp_prog);
fp->rxq->xdp_prog = NULL;
goto out;
}
bpf_prog_add(edev->xdp_prog, 1);
fp->rxq->xdp_prog = edev->xdp_prog;
}

if (fp->type & QEDE_FASTPATH_TX) {
Expand Down
7 changes: 2 additions & 5 deletions drivers/net/virtio_net.c
Original file line number Diff line number Diff line change
Expand Up @@ -2445,11 +2445,8 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
if (!prog && !old_prog)
return 0;

if (prog) {
prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
if (IS_ERR(prog))
return PTR_ERR(prog);
}
if (prog)
bpf_prog_add(prog, vi->max_queue_pairs - 1);

/* Make sure NAPI is not using any XDP TX queues for RX. */
if (netif_running(dev)) {
Expand Down
34 changes: 18 additions & 16 deletions include/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <linux/err.h>
#include <linux/rbtree_latch.h>
#include <linux/numa.h>
#include <linux/mm_types.h>
#include <linux/wait.h>
#include <linux/u64_stats_sync.h>
#include <linux/refcount.h>
Expand Down Expand Up @@ -68,6 +69,7 @@ struct bpf_map_ops {
u64 *imm, u32 off);
int (*map_direct_value_meta)(const struct bpf_map *map,
u64 imm, u32 *off);
int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma);
};

struct bpf_map_memory {
Expand Down Expand Up @@ -96,17 +98,19 @@ struct bpf_map {
u32 btf_value_type_id;
struct btf *btf;
struct bpf_map_memory memory;
char name[BPF_OBJ_NAME_LEN];
bool unpriv_array;
bool frozen; /* write-once */
/* 48 bytes hole */
bool frozen; /* write-once; write-protected by freeze_mutex */
/* 22 bytes hole */

/* The 3rd and 4th cacheline with misc members to avoid false sharing
* particularly with refcounting.
*/
atomic_t refcnt ____cacheline_aligned;
atomic_t usercnt;
atomic64_t refcnt ____cacheline_aligned;
atomic64_t usercnt;
struct work_struct work;
char name[BPF_OBJ_NAME_LEN];
struct mutex freeze_mutex;
u64 writecnt; /* writable mmap cnt; protected by freeze_mutex */
};

static inline bool map_value_has_spin_lock(const struct bpf_map *map)
Expand Down Expand Up @@ -485,7 +489,7 @@ struct bpf_func_info_aux {
};

struct bpf_prog_aux {
atomic_t refcnt;
atomic64_t refcnt;
u32 used_map_cnt;
u32 max_ctx_offset;
u32 max_pkt_offset;
Expand Down Expand Up @@ -770,9 +774,9 @@ extern const struct bpf_verifier_ops xdp_analyzer_ops;
struct bpf_prog *bpf_prog_get(u32 ufd);
struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
bool attach_drv);
struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
void bpf_prog_add(struct bpf_prog *prog, int i);
void bpf_prog_sub(struct bpf_prog *prog, int i);
struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog);
void bpf_prog_inc(struct bpf_prog *prog);
struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog);
void bpf_prog_put(struct bpf_prog *prog);
int __bpf_prog_charge(struct user_struct *user, u32 pages);
Expand All @@ -783,9 +787,9 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock);

struct bpf_map *bpf_map_get_with_uref(u32 ufd);
struct bpf_map *__bpf_map_get(struct fd f);
struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref);
struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map,
bool uref);
void bpf_map_inc(struct bpf_map *map);
void bpf_map_inc_with_uref(struct bpf_map *map);
struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map);
void bpf_map_put_with_uref(struct bpf_map *map);
void bpf_map_put(struct bpf_map *map);
int bpf_map_charge_memlock(struct bpf_map *map, u32 pages);
Expand All @@ -795,6 +799,7 @@ void bpf_map_charge_finish(struct bpf_map_memory *mem);
void bpf_map_charge_move(struct bpf_map_memory *dst,
struct bpf_map_memory *src);
void *bpf_map_area_alloc(size_t size, int numa_node);
void *bpf_map_area_mmapable_alloc(size_t size, int numa_node);
void bpf_map_area_free(void *base);
void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);

Expand Down Expand Up @@ -912,10 +917,8 @@ static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd,
return ERR_PTR(-EOPNOTSUPP);
}

static inline struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog,
int i)
static inline void bpf_prog_add(struct bpf_prog *prog, int i)
{
return ERR_PTR(-EOPNOTSUPP);
}

static inline void bpf_prog_sub(struct bpf_prog *prog, int i)
Expand All @@ -926,9 +929,8 @@ static inline void bpf_prog_put(struct bpf_prog *prog)
{
}

static inline struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog)
static inline void bpf_prog_inc(struct bpf_prog *prog)
{
return ERR_PTR(-EOPNOTSUPP);
}

static inline struct bpf_prog *__must_check
Expand Down
1 change: 1 addition & 0 deletions include/linux/vmalloc.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ extern void *vzalloc(unsigned long size);
extern void *vmalloc_user(unsigned long size);
extern void *vmalloc_node(unsigned long size, int node);
extern void *vzalloc_node(unsigned long size, int node);
extern void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags);
extern void *vmalloc_exec(unsigned long size);
extern void *vmalloc_32(unsigned long size);
extern void *vmalloc_32_user(unsigned long size);
Expand Down
3 changes: 3 additions & 0 deletions include/uapi/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,9 @@ enum bpf_attach_type {
/* Clone map from listener for newly accepted socket */
#define BPF_F_CLONE (1U << 9)

/* Enable memory-mapping BPF map */
#define BPF_F_MMAPABLE (1U << 10)

/* flags for BPF_PROG_QUERY */
#define BPF_F_QUERY_EFFECTIVE (1U << 0)

Expand Down
Loading

0 comments on commit b97e12e

Please sign in to comment.