Skip to content

Commit

Permalink
Merge branch 'bpf-support-for-sockets'
Browse files Browse the repository at this point in the history
David Ahern says:

====================
net: Add bpf support for sockets

The recently added VRF support in Linux leverages the bind-to-device
API for programs to specify an L3 domain for a socket. While
SO_BINDTODEVICE has been around for ages, not every ipv4/ipv6 capable
program has support for it. Even for those programs that do support it,
the API requires processes to be started as root (CAP_NET_RAW) which
is not desirable from a general security perspective.

This patch set leverages Daniel Mack's work to attach bpf programs to
a cgroup to provide a capability to set sk_bound_dev_if for all
AF_INET{6} sockets opened by a process in a cgroup when the sockets
are allocated.

For example:
 1. configure vrf (e.g., using ifupdown2)
        auto eth0
        iface eth0 inet dhcp
            vrf mgmt

        auto mgmt
        iface mgmt
            vrf-table auto

 2. configure cgroup
        mount -t cgroup2 none /tmp/cgroupv2
        mkdir /tmp/cgroupv2/mgmt
        test_cgrp2_sock /tmp/cgroupv2/mgmt 15

 3. set shell into cgroup (e.g., can be done at login using pam)
        echo $$ >> /tmp/cgroupv2/mgmt/cgroup.procs

At this point all commands run in the shell (e.g., apt) have sockets
automatically bound to the VRF (see output of ss -ap 'dev == <vrf>'),
including processes not running as root.

This capability enables running any program in a VRF context and is key
to deploying Management VRF, a fundamental configuration for networking
gear, with any Linux OS installation.

This patchset also exports the socket family, type and protocol as
read-only allowing bpf filters to deny a process in a cgroup the ability
to open specific types of AF_INET or AF_INET6 sockets.

v7
- comments from Alexei

v6
- add export of socket family, type and protocol
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Dec 2, 2016
2 parents 7f7bf16 + 554ae6e commit b5b5eca
Show file tree
Hide file tree
Showing 16 changed files with 559 additions and 46 deletions.
60 changes: 37 additions & 23 deletions include/linux/bpf-cgroup.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,31 +36,44 @@ void cgroup_bpf_update(struct cgroup *cgrp,
struct bpf_prog *prog,
enum bpf_attach_type type);

int __cgroup_bpf_run_filter(struct sock *sk,
struct sk_buff *skb,
enum bpf_attach_type type);

/* Wrappers for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled. */
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) \
({ \
int __ret = 0; \
if (cgroup_bpf_enabled) \
__ret = __cgroup_bpf_run_filter(sk, skb, \
BPF_CGROUP_INET_INGRESS); \
\
__ret; \
int __cgroup_bpf_run_filter_skb(struct sock *sk,
struct sk_buff *skb,
enum bpf_attach_type type);

int __cgroup_bpf_run_filter_sk(struct sock *sk,
enum bpf_attach_type type);

/*
 * Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled.
 *
 * Ingress hook: evaluates to the result of the skb filter for
 * BPF_CGROUP_INET_INGRESS, or to 0 when cgroup_bpf_enabled is false.
 */
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			\
	(cgroup_bpf_enabled ?						\
	 __cgroup_bpf_run_filter_skb(sk, skb,				\
				     BPF_CGROUP_INET_INGRESS) : 0)

#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) \
({ \
int __ret = 0; \
if (cgroup_bpf_enabled && sk && sk == skb->sk) { \
typeof(sk) __sk = sk_to_full_sk(sk); \
if (sk_fullsock(__sk)) \
__ret = __cgroup_bpf_run_filter(__sk, skb, \
BPF_CGROUP_INET_EGRESS); \
} \
__ret; \
/*
 * Egress hook: the skb filter for BPF_CGROUP_INET_EGRESS is only run when
 * cgroup_bpf_enabled is set, @sk is non-NULL and owns @skb (sk == skb->sk),
 * and the socket returned by sk_to_full_sk() passes sk_fullsock().
 * Evaluates to 0 in every other case.
 */
#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb)			\
({									\
	int __ret = 0;							\
									\
	if (cgroup_bpf_enabled && sk && sk == skb->sk) {		\
		typeof(sk) __sk = sk_to_full_sk(sk);			\
									\
		__ret = sk_fullsock(__sk) ?				\
			__cgroup_bpf_run_filter_skb(__sk, skb,		\
					BPF_CGROUP_INET_EGRESS) : 0;	\
	}								\
	__ret;								\
})

/*
 * Socket-creation hook: runs the BPF_CGROUP_INET_SOCK_CREATE program via
 * __cgroup_bpf_run_filter_sk() when cgroup_bpf_enabled is set and @sk is
 * non-NULL; evaluates to 0 otherwise.
 */
#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk)				\
	((cgroup_bpf_enabled && sk) ?					\
	 __cgroup_bpf_run_filter_sk(sk,					\
				    BPF_CGROUP_INET_SOCK_CREATE) : 0)

#else
Expand All @@ -72,6 +85,7 @@ static inline void cgroup_bpf_inherit(struct cgroup *cgrp,

/*
 * Stub hooks for the #else branch (the matching #endif is tagged
 * CONFIG_CGROUP_BPF): each one evaluates to 0 and never evaluates its
 * arguments, so callers that test the result for an error see success.
 */
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })

#endif /* CONFIG_CGROUP_BPF */

Expand Down
15 changes: 15 additions & 0 deletions include/net/sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,21 @@ struct sock {
* Because of non atomicity rules, all
* changes are protected by socket lock.
*/
/*
 * Zero-length marker whose offset is used by the cgroup sock BPF context
 * conversion: it loads one 32-bit word at offsetof(struct sock,
 * __sk_flags_offset), ANDs it with SK_FL_*_MASK and right-shifts it by
 * SK_FL_*_SHIFT to obtain the socket type and protocol.
 * The mask/shift pairs differ by bitfield endianness.
 * NOTE(review): assumes the type/protocol bitfields live in the word that
 * starts right after this marker - their declarations are outside this
 * hunk; confirm against the full struct.
 */
unsigned int __sk_flags_offset[0];
#ifdef __BIG_ENDIAN_BITFIELD
#define SK_FL_PROTO_SHIFT 16
#define SK_FL_PROTO_MASK 0x00ff0000

#define SK_FL_TYPE_SHIFT 0
#define SK_FL_TYPE_MASK 0x0000ffff
#else
#define SK_FL_PROTO_SHIFT 8
#define SK_FL_PROTO_MASK 0x0000ff00

#define SK_FL_TYPE_SHIFT 16
#define SK_FL_TYPE_MASK 0xffff0000
#endif

kmemcheck_bitfield_begin(flags);
unsigned int sk_padding : 2,
sk_no_check_tx : 1,
Expand Down
9 changes: 9 additions & 0 deletions include/uapi/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_XDP,
BPF_PROG_TYPE_PERF_EVENT,
BPF_PROG_TYPE_CGROUP_SKB,
BPF_PROG_TYPE_CGROUP_SOCK,
BPF_PROG_TYPE_LWT_IN,
BPF_PROG_TYPE_LWT_OUT,
BPF_PROG_TYPE_LWT_XMIT,
Expand All @@ -109,6 +110,7 @@ enum bpf_prog_type {
/* Attach points accepted by bpf_prog_attach() / bpf_prog_detach(). */
enum bpf_attach_type {
/* skb program (BPF_PROG_TYPE_CGROUP_SKB) run by BPF_CGROUP_RUN_PROG_INET_INGRESS() */
BPF_CGROUP_INET_INGRESS,
/* skb program (BPF_PROG_TYPE_CGROUP_SKB) run by BPF_CGROUP_RUN_PROG_INET_EGRESS() */
BPF_CGROUP_INET_EGRESS,
/* sock program (BPF_PROG_TYPE_CGROUP_SOCK) run by BPF_CGROUP_RUN_PROG_INET_SOCK() */
BPF_CGROUP_INET_SOCK_CREATE,
/* sentinel, not an attach point: must stay last */
__MAX_BPF_ATTACH_TYPE
};

Expand Down Expand Up @@ -567,6 +569,13 @@ enum bpf_ret_code {
/* >127 are reserved for prog type specific return codes */
};

/*
 * Context structure seen by BPF_PROG_TYPE_CGROUP_SOCK programs. Accesses
 * must be 4 bytes wide and are rewritten into loads/stores on the
 * underlying struct sock by sock_filter_convert_ctx_access().
 */
struct bpf_sock {
/* maps to sk_bound_dev_if; the only field a program may write */
__u32 bound_dev_if;
/* read-only; loaded from sk_family */
__u32 family;
/* read-only; extracted from the sock flags word with SK_FL_TYPE_MASK/SHIFT */
__u32 type;
/* read-only; extracted from the sock flags word with SK_FL_PROTO_MASK/SHIFT */
__u32 protocol;
};

/* User return codes for XDP prog type.
* A valid XDP program must return one of these defined values. All other
* return codes are reserved for future use. Unknown return codes will result
Expand Down
43 changes: 38 additions & 5 deletions kernel/bpf/cgroup.c
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ void __cgroup_bpf_update(struct cgroup *cgrp,
}

/**
* __cgroup_bpf_run_filter() - Run a program for packet filtering
* __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
* @sk: The socket sending or receiving traffic
* @skb: The skb that is being sent or received
* @type: The type of program to be executed
Expand All @@ -132,9 +132,9 @@ void __cgroup_bpf_update(struct cgroup *cgrp,
* This function will return %-EPERM if an attached program was found
* and if it returned != 1 during execution. In all other cases, 0 is returned.
*/
int __cgroup_bpf_run_filter(struct sock *sk,
struct sk_buff *skb,
enum bpf_attach_type type)
int __cgroup_bpf_run_filter_skb(struct sock *sk,
struct sk_buff *skb,
enum bpf_attach_type type)
{
struct bpf_prog *prog;
struct cgroup *cgrp;
Expand Down Expand Up @@ -164,4 +164,37 @@ int __cgroup_bpf_run_filter(struct sock *sk,

return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter);
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);

/**
 * __cgroup_bpf_run_filter_sk() - Run a program on a sock
 * @sk: sock structure to manipulate
 * @type: The type of program to be executed
 *
 * The socket passed in is expected to be of type INET or INET6.
 *
 * The program type passed in via @type must be suitable for sock
 * filtering. No further check is performed to assert that.
 *
 * The effective program for @type is looked up on the cgroup recorded in
 * @sk's cgroup data and executed under rcu_read_lock().
 *
 * This function will return %-EPERM if an attached program was found
 * and if it returned != 1 during execution. In all other cases, 0 is returned.
 */
int __cgroup_bpf_run_filter_sk(struct sock *sk,
			       enum bpf_attach_type type)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	struct bpf_prog *prog;
	int ret = 0;

	rcu_read_lock();

	prog = rcu_dereference(cgrp->bpf.effective[type]);
	/* Any verdict other than 1 denies the operation. */
	if (prog && BPF_PROG_RUN(prog, sk) != 1)
		ret = -EPERM;

	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
33 changes: 19 additions & 14 deletions kernel/bpf/syscall.c
Original file line number Diff line number Diff line change
Expand Up @@ -856,6 +856,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
{
struct bpf_prog *prog;
struct cgroup *cgrp;
enum bpf_prog_type ptype;

if (!capable(CAP_NET_ADMIN))
return -EPERM;
Expand All @@ -866,25 +867,28 @@ static int bpf_prog_attach(const union bpf_attr *attr)
switch (attr->attach_type) {
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
prog = bpf_prog_get_type(attr->attach_bpf_fd,
BPF_PROG_TYPE_CGROUP_SKB);
if (IS_ERR(prog))
return PTR_ERR(prog);

cgrp = cgroup_get_from_fd(attr->target_fd);
if (IS_ERR(cgrp)) {
bpf_prog_put(prog);
return PTR_ERR(cgrp);
}

cgroup_bpf_update(cgrp, prog, attr->attach_type);
cgroup_put(cgrp);
ptype = BPF_PROG_TYPE_CGROUP_SKB;
break;
case BPF_CGROUP_INET_SOCK_CREATE:
ptype = BPF_PROG_TYPE_CGROUP_SOCK;
break;

default:
return -EINVAL;
}

prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
if (IS_ERR(prog))
return PTR_ERR(prog);

cgrp = cgroup_get_from_fd(attr->target_fd);
if (IS_ERR(cgrp)) {
bpf_prog_put(prog);
return PTR_ERR(cgrp);
}

cgroup_bpf_update(cgrp, prog, attr->attach_type);
cgroup_put(cgrp);

return 0;
}

Expand All @@ -903,6 +907,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
switch (attr->attach_type) {
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
case BPF_CGROUP_INET_SOCK_CREATE:
cgrp = cgroup_get_from_fd(attr->target_fd);
if (IS_ERR(cgrp))
return PTR_ERR(cgrp);
Expand Down
83 changes: 83 additions & 0 deletions net/core/filter.c
Original file line number Diff line number Diff line change
Expand Up @@ -2818,6 +2818,32 @@ static bool lwt_is_valid_access(int off, int size,
return __is_valid_access(off, size, type);
}

/*
 * Validate a context access made by a cgroup sock program.
 *
 * Writes are accepted only at the offset of bpf_sock::bound_dev_if. Every
 * access (read or write) must lie inside struct bpf_sock, be aligned to
 * its own size and be exactly sizeof(__u32) wide. @reg_type is not used.
 */
static bool sock_filter_is_valid_access(int off, int size,
					enum bpf_access_type type,
					enum bpf_reg_type *reg_type)
{
	if (type == BPF_WRITE &&
	    off != offsetof(struct bpf_sock, bound_dev_if))
		return false;

	if (off < 0 || off + size > sizeof(struct bpf_sock))
		return false;

	/* The verifier guarantees that size > 0. */
	return off % size == 0 && size == sizeof(__u32);
}

static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
const struct bpf_prog *prog)
{
Expand Down Expand Up @@ -3076,6 +3102,51 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
return insn - insn_buf;
}

static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
int dst_reg, int src_reg,
int ctx_off,
struct bpf_insn *insn_buf,
struct bpf_prog *prog)
{
struct bpf_insn *insn = insn_buf;

switch (ctx_off) {
case offsetof(struct bpf_sock, bound_dev_if):
BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);

if (type == BPF_WRITE)
*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
offsetof(struct sock, sk_bound_dev_if));
else
*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
offsetof(struct sock, sk_bound_dev_if));
break;

case offsetof(struct bpf_sock, family):
BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);

*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
offsetof(struct sock, sk_family));
break;

case offsetof(struct bpf_sock, type):
*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
offsetof(struct sock, __sk_flags_offset));
*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_TYPE_MASK);
*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_TYPE_SHIFT);
break;

case offsetof(struct bpf_sock, protocol):
*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
offsetof(struct sock, __sk_flags_offset));
*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_PROTO_MASK);
*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_PROTO_SHIFT);
break;
}

return insn - insn_buf;
}

static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg,
int src_reg, int ctx_off,
struct bpf_insn *insn_buf,
Expand Down Expand Up @@ -3162,6 +3233,12 @@ static const struct bpf_verifier_ops lwt_xmit_ops = {
.gen_prologue = tc_cls_act_prologue,
};

/*
 * Verifier callbacks for cgroup sock programs: helper lookup reuses
 * sk_filter_func_proto, while context access checking and rewriting use
 * the sock_filter_* callbacks that operate on struct bpf_sock.
 */
static const struct bpf_verifier_ops cg_sock_ops = {
.get_func_proto = sk_filter_func_proto,
.is_valid_access = sock_filter_is_valid_access,
.convert_ctx_access = sock_filter_convert_ctx_access,
};

static struct bpf_prog_type_list sk_filter_type __read_mostly = {
.ops = &sk_filter_ops,
.type = BPF_PROG_TYPE_SOCKET_FILTER,
Expand Down Expand Up @@ -3202,13 +3279,19 @@ static struct bpf_prog_type_list lwt_xmit_type __read_mostly = {
.type = BPF_PROG_TYPE_LWT_XMIT,
};

/* Binds BPF_PROG_TYPE_CGROUP_SOCK to its verifier callbacks (cg_sock_ops). */
static struct bpf_prog_type_list cg_sock_type __read_mostly = {
	.ops	= &cg_sock_ops,
	.type	= BPF_PROG_TYPE_CGROUP_SOCK,
};

static int __init register_sk_filter_ops(void)
{
bpf_register_prog_type(&sk_filter_type);
bpf_register_prog_type(&sched_cls_type);
bpf_register_prog_type(&sched_act_type);
bpf_register_prog_type(&xdp_type);
bpf_register_prog_type(&cg_skb_type);
bpf_register_prog_type(&cg_sock_type);
bpf_register_prog_type(&lwt_in_type);
bpf_register_prog_type(&lwt_out_type);
bpf_register_prog_type(&lwt_xmit_type);
Expand Down
12 changes: 11 additions & 1 deletion net/ipv4/af_inet.c
Original file line number Diff line number Diff line change
Expand Up @@ -374,8 +374,18 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,

if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk);
if (err)
if (err) {
sk_common_release(sk);
goto out;
}
}

if (!kern) {
err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
if (err) {
sk_common_release(sk);
goto out;
}
}
out:
return err;
Expand Down
Loading

0 comments on commit b5b5eca

Please sign in to comment.