tun: add eBPF based queue selection method
This patch introduces an eBPF-based queue selection method. With it,
the queue steering policy can be offloaded entirely to userspace through a
new ioctl, TUNSETSTEERINGEBPF.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Jason Wang authored and David S. Miller committed Dec 5, 2017
1 parent f520957 commit 96f8406
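
For illustration only (not part of the commit): a minimal userspace sketch of how the new ioctl might be driven. It loads a trivial BPF_PROG_TYPE_SOCKET_FILTER program through the bpf(2) syscall and attaches it to an already-configured IFF_MULTI_QUEUE tun fd. The helper names load_steering_prog(), attach_steering() and detach_steering() are hypothetical, and error handling is abbreviated.

/* Hypothetical usage sketch (not from the patch): attach/detach a
 * steering program on an already-configured IFF_MULTI_QUEUE tun fd.
 */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/bpf.h>
#include <linux/if_tun.h>

#ifndef TUNSETSTEERINGEBPF
#define TUNSETSTEERINGEBPF _IOR('T', 224, int)	/* value added by this patch */
#endif

static int load_steering_prog(void)
{
	/* Trivial program: "return 0", i.e. always steer to queue 0.
	 * tun reduces the return value modulo the number of queues.
	 */
	struct bpf_insn insns[] = {
		{ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 0 },
		{ .code = BPF_JMP | BPF_EXIT },
	};
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
	attr.insns     = (uintptr_t)insns;
	attr.insn_cnt  = sizeof(insns) / sizeof(insns[0]);
	attr.license   = (uintptr_t)"GPL";

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}

int attach_steering(int tun_fd)
{
	int prog_fd = load_steering_prog();

	if (prog_fd < 0)
		return -1;
	/* The ioctl copies a single int (the program fd) from userspace. */
	return ioctl(tun_fd, TUNSETSTEERINGEBPF, &prog_fd);
}

int detach_steering(int tun_fd)
{
	int fd = -1;	/* fd == -1 removes the current steering program */

	return ioctl(tun_fd, TUNSETSTEERINGEBPF, &fd);
}

Passing an fd of -1 through the same ioctl detaches the current program, matching the fd == -1 branch of tun_set_steering_ebpf() in the diff below.
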
Showing 2 changed files with 123 additions and 23 deletions.
145 changes: 122 additions & 23 deletions drivers/net/tun.c
@@ -195,6 +195,11 @@ struct tun_flow_entry {

#define TUN_NUM_FLOW_ENTRIES 1024

struct tun_steering_prog {
struct rcu_head rcu;
struct bpf_prog *prog;
};

/* Since the socket were moved to tun_file, to preserve the behavior of persist
* device, socket filter, sndbuf and vnet header size were restore when the
* file were attached to a persist device.
@@ -232,6 +237,7 @@ struct tun_struct {
u32 rx_batched;
struct tun_pcpu_stats __percpu *pcpu_stats;
struct bpf_prog __rcu *xdp_prog;
struct tun_steering_prog __rcu *steering_prog;
};

static int tun_napi_receive(struct napi_struct *napi, int budget)
@@ -537,15 +543,12 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
* different rxq no. here. If we could not get rxhash, then we would
* hope the rxq no. may help here.
*/
static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
void *accel_priv, select_queue_fallback_t fallback)
static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{
struct tun_struct *tun = netdev_priv(dev);
struct tun_flow_entry *e;
u32 txq = 0;
u32 numqueues = 0;

rcu_read_lock();
numqueues = READ_ONCE(tun->numqueues);

txq = __skb_get_hash_symmetric(skb);
@@ -563,10 +566,37 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
txq -= numqueues;
}

rcu_read_unlock();
return txq;
}

static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{
struct tun_steering_prog *prog;
u16 ret = 0;

prog = rcu_dereference(tun->steering_prog);
if (prog)
ret = bpf_prog_run_clear_cb(prog->prog, skb);

return ret % tun->numqueues;
}

static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
void *accel_priv, select_queue_fallback_t fallback)
{
struct tun_struct *tun = netdev_priv(dev);
u16 ret;

rcu_read_lock();
if (rcu_dereference(tun->steering_prog))
ret = tun_ebpf_select_queue(tun, skb);
else
ret = tun_automq_select_queue(tun, skb);
rcu_read_unlock();

return ret;
}

static inline bool tun_not_capable(struct tun_struct *tun)
{
const struct cred *cred = current_cred();
@@ -933,23 +963,10 @@ static int tun_net_close(struct net_device *dev)
}

/* Net device start xmit */
static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
{
struct tun_struct *tun = netdev_priv(dev);
int txq = skb->queue_mapping;
struct tun_file *tfile;
u32 numqueues = 0;

rcu_read_lock();
tfile = rcu_dereference(tun->tfiles[txq]);
numqueues = READ_ONCE(tun->numqueues);

/* Drop packet if interface is not attached */
if (txq >= numqueues)
goto drop;

#ifdef CONFIG_RPS
if (numqueues == 1 && static_key_false(&rps_needed)) {
if (tun->numqueues == 1 && static_key_false(&rps_needed)) {
/* Select queue was not called for the skbuff, so we extract the
* RPS hash and save it into the flow_table here.
*/
@@ -965,6 +982,26 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
}
}
#endif
}

/* Net device start xmit */
static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct tun_struct *tun = netdev_priv(dev);
int txq = skb->queue_mapping;
struct tun_file *tfile;
u32 numqueues = 0;

rcu_read_lock();
tfile = rcu_dereference(tun->tfiles[txq]);
numqueues = READ_ONCE(tun->numqueues);

/* Drop packet if interface is not attached */
if (txq >= numqueues)
goto drop;

if (!rcu_dereference(tun->steering_prog))
tun_automq_xmit(tun, skb);

tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);

@@ -1547,7 +1584,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
int copylen;
bool zerocopy = false;
int err;
u32 rxhash;
u32 rxhash = 0;
int skb_xdp = 1;
bool frags = tun_napi_frags_enabled(tun);

@@ -1735,7 +1772,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
rcu_read_unlock();
}

rxhash = __skb_get_hash_symmetric(skb);
rcu_read_lock();
if (!rcu_dereference(tun->steering_prog))
rxhash = __skb_get_hash_symmetric(skb);
rcu_read_unlock();

if (frags) {
/* Exercise flow dissector code path. */
@@ -1779,7 +1819,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
u64_stats_update_end(&stats->syncp);
put_cpu_ptr(stats);

tun_flow_update(tun, rxhash, tfile);
if (rxhash)
tun_flow_update(tun, rxhash, tfile);

return total_len;
}

@@ -1987,6 +2029,36 @@ static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
return ret;
}

static void tun_steering_prog_free(struct rcu_head *rcu)
{
struct tun_steering_prog *prog = container_of(rcu,
struct tun_steering_prog, rcu);

bpf_prog_destroy(prog->prog);
kfree(prog);
}

static int __tun_set_steering_ebpf(struct tun_struct *tun,
struct bpf_prog *prog)
{
struct tun_steering_prog *old, *new = NULL;

if (prog) {
new = kmalloc(sizeof(*new), GFP_KERNEL);
if (!new)
return -ENOMEM;
new->prog = prog;
}

old = rtnl_dereference(tun->steering_prog);
rcu_assign_pointer(tun->steering_prog, new);

if (old)
call_rcu(&old->rcu, tun_steering_prog_free);

return 0;
}

static void tun_free_netdev(struct net_device *dev)
{
struct tun_struct *tun = netdev_priv(dev);
@@ -1995,6 +2067,9 @@ static void tun_free_netdev(struct net_device *dev)
free_percpu(tun->pcpu_stats);
tun_flow_uninit(tun);
security_tun_dev_free_security(tun->security);
rtnl_lock();
__tun_set_steering_ebpf(tun, NULL);
rtnl_unlock();
}

static void tun_setup(struct net_device *dev)
@@ -2283,6 +2358,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
tun->filter_attached = false;
tun->sndbuf = tfile->socket.sk->sk_sndbuf;
tun->rx_batched = 0;
RCU_INIT_POINTER(tun->steering_prog, NULL);

tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
if (!tun->pcpu_stats) {
@@ -2475,6 +2551,25 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
return ret;
}

static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
{
struct bpf_prog *prog;
int fd;

if (copy_from_user(&fd, data, sizeof(fd)))
return -EFAULT;

if (fd == -1) {
prog = NULL;
} else {
prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
if (IS_ERR(prog))
return PTR_ERR(prog);
}

return __tun_set_steering_ebpf(tun, prog);
}

static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
unsigned long arg, int ifreq_len)
{
@@ -2751,6 +2846,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
ret = 0;
break;

case TUNSETSTEERINGEBPF:
ret = tun_set_steering_ebpf(tun, argp);
break;

default:
ret = -EINVAL;
break;
1 change: 1 addition & 0 deletions include/uapi/linux/if_tun.h
@@ -57,6 +57,7 @@
*/
#define TUNSETVNETBE _IOW('T', 222, int)
#define TUNGETVNETBE _IOR('T', 223, int)
#define TUNSETSTEERINGEBPF _IOR('T', 224, int)

/* TUNSETIFF ifr flags */
#define IFF_TUN 0x0001
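
A closing illustration (again, not part of the commit): since tun_ebpf_select_queue() truncates the program's return value to u16 and reduces it modulo tun->numqueues, a steering program only has to return a queue index, and out-of-range values wrap around. A hypothetical steering program in restricted C for the clang BPF target might look like the sketch below; the SEC() macro, the file name and the build command are assumptions, not something the commit provides.

/* steer_kern.c - hypothetical steering program (not from the patch).
 * Build (assumed): clang -O2 -target bpf -c steer_kern.c -o steer_kern.o
 * Load as BPF_PROG_TYPE_SOCKET_FILTER and attach with TUNSETSTEERINGEBPF.
 */
#include <linux/bpf.h>

#define SEC(name) __attribute__((section(name), used))

SEC("socket")
int tun_steer(struct __sk_buff *skb)
{
	/* The return value is used as the queue index; tun truncates it
	 * to u16 and takes it modulo the number of attached queues.
	 * skb->hash may be zero if the stack has not computed a flow
	 * hash for this packet.
	 */
	return skb->hash;
}

Because the program runs as an ordinary socket filter over the skb, any policy expressible there (flow hash, protocol fields, map lookups) can drive queue selection from userspace without further kernel changes, which is the point of offloading the steering policy.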
