Skip to content

Commit

Permalink
Merge branch 'fix-bpf_redirect'
Browse files Browse the repository at this point in the history
Martin KaFai Lau says:

====================
bpf: Fix bpf_redirect to an ipip/ip6tnl dev

This patch set fixes a bug in bpf_redirect(dev, flags) when dev is an
ipip/ip6tnl device.  The current problem is that IP-EthHdr-IP is sent out
instead of IP-IP.

Patch 1 adds a dev->type test similar to dev_is_mac_header_xmit()
in act_mirred.c, which is only available in net-next.  We can consider
refactoring it once this patch is pulled into net-next from net.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Nov 13, 2016
2 parents 23dd831 + 90e0289 commit 79774d6
Show file tree
Hide file tree
Showing 7 changed files with 567 additions and 19 deletions.
15 changes: 15 additions & 0 deletions include/linux/netdevice.h
Original file line number Diff line number Diff line change
Expand Up @@ -3354,6 +3354,21 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
bool is_skb_forwardable(const struct net_device *dev,
const struct sk_buff *skb);

static __always_inline int ____dev_forward_skb(struct net_device *dev,
struct sk_buff *skb)
{
if (skb_orphan_frags(skb, GFP_ATOMIC) ||
unlikely(!is_skb_forwardable(dev, skb))) {
atomic_long_inc(&dev->rx_dropped);
kfree_skb(skb);
return NET_RX_DROP;
}

skb_scrub_packet(skb, true);
skb->priority = 0;
return 0;
}

void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);

extern int netdev_budget;
Expand Down
17 changes: 6 additions & 11 deletions net/core/dev.c
Original file line number Diff line number Diff line change
Expand Up @@ -1766,19 +1766,14 @@ EXPORT_SYMBOL_GPL(is_skb_forwardable);

/*
 * Forward @skb to @dev as an Ethernet frame.
 *
 * The forwardability check, drop accounting, scrub and priority reset are
 * delegated to ____dev_forward_skb(); they must not be repeated here.
 * Only when that helper succeeds is the Ethernet header consumed:
 * eth_type_trans() sets skb->protocol and skb_postpull_rcsum() adjusts
 * the receive checksum for the ETH_HLEN bytes at eth_hdr(skb).
 *
 * Returns 0 on success, or the non-zero value reported by
 * ____dev_forward_skb() (in which case the skb has already been freed).
 */
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int ret = ____dev_forward_skb(dev, skb);

	if (likely(!ret)) {
		skb->protocol = eth_type_trans(skb, dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	}

	return ret;
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);

Expand Down
68 changes: 60 additions & 8 deletions net/core/filter.c
Original file line number Diff line number Diff line change
Expand Up @@ -1628,6 +1628,19 @@ static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
return dev_forward_skb(dev, skb);
}

/*
 * Receive-side redirect of @skb to @dev without any link-layer header
 * processing.
 *
 * Runs ____dev_forward_skb(); a non-zero result is returned unchanged.
 * On success the skb is re-targeted at @dev and handed to netif_rx(),
 * whose return value is passed back to the caller.
 */
static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
				      struct sk_buff *skb)
{
	int err = ____dev_forward_skb(dev, skb);

	if (unlikely(err))
		return err;

	skb->dev = dev;
	return netif_rx(skb);
}

static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
{
int ret;
Expand All @@ -1647,6 +1660,51 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
return ret;
}

/*
 * Redirect @skb to @dev with the mac header removed first.
 *
 * The bytes between the mac header and the network header are pulled,
 * the mac header is popped and mac_len is reset.  The skb then goes to
 * __bpf_rx_skb_no_mac() when BPF_F_INGRESS is set in @flags, otherwise
 * to __bpf_tx_skb().  The chosen helper's return value is returned.
 */
static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
				 u32 flags)
{
	/* skb->mac_len is not set on normal egress, so derive the length
	 * from the header offsets instead.
	 */
	unsigned int mac_hdr_len = skb->network_header - skb->mac_header;

	__skb_pull(skb, mac_hdr_len);

	/* At ingress, the mac header has already been pulled once.
	 * At egress, skb_postpull_rcsum has to be done in case that
	 * the skb is originated from ingress (i.e. a forwarded skb)
	 * to ensure that rcsum starts at net header.
	 */
	if (!skb_at_tc_ingress(skb))
		skb_postpull_rcsum(skb, skb_mac_header(skb), mac_hdr_len);
	skb_pop_mac_header(skb);
	skb_reset_mac_len(skb);

	if (flags & BPF_F_INGRESS)
		return __bpf_rx_skb_no_mac(dev, skb);

	return __bpf_tx_skb(dev, skb);
}

/*
 * Default redirect path, used for every device type that
 * __bpf_redirect() does not route to __bpf_redirect_no_mac().
 *
 * Calls bpf_push_mac_rcsum() on @skb, then passes it to __bpf_rx_skb()
 * when BPF_F_INGRESS is set in @flags, otherwise to __bpf_tx_skb().
 * Returns whatever the chosen helper returns.
 */
static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
				 u32 flags)
{
	bpf_push_mac_rcsum(skb);

	if (flags & BPF_F_INGRESS)
		return __bpf_rx_skb(dev, skb);

	return __bpf_tx_skb(dev, skb);
}

/*
 * Redirect @skb to @dev, choosing the path by dev->type.
 *
 * The device types listed in the switch take the
 * __bpf_redirect_no_mac() path; every other type takes
 * __bpf_redirect_common().  @flags is passed through untouched and the
 * selected helper's return value is returned.
 */
static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
			  u32 flags)
{
	bool no_mac;

	switch (dev->type) {
	case ARPHRD_TUNNEL:
	case ARPHRD_TUNNEL6:
	case ARPHRD_SIT:
	case ARPHRD_IPGRE:
	case ARPHRD_VOID:
	case ARPHRD_NONE:
		no_mac = true;
		break;
	default:
		no_mac = false;
		break;
	}

	if (no_mac)
		return __bpf_redirect_no_mac(skb, dev, flags);

	return __bpf_redirect_common(skb, dev, flags);
}

BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
{
struct net_device *dev;
Expand Down Expand Up @@ -1675,10 +1733,7 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
return -ENOMEM;
}

bpf_push_mac_rcsum(clone);

return flags & BPF_F_INGRESS ?
__bpf_rx_skb(dev, clone) : __bpf_tx_skb(dev, clone);
return __bpf_redirect(clone, dev, flags);
}

static const struct bpf_func_proto bpf_clone_redirect_proto = {
Expand Down Expand Up @@ -1722,10 +1777,7 @@ int skb_do_redirect(struct sk_buff *skb)
return -EINVAL;
}

bpf_push_mac_rcsum(skb);

return ri->flags & BPF_F_INGRESS ?
__bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
return __bpf_redirect(skb, dev, ri->flags);
}

static const struct bpf_func_proto bpf_redirect_proto = {
Expand Down
4 changes: 4 additions & 0 deletions samples/bpf/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ hostprogs-y += xdp2
hostprogs-y += test_current_task_under_cgroup
hostprogs-y += trace_event
hostprogs-y += sampleip
hostprogs-y += tc_l2_redirect

test_verifier-objs := test_verifier.o libbpf.o
test_maps-objs := test_maps.o libbpf.o
Expand Down Expand Up @@ -56,6 +57,7 @@ test_current_task_under_cgroup-objs := bpf_load.o libbpf.o \
test_current_task_under_cgroup_user.o
trace_event-objs := bpf_load.o libbpf.o trace_event_user.o
sampleip-objs := bpf_load.o libbpf.o sampleip_user.o
tc_l2_redirect-objs := bpf_load.o libbpf.o tc_l2_redirect_user.o

# Tell kbuild to always build the programs
always := $(hostprogs-y)
Expand All @@ -72,6 +74,7 @@ always += test_probe_write_user_kern.o
always += trace_output_kern.o
always += tcbpf1_kern.o
always += tcbpf2_kern.o
always += tc_l2_redirect_kern.o
always += lathist_kern.o
always += offwaketime_kern.o
always += spintest_kern.o
Expand Down Expand Up @@ -111,6 +114,7 @@ HOSTLOADLIBES_xdp2 += -lelf
HOSTLOADLIBES_test_current_task_under_cgroup += -lelf
HOSTLOADLIBES_trace_event += -lelf
HOSTLOADLIBES_sampleip += -lelf
HOSTLOADLIBES_tc_l2_redirect += -lelf

# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
# make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
Expand Down
173 changes: 173 additions & 0 deletions samples/bpf/tc_l2_redirect.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
#!/bin/bash

# Tools under test; callers may point $TC / $IP at alternative binaries.
# An unset or empty value falls back to the command found in $PATH.
: "${TC:=tc}"
: "${IP:=ip}"

# Sample binaries produced by the samples/bpf build, relative to the cwd.
REDIRECT_BPF='./tc_l2_redirect_kern.o'
REDIRECT_USER='./tc_l2_redirect'

# Remember the host-wide sysctls that the tests overwrite, so that
# cleanup() can put them back.
IPV6_FORWARDING=$(< /proc/sys/net/ipv6/conf/all/forwarding)
RP_FILTER=$(< /proc/sys/net/ipv4/conf/all/rp_filter)

# config_common <tun_type>
#
# Builds the topology shared by every test: namespaces ns1 and ns2, each
# attached to the current namespace through a veth pair (ve1<->vens1,
# ve2<->vens2), plus the tunnel devices, routes, tc filters and sysctls
# the tests rely on.
#
#   tun_type  "ipip" routes ns2's return traffic over ipt2; any other
#             value routes it over ip6t2.
function config_common {
local tun_type=$1

# Namespaces and veth pairs; the ve* ends stay in the current namespace,
# the vens* ends are moved into ns1/ns2.
$IP netns add ns1
$IP netns add ns2
$IP link add ve1 type veth peer name vens1
$IP link add ve2 type veth peer name vens2
$IP link set dev ve1 up
$IP link set dev ve2 up
$IP link set dev ve1 mtu 1500
$IP link set dev ve2 mtu 1500
$IP link set dev vens1 netns ns1
$IP link set dev vens2 netns ns2

# ns1: addresses on vens1 and default routes via the ve1 addresses.
$IP -n ns1 link set dev lo up
$IP -n ns1 link set dev vens1 up
$IP -n ns1 addr add 10.1.1.101/24 dev vens1
$IP -n ns1 addr add 2401:db01::65/64 dev vens1 nodad
$IP -n ns1 route add default via 10.1.1.1 dev vens1
$IP -n ns1 route add default via 2401:db01::1 dev vens1

# ns2: addresses on vens2, extra addresses (10.10.1.102, 2401:face::66)
# on lo, and one ipip plus one ip6tnl device whose remote ends are the
# ve2 addresses.
$IP -n ns2 link set dev lo up
$IP -n ns2 link set dev vens2 up
$IP -n ns2 addr add 10.2.1.102/24 dev vens2
$IP -n ns2 addr add 2401:db02::66/64 dev vens2 nodad
$IP -n ns2 addr add 10.10.1.102 dev lo
$IP -n ns2 addr add 2401:face::66/64 dev lo nodad
$IP -n ns2 link add ipt2 type ipip local 10.2.1.102 remote 10.2.1.1
$IP -n ns2 link add ip6t2 type ip6tnl mode any local 2401:db02::66 remote 2401:db02::1
$IP -n ns2 link set dev ipt2 up
$IP -n ns2 link set dev ip6t2 up
# Attach the drop_non_tun_vip section of the BPF object to vens2 ingress.
$IP netns exec ns2 $TC qdisc add dev vens2 clsact
$IP netns exec ns2 $TC filter add dev vens2 ingress bpf da obj $REDIRECT_BPF sec drop_non_tun_vip
# Route ns2's replies towards ns1 through the tunnel that matches
# tun_type, and switch off rp_filter inside ns2 for that tunnel.
if [[ $tun_type == "ipip" ]]; then
$IP -n ns2 route add 10.1.1.0/24 dev ipt2
$IP netns exec ns2 sysctl -q -w net.ipv4.conf.all.rp_filter=0
$IP netns exec ns2 sysctl -q -w net.ipv4.conf.ipt2.rp_filter=0
else
$IP -n ns2 route add 10.1.1.0/24 dev ip6t2
$IP -n ns2 route add 2401:db01::/64 dev ip6t2
$IP netns exec ns2 sysctl -q -w net.ipv4.conf.all.rp_filter=0
$IP netns exec ns2 sysctl -q -w net.ipv4.conf.ip6t2.rp_filter=0
fi

# Addresses for the ve* ends in the current namespace.
$IP addr add 10.1.1.1/24 dev ve1
$IP addr add 2401:db01::1/64 dev ve1 nodad
$IP addr add 10.2.1.1/24 dev ve2
$IP addr add 2401:db02::1/64 dev ve2 nodad

# Attach the l2_to_iptun_ingress_forward section to ve2 ingress.
$TC qdisc add dev ve2 clsact
$TC filter add dev ve2 ingress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_forward

# Host-wide settings; cleanup() restores both from the saved values.
sysctl -q -w net.ipv4.conf.all.rp_filter=0
sysctl -q -w net.ipv6.conf.all.forwarding=1
}

# cleanup
#
# Best-effort teardown of everything the tests may have created.  Errors
# are ignored (errexit is off for the duration) because some objects may
# not exist.  Also restores the saved host sysctls and removes the pinned
# tun_iface map.  Leaves the shell with errexit enabled, and with xtrace
# enabled when $DEBUG is non-empty.
function cleanup {
	local ns link

	set +e
	if [[ -n $DEBUG ]]; then
		set +x
	fi

	for ns in ns1 ns2; do
		$IP netns delete $ns >& /dev/null
	done
	for link in ve1 ve2 ipt ip6t; do
		$IP link del $link >& /dev/null
	done

	sysctl -q -w net.ipv4.conf.all.rp_filter=$RP_FILTER
	sysctl -q -w net.ipv6.conf.all.forwarding=$IPV6_FORWARDING
	rm -f /sys/fs/bpf/tc/globals/tun_iface

	if [[ -n $DEBUG ]]; then
		set -x
	fi
	set -e
}

# l2_to_ipip <dir>
#
# Redirect test towards an "external" ipip device named ipt.
#
#   dir  "egress" attaches the redirect program to ve2 egress and also
#        pings from the current namespace; any other value attaches it
#        to ve1 ingress.
#
# Prints "l2_to_ipip <dir>: " up front and "OK" after cleanup.
function l2_to_ipip {
	local dir=$1

	echo -n "l2_to_ipip $dir: "

	config_common ipip

	$IP link add ipt type ipip external
	$IP link set dev ipt up
	sysctl -q -w net.ipv4.conf.ipt.rp_filter=0
	sysctl -q -w net.ipv4.conf.ipt.forwarding=1

	case $dir in
	egress)
		$IP route add 10.10.1.0/24 via 10.2.1.102 dev ve2
		$TC filter add dev ve2 egress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_redirect
		sysctl -q -w net.ipv4.conf.ve1.forwarding=1
		;;
	*)
		$TC qdisc add dev ve1 clsact
		$TC filter add dev ve1 ingress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_redirect
		;;
	esac

	# Hand the ifindex of ipt to the pinned tun_iface map.
	$REDIRECT_USER -U /sys/fs/bpf/tc/globals/tun_iface -i $(< /sys/class/net/ipt/ifindex)

	$IP netns exec ns1 ping -c1 10.10.1.102 >& /dev/null

	if [[ $dir == egress ]]; then
		# test direct egress to ve2 (i.e. not forwarding from
		# ve1 to ve2).
		ping -c1 10.10.1.102 >& /dev/null
	fi

	cleanup

	echo "OK"
}

# l2_to_ip6tnl <dir>
#
# Redirect test towards an "external" ip6tnl device named ip6t, covering
# both an IPv4 and an IPv6 ping.
#
#   dir  "egress" attaches the redirect program to ve2 egress and also
#        pings from the current namespace; any other value attaches it
#        to ve1 ingress.
#
# Prints "l2_to_ip6tnl <dir>: " up front and "OK" after cleanup.
function l2_to_ip6tnl {
	local dir=$1

	echo -n "l2_to_ip6tnl $dir: "

	config_common ip6tnl

	$IP link add ip6t type ip6tnl mode any external
	$IP link set dev ip6t up
	sysctl -q -w net.ipv4.conf.ip6t.rp_filter=0
	sysctl -q -w net.ipv4.conf.ip6t.forwarding=1

	case $dir in
	egress)
		$IP route add 10.10.1.0/24 via 10.2.1.102 dev ve2
		$IP route add 2401:face::/64 via 2401:db02::66 dev ve2
		$TC filter add dev ve2 egress bpf da obj $REDIRECT_BPF sec l2_to_ip6tun_ingress_redirect
		sysctl -q -w net.ipv4.conf.ve1.forwarding=1
		;;
	*)
		$TC qdisc add dev ve1 clsact
		$TC filter add dev ve1 ingress bpf da obj $REDIRECT_BPF sec l2_to_ip6tun_ingress_redirect
		;;
	esac

	# Hand the ifindex of ip6t to the pinned tun_iface map.
	$REDIRECT_USER -U /sys/fs/bpf/tc/globals/tun_iface -i $(< /sys/class/net/ip6t/ifindex)

	$IP netns exec ns1 ping -c1 10.10.1.102 >& /dev/null
	$IP netns exec ns1 ping -6 -c1 2401:face::66 >& /dev/null

	if [[ $dir == egress ]]; then
		# test direct egress to ve2 (i.e. not forwarding from
		# ve1 to ve2).
		ping -c1 10.10.1.102 >& /dev/null
		ping -6 -c1 2401:face::66 >& /dev/null
	fi

	cleanup

	echo "OK"
}

# Start from a clean slate; cleanup also leaves errexit enabled, so any
# failing command below aborts the run.
cleanup

# Usage: tc_l2_redirect.sh [test_names [test_dirs]]
# Each argument is a whitespace-separated list; an omitted argument
# selects every test / both directions.
test_names=${1-"l2_to_ipip l2_to_ip6tnl"}
test_dirs=${2-"ingress egress"}

for test_fn in $test_names; do
	for test_dir in $test_dirs; do
		$test_fn $test_dir
	done
done
Loading

0 comments on commit 79774d6

Please sign in to comment.