Skip to content

Commit

Permalink
Merge branch 'vxlan-fix-npds-when-using-nexthop-objects'
Browse files Browse the repository at this point in the history
Ido Schimmel says:

====================
vxlan: Fix NPDs when using nexthop objects

With FDB nexthop groups, VXLAN FDB entries do not necessarily point to
a remote destination but rather to an FDB nexthop group. This means that
first_remote_{rcu,rtnl}() can return NULL and a few places in the driver
were not ready for that, resulting in NULL pointer dereferences.
Patches #1-#2 fix these NPDs.

Note that vxlan_fdb_find_uc() still dereferences the remote returned by
first_remote_rcu() without checking that it is not NULL, but this
function is only invoked by a single driver which vetoes the creation of
FDB nexthop groups. I will patch this in net-next to make the code less
fragile.

Patch #3 adds a selftests which exercises these code paths and tests
basic Tx functionality with FDB nexthop groups. I verified that the test
crashes the kernel without the first two patches.
====================

Link: https://patch.msgid.link/20250901065035.159644-1-idosch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
  • Loading branch information
Jakub Kicinski committed Sep 2, 2025
2 parents a7195a3 + 2c9fb92 commit 41ec374
Show file tree
Hide file tree
Showing 4 changed files with 237 additions and 9 deletions.
18 changes: 12 additions & 6 deletions drivers/net/vxlan/vxlan_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -1445,6 +1445,10 @@ static enum skb_drop_reason vxlan_snoop(struct net_device *dev,
if (READ_ONCE(f->updated) != now)
WRITE_ONCE(f->updated, now);

/* Don't override an fdb with nexthop with a learnt entry */
if (rcu_access_pointer(f->nh))
return SKB_DROP_REASON_VXLAN_ENTRY_EXISTS;

if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) &&
rdst->remote_ifindex == ifindex))
return SKB_NOT_DROPPED_YET;
Expand All @@ -1453,10 +1457,6 @@ static enum skb_drop_reason vxlan_snoop(struct net_device *dev,
if (f->state & (NUD_PERMANENT | NUD_NOARP))
return SKB_DROP_REASON_VXLAN_ENTRY_EXISTS;

/* Don't override an fdb with nexthop with a learnt entry */
if (rcu_access_pointer(f->nh))
return SKB_DROP_REASON_VXLAN_ENTRY_EXISTS;

if (net_ratelimit())
netdev_info(dev,
"%pM migrated from %pIS to %pIS\n",
Expand Down Expand Up @@ -1877,6 +1877,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
n = neigh_lookup(&arp_tbl, &tip, dev);

if (n) {
struct vxlan_rdst *rdst = NULL;
struct vxlan_fdb *f;
struct sk_buff *reply;

Expand All @@ -1887,7 +1888,9 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)

rcu_read_lock();
f = vxlan_find_mac_tx(vxlan, n->ha, vni);
if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
if (f)
rdst = first_remote_rcu(f);
if (rdst && vxlan_addr_any(&rdst->remote_ip)) {
/* bridge-local neighbor */
neigh_release(n);
rcu_read_unlock();
Expand Down Expand Up @@ -2044,6 +2047,7 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev);

if (n) {
struct vxlan_rdst *rdst = NULL;
struct vxlan_fdb *f;
struct sk_buff *reply;

Expand All @@ -2053,7 +2057,9 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
}

f = vxlan_find_mac_tx(vxlan, n->ha, vni);
if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
if (f)
rdst = first_remote_rcu(f);
if (rdst && vxlan_addr_any(&rdst->remote_ip)) {
/* bridge-local neighbor */
neigh_release(n);
goto out;
Expand Down
4 changes: 1 addition & 3 deletions drivers/net/vxlan/vxlan_private.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,7 @@ static inline struct hlist_head *vs_head(struct net *net, __be16 port)
return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
}

/* First remote destination for a forwarding entry.
* Guaranteed to be non-NULL because remotes are never deleted.
*/
/* First remote destination for a forwarding entry. */
static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
{
if (rcu_access_pointer(fdb->nh))
Expand Down
1 change: 1 addition & 0 deletions tools/testing/selftests/net/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ TEST_GEN_PROGS += bind_wildcard
TEST_GEN_PROGS += bind_timewait
TEST_PROGS += test_vxlan_mdb.sh
TEST_PROGS += test_bridge_neigh_suppress.sh
TEST_PROGS += test_vxlan_nh.sh
TEST_PROGS += test_vxlan_nolocalbypass.sh
TEST_PROGS += test_bridge_backup_port.sh
TEST_PROGS += test_neigh.sh
Expand Down
223 changes: 223 additions & 0 deletions tools/testing/selftests/net/test_vxlan_nh.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0

source lib.sh
TESTS="
basic_tx_ipv4
basic_tx_ipv6
learning
proxy_ipv4
proxy_ipv6
"
VERBOSE=0

################################################################################
# Utilities

run_cmd()
{
local cmd="$1"
local out
local stderr="2>/dev/null"

if [ "$VERBOSE" = "1" ]; then
echo "COMMAND: $cmd"
stderr=
fi

out=$(eval "$cmd" "$stderr")
rc=$?
if [ "$VERBOSE" -eq 1 ] && [ -n "$out" ]; then
echo " $out"
fi

return $rc
}

################################################################################
# Cleanup

exit_cleanup_all()
{
cleanup_all_ns
exit "${EXIT_STATUS}"
}

################################################################################
# Tests

nh_stats_get()
{
ip -n "$ns1" -s -j nexthop show id 10 | jq ".[][\"group_stats\"][][\"packets\"]"
}

tc_stats_get()
{
tc_rule_handle_stats_get "dev dummy1 egress" 101 ".packets" "-n $ns1"
}

basic_tx_common()
{
local af_str=$1; shift
local proto=$1; shift
local local_addr=$1; shift
local plen=$1; shift
local remote_addr=$1; shift

RET=0

# Test basic Tx functionality. Check that stats are incremented on
# both the FDB nexthop group and the egress device.

run_cmd "ip -n $ns1 link add name dummy1 up type dummy"
run_cmd "ip -n $ns1 route add $remote_addr/$plen dev dummy1"
run_cmd "tc -n $ns1 qdisc add dev dummy1 clsact"
run_cmd "tc -n $ns1 filter add dev dummy1 egress proto $proto pref 1 handle 101 flower ip_proto udp dst_ip $remote_addr dst_port 4789 action pass"

run_cmd "ip -n $ns1 address add $local_addr/$plen dev lo"

run_cmd "ip -n $ns1 nexthop add id 1 via $remote_addr fdb"
run_cmd "ip -n $ns1 nexthop add id 10 group 1 fdb"

run_cmd "ip -n $ns1 link add name vx0 up type vxlan id 10010 local $local_addr dstport 4789"
run_cmd "bridge -n $ns1 fdb add 00:11:22:33:44:55 dev vx0 self static nhid 10"

run_cmd "ip netns exec $ns1 mausezahn vx0 -a own -b 00:11:22:33:44:55 -c 1 -q"

busywait "$BUSYWAIT_TIMEOUT" until_counter_is "== 1" nh_stats_get > /dev/null
check_err $? "FDB nexthop group stats did not increase"

busywait "$BUSYWAIT_TIMEOUT" until_counter_is "== 1" tc_stats_get > /dev/null
check_err $? "tc filter stats did not increase"

log_test "VXLAN FDB nexthop: $af_str basic Tx"
}

basic_tx_ipv4()
{
basic_tx_common "IPv4" ipv4 192.0.2.1 32 192.0.2.2
}

basic_tx_ipv6()
{
basic_tx_common "IPv6" ipv6 2001:db8:1::1 128 2001:db8:1::2
}

learning()
{
RET=0

# When learning is enabled on the VXLAN device, an incoming packet
# might try to refresh an FDB entry that points to an FDB nexthop group
# instead of an ordinary remote destination. Check that the kernel does
# not crash in this situation.

run_cmd "ip -n $ns1 address add 192.0.2.1/32 dev lo"
run_cmd "ip -n $ns1 address add 192.0.2.2/32 dev lo"

run_cmd "ip -n $ns1 nexthop add id 1 via 192.0.2.3 fdb"
run_cmd "ip -n $ns1 nexthop add id 10 group 1 fdb"

run_cmd "ip -n $ns1 link add name vx0 up type vxlan id 10010 local 192.0.2.1 dstport 12345 localbypass"
run_cmd "ip -n $ns1 link add name vx1 up type vxlan id 10020 local 192.0.2.2 dstport 54321 learning"

run_cmd "bridge -n $ns1 fdb add 00:11:22:33:44:55 dev vx0 self static dst 192.0.2.2 port 54321 vni 10020"
run_cmd "bridge -n $ns1 fdb add 00:aa:bb:cc:dd:ee dev vx1 self static nhid 10"

run_cmd "ip netns exec $ns1 mausezahn vx0 -a 00:aa:bb:cc:dd:ee -b 00:11:22:33:44:55 -c 1 -q"

log_test "VXLAN FDB nexthop: learning"
}

proxy_common()
{
local af_str=$1; shift
local local_addr=$1; shift
local plen=$1; shift
local remote_addr=$1; shift
local neigh_addr=$1; shift
local ping_cmd=$1; shift

RET=0

# When the "proxy" option is enabled on the VXLAN device, the device
# will suppress ARP requests and IPv6 Neighbor Solicitation messages if
# it is able to reply on behalf of the remote host. That is, if a
# matching and valid neighbor entry is configured on the VXLAN device
# whose MAC address is not behind the "any" remote (0.0.0.0 / ::). The
# FDB entry for the neighbor's MAC address might point to an FDB
# nexthop group instead of an ordinary remote destination. Check that
# the kernel does not crash in this situation.

run_cmd "ip -n $ns1 address add $local_addr/$plen dev lo"

run_cmd "ip -n $ns1 nexthop add id 1 via $remote_addr fdb"
run_cmd "ip -n $ns1 nexthop add id 10 group 1 fdb"

run_cmd "ip -n $ns1 link add name vx0 up type vxlan id 10010 local $local_addr dstport 4789 proxy"

run_cmd "ip -n $ns1 neigh add $neigh_addr lladdr 00:11:22:33:44:55 nud perm dev vx0"

run_cmd "bridge -n $ns1 fdb add 00:11:22:33:44:55 dev vx0 self static nhid 10"

run_cmd "ip netns exec $ns1 $ping_cmd"

log_test "VXLAN FDB nexthop: $af_str proxy"
}

proxy_ipv4()
{
proxy_common "IPv4" 192.0.2.1 32 192.0.2.2 192.0.2.3 \
"arping -b -c 1 -s 192.0.2.1 -I vx0 192.0.2.3"
}

proxy_ipv6()
{
proxy_common "IPv6" 2001:db8:1::1 128 2001:db8:1::2 2001:db8:1::3 \
"ndisc6 -r 1 -s 2001:db8:1::1 -w 1 2001:db8:1::3 vx0"
}

################################################################################
# Usage

usage()
{
cat <<EOF
usage: ${0##*/} OPTS
-t <test> Test(s) to run (default: all)
(options: $TESTS)
-p Pause on fail
-v Verbose mode (show commands and output)
EOF
}

################################################################################
# Main

while getopts ":t:pvh" opt; do
case $opt in
t) TESTS=$OPTARG;;
p) PAUSE_ON_FAIL=yes;;
v) VERBOSE=$((VERBOSE + 1));;
h) usage; exit 0;;
*) usage; exit 1;;
esac
done

require_command mausezahn
require_command arping
require_command ndisc6
require_command jq

if ! ip nexthop help 2>&1 | grep -q "stats"; then
echo "SKIP: iproute2 ip too old, missing nexthop stats support"
exit "$ksft_skip"
fi

trap exit_cleanup_all EXIT

for t in $TESTS
do
setup_ns ns1; $t; cleanup_all_ns;
done

0 comments on commit 41ec374

Please sign in to comment.