Skip to content

Commit

Permalink
Merge branch 'dst-hint-multipath'
Browse files Browse the repository at this point in the history
Sriram Yagnaraman says:

====================
Avoid TCP resets when using ECMP for load-balancing between multiple servers.

All packets in the same flow (L3/L4 depending on multipath hash policy)
should be directed to the same target, but after [0]/[1] we see stray
packets directed towards other targets. This, for instance, causes RST
to be sent on TCP connections.

The first two patches solve the problem by ignoring route hints for
destinations that are part of multipath group, by using new SKB flags
for IPv4 and IPv6. The third patch is a selftest that tests the
scenario.

Thanks to Ido, for reviewing and suggesting a way forward in [2] and
also suggesting how to write a selftest for this.

v4->v5:
- Fixed review comments from Ido
v3->v4:
- Remove single path test
- Rebase to latest
v2->v3:
- Add NULL check for skb in fib6_select_path (Ido Schimmel)
- Use fib_tests.sh for selftest instead of the forwarding suite (Ido
  Schimmel)
v1->v2:
- Update to commit messages describing the solution (Ido Schimmel)
- Use perf stat to count fib table lookups in selftest (Ido Schimmel)
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Sep 1, 2023
2 parents 2ea3528 + 8ae9efb commit d8a3070
Show file tree
Hide file tree
Showing 7 changed files with 164 additions and 3 deletions.
1 change: 1 addition & 0 deletions include/linux/ipv6.h
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ struct inet6_skb_parm {
#define IP6SKB_JUMBOGRAM 128
#define IP6SKB_SEG6 256
#define IP6SKB_FAKEJUMBO 512
#define IP6SKB_MULTIPATH 1024
};

#if defined(CONFIG_NET_L3_MASTER_DEV)
Expand Down
1 change: 1 addition & 0 deletions include/net/ip.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ struct inet_skb_parm {
#define IPSKB_FRAG_PMTU BIT(6)
#define IPSKB_L3SLAVE BIT(7)
#define IPSKB_NOPOLICY BIT(8)
#define IPSKB_MULTIPATH BIT(9)

u16 frag_max_size;
};
Expand Down
3 changes: 2 additions & 1 deletion net/ipv4/ip_input.c
Original file line number Diff line number Diff line change
Expand Up @@ -584,7 +584,8 @@ static void ip_sublist_rcv_finish(struct list_head *head)
static struct sk_buff *ip_extract_route_hint(const struct net *net,
struct sk_buff *skb, int rt_type)
{
if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST)
if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST ||
IPCB(skb)->flags & IPSKB_MULTIPATH)
return NULL;

return skb;
Expand Down
1 change: 1 addition & 0 deletions net/ipv4/route.c
Original file line number Diff line number Diff line change
Expand Up @@ -2144,6 +2144,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);

fib_select_multipath(res, h);
IPCB(skb)->flags |= IPSKB_MULTIPATH;
}
#endif

Expand Down
3 changes: 2 additions & 1 deletion net/ipv6/ip6_input.c
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ static bool ip6_can_use_hint(const struct sk_buff *skb,
static struct sk_buff *ip6_extract_route_hint(const struct net *net,
struct sk_buff *skb)
{
if (fib6_routes_require_src(net) || fib6_has_custom_rules(net))
if (fib6_routes_require_src(net) || fib6_has_custom_rules(net) ||
IP6CB(skb)->flags & IP6SKB_MULTIPATH)
return NULL;

return skb;
Expand Down
3 changes: 3 additions & 0 deletions net/ipv6/route.c
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,9 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
if (match->nh && have_oif_match && res->nh)
return;

if (skb)
IP6CB(skb)->flags |= IP6SKB_MULTIPATH;

/* We might have already computed the hash for ICMPv6 errors. In such
* case it will always be non-zero. Otherwise now is the time to do it.
*/
Expand Down
155 changes: 154 additions & 1 deletion tools/testing/selftests/net/fib_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ ksft_skip=4
TESTS="unregister down carrier nexthop suppress ipv6_notify ipv4_notify \
ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics \
ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr \
ipv6_del_addr ipv4_mangle ipv6_mangle ipv4_bcast_neigh fib6_gc_test"
ipv6_del_addr ipv4_mangle ipv6_mangle ipv4_bcast_neigh fib6_gc_test \
ipv4_mpath_list ipv6_mpath_list"

VERBOSE=0
PAUSE_ON_FAIL=no
Expand Down Expand Up @@ -2352,6 +2353,156 @@ ipv4_bcast_neigh_test()
cleanup
}

mpath_dep_check()
{
if [ ! -x "$(command -v mausezahn)" ]; then
echo "mausezahn command not found. Skipping test"
return 1
fi

if [ ! -x "$(command -v jq)" ]; then
echo "jq command not found. Skipping test"
return 1
fi

if [ ! -x "$(command -v bc)" ]; then
echo "bc command not found. Skipping test"
return 1
fi

if [ ! -x "$(command -v perf)" ]; then
echo "perf command not found. Skipping test"
return 1
fi

perf list fib:* | grep -q fib_table_lookup
if [ $? -ne 0 ]; then
echo "IPv4 FIB tracepoint not found. Skipping test"
return 1
fi

perf list fib6:* | grep -q fib6_table_lookup
if [ $? -ne 0 ]; then
echo "IPv6 FIB tracepoint not found. Skipping test"
return 1
fi

return 0
}

link_stats_get()
{
local ns=$1; shift
local dev=$1; shift
local dir=$1; shift
local stat=$1; shift

ip -n $ns -j -s link show dev $dev \
| jq '.[]["stats64"]["'$dir'"]["'$stat'"]'
}

list_rcv_eval()
{
local file=$1; shift
local expected=$1; shift

local count=$(tail -n 1 $file | jq '.["counter-value"] | tonumber | floor')
local ratio=$(echo "scale=2; $count / $expected" | bc -l)
local res=$(echo "$ratio >= 0.95" | bc)
[[ $res -eq 1 ]]
log_test $? 0 "Multipath route hit ratio ($ratio)"
}

ipv4_mpath_list_test()
{
echo
echo "IPv4 multipath list receive tests"

mpath_dep_check || return 1

route_setup

set -e
run_cmd "ip netns exec ns1 ethtool -K veth1 tcp-segmentation-offload off"

run_cmd "ip netns exec ns2 bash -c \"echo 20000 > /sys/class/net/veth2/gro_flush_timeout\""
run_cmd "ip netns exec ns2 bash -c \"echo 1 > /sys/class/net/veth2/napi_defer_hard_irqs\""
run_cmd "ip netns exec ns2 ethtool -K veth2 generic-receive-offload on"
run_cmd "ip -n ns2 link add name nh1 up type dummy"
run_cmd "ip -n ns2 link add name nh2 up type dummy"
run_cmd "ip -n ns2 address add 172.16.201.1/24 dev nh1"
run_cmd "ip -n ns2 address add 172.16.202.1/24 dev nh2"
run_cmd "ip -n ns2 neigh add 172.16.201.2 lladdr 00:11:22:33:44:55 nud perm dev nh1"
run_cmd "ip -n ns2 neigh add 172.16.202.2 lladdr 00:aa:bb:cc:dd:ee nud perm dev nh2"
run_cmd "ip -n ns2 route add 203.0.113.0/24
nexthop via 172.16.201.2 nexthop via 172.16.202.2"
run_cmd "ip netns exec ns2 sysctl -qw net.ipv4.fib_multipath_hash_policy=1"
set +e

local dmac=$(ip -n ns2 -j link show dev veth2 | jq -r '.[]["address"]')
local tmp_file=$(mktemp)
local cmd="ip netns exec ns1 mausezahn veth1 -a own -b $dmac
-A 172.16.101.1 -B 203.0.113.1 -t udp 'sp=12345,dp=0-65535' -q"

# Packets forwarded in a list using a multipath route must not reuse a
# cached result so that a flow always hits the same nexthop. In other
# words, the FIB lookup tracepoint needs to be triggered for every
# packet.
local t0_rx_pkts=$(link_stats_get ns2 veth2 rx packets)
run_cmd "perf stat -e fib:fib_table_lookup --filter 'err == 0' -j -o $tmp_file -- $cmd"
local t1_rx_pkts=$(link_stats_get ns2 veth2 rx packets)
local diff=$(echo $t1_rx_pkts - $t0_rx_pkts | bc -l)
list_rcv_eval $tmp_file $diff

rm $tmp_file
route_cleanup
}

ipv6_mpath_list_test()
{
echo
echo "IPv6 multipath list receive tests"

mpath_dep_check || return 1

route_setup

set -e
run_cmd "ip netns exec ns1 ethtool -K veth1 tcp-segmentation-offload off"

run_cmd "ip netns exec ns2 bash -c \"echo 20000 > /sys/class/net/veth2/gro_flush_timeout\""
run_cmd "ip netns exec ns2 bash -c \"echo 1 > /sys/class/net/veth2/napi_defer_hard_irqs\""
run_cmd "ip netns exec ns2 ethtool -K veth2 generic-receive-offload on"
run_cmd "ip -n ns2 link add name nh1 up type dummy"
run_cmd "ip -n ns2 link add name nh2 up type dummy"
run_cmd "ip -n ns2 -6 address add 2001:db8:201::1/64 dev nh1"
run_cmd "ip -n ns2 -6 address add 2001:db8:202::1/64 dev nh2"
run_cmd "ip -n ns2 -6 neigh add 2001:db8:201::2 lladdr 00:11:22:33:44:55 nud perm dev nh1"
run_cmd "ip -n ns2 -6 neigh add 2001:db8:202::2 lladdr 00:aa:bb:cc:dd:ee nud perm dev nh2"
run_cmd "ip -n ns2 -6 route add 2001:db8:301::/64
nexthop via 2001:db8:201::2 nexthop via 2001:db8:202::2"
run_cmd "ip netns exec ns2 sysctl -qw net.ipv6.fib_multipath_hash_policy=1"
set +e

local dmac=$(ip -n ns2 -j link show dev veth2 | jq -r '.[]["address"]')
local tmp_file=$(mktemp)
local cmd="ip netns exec ns1 mausezahn -6 veth1 -a own -b $dmac
-A 2001:db8:101::1 -B 2001:db8:301::1 -t udp 'sp=12345,dp=0-65535' -q"

# Packets forwarded in a list using a multipath route must not reuse a
# cached result so that a flow always hits the same nexthop. In other
# words, the FIB lookup tracepoint needs to be triggered for every
# packet.
local t0_rx_pkts=$(link_stats_get ns2 veth2 rx packets)
run_cmd "perf stat -e fib6:fib6_table_lookup --filter 'err == 0' -j -o $tmp_file -- $cmd"
local t1_rx_pkts=$(link_stats_get ns2 veth2 rx packets)
local diff=$(echo $t1_rx_pkts - $t0_rx_pkts | bc -l)
list_rcv_eval $tmp_file $diff

rm $tmp_file
route_cleanup
}

################################################################################
# usage

Expand Down Expand Up @@ -2433,6 +2584,8 @@ do
ipv6_mangle) ipv6_mangle_test;;
ipv4_bcast_neigh) ipv4_bcast_neigh_test;;
fib6_gc_test|ipv6_gc) fib6_gc_test;;
ipv4_mpath_list) ipv4_mpath_list_test;;
ipv6_mpath_list) ipv6_mpath_list_test;;

help) echo "Test names: $TESTS"; exit 0;;
esac
Expand Down

0 comments on commit d8a3070

Please sign in to comment.