From 0c0a5ef809f9150e9229e7b13e43183b681b7a39 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 25 Oct 2021 09:48:16 -0700
Subject: [PATCH 01/10] tcp: move inet->rx_dst_ifindex to sk->sk_rx_dst_ifindex

Increase cache locality by moving rx_dst_ifindex next to sk->sk_rx_dst.

This is part of an effort to reduce cache line misses in the TCP fast path.

This removes one cache line miss in early demux.

Signed-off-by: Eric Dumazet
Acked-by: Soheil Hassas Yeganeh
Signed-off-by: Jakub Kicinski
---
 include/net/inet_sock.h | 3 +--
 include/net/sock.h      | 3 +++
 net/ipv4/tcp_ipv4.c     | 6 +++---
 net/ipv6/tcp_ipv6.c     | 6 +++---
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 89163ef8cf4be..9e1111f5915bd 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -207,11 +207,10 @@ struct inet_sock {
 	__be32			inet_saddr;
 	__s16			uc_ttl;
 	__u16			cmsg_flags;
+	struct ip_options_rcu __rcu	*inet_opt;
 	__be16			inet_sport;
 	__u16			inet_id;
-	struct ip_options_rcu __rcu	*inet_opt;
-	int			rx_dst_ifindex;
 	__u8			tos;
 	__u8			min_ttl;
 	__u8			mc_ttl;
diff --git a/include/net/sock.h b/include/net/sock.h
index 596ba85611bc7..0bfb3f138bdab 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -259,6 +259,7 @@ struct bpf_local_storage;
  *	@sk_rcvbuf: size of receive buffer in bytes
  *	@sk_wq: sock wait queue and async head
  *	@sk_rx_dst: receive input route used by early demux
+ *	@sk_rx_dst_ifindex: ifindex for @sk_rx_dst
  *	@sk_dst_cache: destination cache
  *	@sk_dst_pending_confirm: need to confirm neighbour
  *	@sk_policy: flow policy
@@ -430,6 +431,8 @@ struct sock {
 	struct xfrm_policy __rcu *sk_policy[2];
 #endif
 	struct dst_entry	*sk_rx_dst;
+	int			sk_rx_dst_ifindex;
+
 	struct dst_entry __rcu	*sk_dst_cache;
 	atomic_t		sk_omem_alloc;
 	int			sk_sndbuf;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5e6d8cb82ce5d..2bdc32c1afb65 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1703,7 +1703,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 		sock_rps_save_rxhash(sk, skb);
 		sk_mark_napi_id(sk, skb);
 		if (dst) {
-			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
+			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
 					     dst, 0)) {
 				dst_release(dst);
@@ -1788,7 +1788,7 @@ int tcp_v4_early_demux(struct sk_buff *skb)
 			if (dst)
 				dst = dst_check(dst, 0);
 			if (dst &&
-			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
+			    sk->sk_rx_dst_ifindex == skb->skb_iif)
 				skb_dst_set_noref(skb, dst);
 		}
 	}
@@ -2195,7 +2195,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
 
 	if (dst && dst_hold_safe(dst)) {
 		sk->sk_rx_dst = dst;
-		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
+		sk->sk_rx_dst_ifindex = skb->skb_iif;
 	}
 }
 EXPORT_SYMBOL(inet_sk_rx_dst_set);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 1ef27cd7dbff7..3e8669b6d636c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -108,7 +108,7 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
 		const struct rt6_info *rt = (const struct rt6_info *)dst;
 
 		sk->sk_rx_dst = dst;
-		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
+		sk->sk_rx_dst_ifindex = skb->skb_iif;
 		tcp_inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
 	}
 }
@@ -1509,7 +1509,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 		sock_rps_save_rxhash(sk, skb);
 		sk_mark_napi_id(sk, skb);
 		if (dst) {
-			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
+			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
 			    INDIRECT_CALL_1(dst->ops->check, ip6_dst_check,
 					    dst, np->rx_dst_cookie) == NULL) {
 				dst_release(dst);
@@ -1874,7 +1874,7 @@ INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb)
 			if (dst)
 				dst = dst_check(dst, tcp_inet6_sk(sk)->rx_dst_cookie);
 			if (dst &&
-			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
+			    sk->sk_rx_dst_ifindex == skb->skb_iif)
 				skb_dst_set_noref(skb, dst);
 		}
 	}
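
The win in this first patch is purely a data-layout one: tcp_v4_do_rcv() and the early demux helpers read sk->sk_rx_dst and the ifindex back to back, so co-locating the two fields means one cache line fill instead of two. A minimal user-space sketch of the same idea follows; the struct and field names are toy analogues, not the kernel's.

#include <assert.h>
#include <stdalign.h>
#include <stddef.h>
#include <stdio.h>

/* Toy analogue of struct sock: keep fields that the hot path
 * reads together inside the same 64-byte cache line.
 */
struct toy_sock {
	alignas(64) void *rx_dst;	/* cached input route */
	int rx_dst_ifindex;		/* ifindex the route was learned on */
	/* ... colder fields follow ... */
};

int main(void)
{
	/* Both fields fall inside the first 64-byte line, so the
	 * "does the cached route still match this device?" check
	 * costs a single cache line fill.
	 */
	static_assert(offsetof(struct toy_sock, rx_dst_ifindex) +
		      sizeof(int) <= 64, "fields must share a cache line");
	printf("rx_dst at %zu, rx_dst_ifindex at %zu\n",
	       offsetof(struct toy_sock, rx_dst),
	       offsetof(struct toy_sock, rx_dst_ifindex));
	return 0;
}
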
From ef57c1610dd8fba5031bf71e0db73356190de151 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 25 Oct 2021 09:48:17 -0700
Subject: [PATCH 02/10] ipv6: move inet6_sk(sk)->rx_dst_cookie to sk->sk_rx_dst_cookie

Increase cache locality by moving rx_dst_cookie next to sk->sk_rx_dst.

This removes one or two cache line misses in IPv6 early demux (TCP/UDP).

Signed-off-by: Eric Dumazet
Acked-by: Soheil Hassas Yeganeh
Signed-off-by: Jakub Kicinski
---
 include/linux/ipv6.h | 1 -
 include/net/sock.h   | 2 ++
 net/ipv6/tcp_ipv6.c  | 6 +++---
 net/ipv6/udp.c       | 4 ++--
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index ef4a69865737c..c383630d3f065 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -282,7 +282,6 @@ struct ipv6_pinfo {
 	__be32			rcv_flowinfo;
 
 	__u32			dst_cookie;
-	__u32			rx_dst_cookie;
 
 	struct ipv6_mc_socklist	__rcu *ipv6_mc_list;
 	struct ipv6_ac_socklist	*ipv6_ac_list;
diff --git a/include/net/sock.h b/include/net/sock.h
index 0bfb3f138bdab..99c4194cb61ad 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -260,6 +260,7 @@ struct bpf_local_storage;
  *	@sk_wq: sock wait queue and async head
  *	@sk_rx_dst: receive input route used by early demux
  *	@sk_rx_dst_ifindex: ifindex for @sk_rx_dst
+ *	@sk_rx_dst_cookie: cookie for @sk_rx_dst
  *	@sk_dst_cache: destination cache
  *	@sk_dst_pending_confirm: need to confirm neighbour
  *	@sk_policy: flow policy
@@ -432,6 +433,7 @@ struct sock {
 #endif
 	struct dst_entry	*sk_rx_dst;
 	int			sk_rx_dst_ifindex;
+	u32			sk_rx_dst_cookie;
 
 	struct dst_entry __rcu	*sk_dst_cache;
 	atomic_t		sk_omem_alloc;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 3e8669b6d636c..50d9578e945bd 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -109,7 +109,7 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
 
 		sk->sk_rx_dst = dst;
 		sk->sk_rx_dst_ifindex = skb->skb_iif;
-		tcp_inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
+		sk->sk_rx_dst_cookie = rt6_get_cookie(rt);
 	}
 }
 
@@ -1511,7 +1511,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 		if (dst) {
 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
 			    INDIRECT_CALL_1(dst->ops->check, ip6_dst_check,
-					    dst, np->rx_dst_cookie) == NULL) {
+					    dst, sk->sk_rx_dst_cookie) == NULL) {
 				dst_release(dst);
 				sk->sk_rx_dst = NULL;
 			}
@@ -1872,7 +1872,7 @@ INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb)
 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
 
 			if (dst)
-				dst = dst_check(dst, tcp_inet6_sk(sk)->rx_dst_cookie);
+				dst = dst_check(dst, sk->sk_rx_dst_cookie);
 			if (dst &&
 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
 				skb_dst_set_noref(skb, dst);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 8d785232b4796..14a94cddcf0bc 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -884,7 +884,7 @@ static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
 	if (udp_sk_rx_dst_set(sk, dst)) {
 		const struct rt6_info *rt = (const struct rt6_info *)dst;
 
-		inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
+		sk->sk_rx_dst_cookie = rt6_get_cookie(rt);
 	}
 }
 
@@ -1073,7 +1073,7 @@ INDIRECT_CALLABLE_SCOPE void udp_v6_early_demux(struct sk_buff *skb)
 
 		dst = READ_ONCE(sk->sk_rx_dst);
 		if (dst)
-			dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie);
+			dst = dst_check(dst, sk->sk_rx_dst_cookie);
 		if (dst) {
 			/* set noref for now.
 			 * any place which wants to hold dst has to call
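
The cookie that patch 02 moves is the other half of the early-demux cache: dst_check() compares the cookie recorded when the route was cached (via rt6_get_cookie()) against current routing state, so a stale entry is detected without a full route lookup. Below is a generic, hedged sketch of that cache-plus-generation-number shape; all names are invented for illustration and this is not the kernel API.

#include <stdatomic.h>
#include <stddef.h>

/* Generic shape of the dst + cookie pattern: cache a pointer together
 * with the generation number observed when the entry was created, and
 * revalidate before each use.
 */
struct route_cache {
	void *dst;			/* cached result */
	unsigned int cookie;		/* generation seen at cache time */
};

static _Atomic unsigned int route_generation;	/* bumped on routing change */

static void *cache_lookup(struct route_cache *c)
{
	if (c->dst &&
	    c->cookie == atomic_load(&route_generation))
		return c->dst;		/* still valid */
	return NULL;			/* stale: fall back to a full lookup */
}
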
From 2b13af8ade38ef3afe48d60c518c64010cfa8b0d Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 25 Oct 2021 09:48:18 -0700
Subject: [PATCH 03/10] net: avoid dirtying sk->sk_napi_id

sk_napi_id is located in a cache line that can be kept read mostly.

Signed-off-by: Eric Dumazet
Acked-by: Soheil Hassas Yeganeh
Signed-off-by: Jakub Kicinski
---
 include/net/busy_poll.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index 40296ed976a97..4202c609bb0b0 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -130,7 +130,8 @@ static inline void skb_mark_napi_id(struct sk_buff *skb,
 static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
 {
 #ifdef CONFIG_NET_RX_BUSY_POLL
-	WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
+	if (unlikely(READ_ONCE(sk->sk_napi_id) != skb->napi_id))
+		WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
 #endif
 	sk_rx_queue_set(sk, skb);
 }

From 342159ee394d103db4bea1b01f932c50e66be163 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 25 Oct 2021 09:48:19 -0700
Subject: [PATCH 04/10] net: avoid dirtying sk->sk_rx_queue_mapping

sk_rx_queue_mapping is located in a cache line that should be kept read mostly.

Signed-off-by: Eric Dumazet
Acked-by: Soheil Hassas Yeganeh
Signed-off-by: Jakub Kicinski
---
 include/net/sock.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 99c4194cb61ad..b4d3744b188ad 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1916,10 +1916,8 @@ static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
 	if (skb_rx_queue_recorded(skb)) {
 		u16 rx_queue = skb_get_rx_queue(skb);
 
-		if (WARN_ON_ONCE(rx_queue == NO_QUEUE_MAPPING))
-			return;
-
-		sk->sk_rx_queue_mapping = rx_queue;
+		if (unlikely(READ_ONCE(sk->sk_rx_queue_mapping) != rx_queue))
+			WRITE_ONCE(sk->sk_rx_queue_mapping, rx_queue);
 	}
 #endif
 }
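
Patches 03 and 04 share one idiom: skip the store when the value is already right, so a cache line that rarely changes stays in the shared state across CPUs instead of bouncing between them on every packet. Here is the idiom in isolation, as a plain-C sketch (not kernel code, and ignoring the READ_ONCE()/WRITE_ONCE() annotations):

/* The "compare before store" idiom the two patches use. An
 * unconditional store of an unchanged value still dirties the cache
 * line and forces other CPUs to re-fetch it; checking first keeps the
 * line shared in the common case where nothing changed.
 */
static inline void set_if_changed(unsigned int *field, unsigned int val)
{
	/* Outside the kernel, READ_ONCE/WRITE_ONCE would map to
	 * volatile accesses or C11 relaxed atomics; plain accesses
	 * are shown here for brevity.
	 */
	if (*field != val)
		*field = val;
}
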
From 09b898466792b0c387040e6df90a16e7a9f2a5d9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 25 Oct 2021 09:48:20 -0700
Subject: [PATCH 05/10] net: annotate accesses to sk->sk_rx_queue_mapping

sk->sk_rx_queue_mapping can be modified locklessly; add a couple of
READ_ONCE()/WRITE_ONCE() to document this fact.

Signed-off-by: Eric Dumazet
Signed-off-by: Jakub Kicinski
---
 include/net/sock.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index b4d3744b188ad..b76be30674efc 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1925,15 +1925,19 @@ static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
 static inline void sk_rx_queue_clear(struct sock *sk)
 {
 #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
-	sk->sk_rx_queue_mapping = NO_QUEUE_MAPPING;
+	WRITE_ONCE(sk->sk_rx_queue_mapping, NO_QUEUE_MAPPING);
 #endif
 }
 
 static inline int sk_rx_queue_get(const struct sock *sk)
 {
 #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
-	if (sk && sk->sk_rx_queue_mapping != NO_QUEUE_MAPPING)
-		return sk->sk_rx_queue_mapping;
+	if (sk) {
+		int res = READ_ONCE(sk->sk_rx_queue_mapping);
+
+		if (res != NO_QUEUE_MAPPING)
+			return res;
+	}
 #endif
 
 	return -1;

From cc17c3c8e8b5beb4072c0e8e53aeb77bcf4517c2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 25 Oct 2021 09:48:21 -0700
Subject: [PATCH 06/10] ipv6: annotate data races around np->min_hopcount

No report from KCSAN yet, but worth documenting the races.

Signed-off-by: Eric Dumazet
Acked-by: Soheil Hassas Yeganeh
Signed-off-by: Jakub Kicinski
---
 net/ipv6/ipv6_sockglue.c | 5 ++++-
 net/ipv6/tcp_ipv6.c      | 6 ++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index e4bdb09c55867..9c3d28764b5c3 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -950,7 +950,10 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			goto e_inval;
 		if (val < 0 || val > 255)
 			goto e_inval;
-		np->min_hopcount = val;
+		/* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount
+		 * while we are changing it.
+		 */
+		WRITE_ONCE(np->min_hopcount, val);
 		retv = 0;
 		break;
 	case IPV6_DONTFRAG:
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 50d9578e945bd..c93b2d48bb89a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -414,7 +414,8 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (sk->sk_state == TCP_CLOSE)
 		goto out;
 
-	if (ipv6_hdr(skb)->hop_limit < tcp_inet6_sk(sk)->min_hopcount) {
+	/* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
+	if (ipv6_hdr(skb)->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) {
 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 		goto out;
 	}
@@ -1726,7 +1727,8 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
 			return 0;
 		}
 	}
-	if (hdr->hop_limit < tcp_inet6_sk(sk)->min_hopcount) {
+	/* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
+	if (hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) {
 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 		goto discard_and_relse;
 	}
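
READ_ONCE()/WRITE_ONCE() in patches 05 and 06 add no ordering or locking; they tell the compiler that the access must be performed exactly once, without tearing or refetching, and tell KCSAN that the race is intentional. The closest portable analogue is a C11 relaxed atomic, as in this hedged user-space sketch (demo names, not the kernel's):

#include <stdatomic.h>

/* User-space analogue of the READ_ONCE()/WRITE_ONCE() pairing:
 * relaxed atomics guarantee the load and store happen once, without
 * tearing, while adding no ordering or locking cost.
 */
static _Atomic unsigned char min_hopcount_demo;

void setter(unsigned char val)			/* setsockopt() side */
{
	atomic_store_explicit(&min_hopcount_demo, val,
			      memory_order_relaxed);
}

int check_packet(unsigned char hop_limit)	/* fast path side */
{
	unsigned char min = atomic_load_explicit(&min_hopcount_demo,
						 memory_order_relaxed);
	return hop_limit >= min;		/* drop when below the floor */
}
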
From 790eb67374d43f66102a80821d22188b6a0bc3bb Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 25 Oct 2021 09:48:22 -0700
Subject: [PATCH 07/10] ipv6: guard IPV6_MINHOPCOUNT with a static key

RFC 5082 IPV6_MINHOPCOUNT is rarely used on hosts.

Add a static key to remove useless code from the TCP fast path, along
with a potential cache line miss to fetch tcp_inet6_sk(sk)->min_hopcount.

Note that once the ip6_min_hopcount static key has been enabled, it
stays enabled until next boot.

Signed-off-by: Eric Dumazet
Acked-by: Soheil Hassas Yeganeh
Signed-off-by: Jakub Kicinski
---
 include/net/ipv6.h       |  1 +
 net/ipv6/ipv6_sockglue.c |  6 ++++++
 net/ipv6/tcp_ipv6.c      | 21 +++++++++++++--------
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index f2d0ecc257bb2..c19bf51ded1d0 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1092,6 +1092,7 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
 /*
  *	socket options (ipv6_sockglue.c)
  */
+DECLARE_STATIC_KEY_FALSE(ip6_min_hopcount);
 
 int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
 		    unsigned int optlen);
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 9c3d28764b5c3..41efca817db42 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -55,6 +55,8 @@
 struct ip6_ra_chain *ip6_ra_chain;
 DEFINE_RWLOCK(ip6_ra_lock);
 
+DEFINE_STATIC_KEY_FALSE(ip6_min_hopcount);
+
 int ip6_ra_control(struct sock *sk, int sel)
 {
 	struct ip6_ra_chain *ra, *new_ra, **rap;
@@ -950,6 +952,10 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			goto e_inval;
 		if (val < 0 || val > 255)
 			goto e_inval;
+
+		if (val)
+			static_branch_enable(&ip6_min_hopcount);
+
 		/* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount
 		 * while we are changing it.
 		 */
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index c93b2d48bb89a..6945583c0fa48 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -414,10 +414,12 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (sk->sk_state == TCP_CLOSE)
 		goto out;
 
-	/* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
-	if (ipv6_hdr(skb)->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) {
-		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
-		goto out;
+	if (static_branch_unlikely(&ip6_min_hopcount)) {
+		/* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
+		if (ipv6_hdr(skb)->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) {
+			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+			goto out;
+		}
 	}
 
 	tp = tcp_sk(sk);
@@ -1727,10 +1729,13 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
 			return 0;
 		}
 	}
-	/* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
-	if (hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) {
-		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
-		goto discard_and_relse;
+
+	if (static_branch_unlikely(&ip6_min_hopcount)) {
+		/* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
+		if (hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) {
+			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+			goto discard_and_relse;
+		}
 	}
 
 	if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
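
A static key turns the option check into code that is patched in and out at runtime, so hosts that never set IPV6_MINHOPCOUNT pay nothing, not even a load and compare. Outside the kernel, the closest cheap approximation is a read-mostly flag with a branch hint; the sketch below shows that weaker analogue (real static keys rewrite the branch instruction itself, which this cannot do):

#include <stdbool.h>

/* User-space approximation only: a read-mostly flag plus a branch
 * hint. Kernel static keys go further and patch the branch site at
 * runtime, so the disabled case costs no compare at all.
 */
static bool min_ttl_enabled;	/* flipped once, never cleared */

static inline bool packet_ok(unsigned char ttl, unsigned char min_ttl)
{
	if (__builtin_expect(min_ttl_enabled, 0))	/* off on most hosts */
		return ttl >= min_ttl;
	return true;					/* fast path: no check */
}
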
From 14834c4f4eb3c8c0af40f6203dbf09d232044d9d Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 25 Oct 2021 09:48:23 -0700
Subject: [PATCH 08/10] ipv4: annotate data races around inet->min_ttl

No report from KCSAN yet, but worth documenting the races.

Signed-off-by: Eric Dumazet
Acked-by: Soheil Hassas Yeganeh
Signed-off-by: Jakub Kicinski
---
 net/ipv4/ip_sockglue.c | 5 ++++-
 net/ipv4/tcp_ipv4.c    | 7 +++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index b297bb28556ec..d5487c8580674 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1352,7 +1352,10 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname,
 			goto e_inval;
 		if (val < 0 || val > 255)
 			goto e_inval;
-		inet->min_ttl = val;
+		/* tcp_v4_err() and tcp_v4_rcv() might read min_ttl
+		 * while we are changing it.
+		 */
+		WRITE_ONCE(inet->min_ttl, val);
 		break;
 
 	default:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2bdc32c1afb65..a9cbc8e6b7962 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -508,7 +508,8 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
 	if (sk->sk_state == TCP_CLOSE)
 		goto out;
 
-	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
+	/* min_ttl can be changed concurrently from do_ip_setsockopt() */
+	if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 		goto out;
 	}
@@ -2068,7 +2069,9 @@ int tcp_v4_rcv(struct sk_buff *skb)
 			return 0;
 		}
 	}
-	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
+
+	/* min_ttl can be changed concurrently from do_ip_setsockopt() */
+	if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 		goto discard_and_relse;
 	}
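
The comparison being annotated encodes RFC 5082's trust model: a packet starts with a TTL (or hop limit) of at most 255 and loses one per router, so a peer N hops away can never deliver a value above 255 - N, and a floor of 255 accepts only directly connected neighbours. A toy illustration of that arithmetic (hypothetical values):

#include <stdio.h>

/* RFC 5082 arithmetic: a packet starts with TTL <= 255 and loses one
 * per hop, so the receiver sees ttl = initial - hops. An off-link
 * attacker (hops >= 1) can never deliver ttl == 255.
 */
int main(void)
{
	unsigned char initial = 255, min_ttl = 255;

	for (unsigned int hops = 0; hops <= 3; hops++) {
		unsigned char seen = initial - hops;

		printf("%u hop(s): ttl %u -> %s\n", hops, seen,
		       seen < min_ttl ? "drop (TCPMINTTLDROP)" : "accept");
	}
	return 0;
}
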
From 020e71a3cf7f50c0f2c54cf2444067b76fe6d785 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 25 Oct 2021 09:48:24 -0700
Subject: [PATCH 09/10] ipv4: guard IP_MINTTL with a static key

RFC 5082 IP_MINTTL option is rarely used on hosts.

Add a static key to remove useless code from the TCP fast path, along
with a potential cache line miss to fetch inet_sk(sk)->min_ttl.

Note that once the ip4_min_ttl static key has been enabled, it stays
enabled until next boot.

Signed-off-by: Eric Dumazet
Acked-by: Soheil Hassas Yeganeh
Signed-off-by: Jakub Kicinski
---
 include/net/ip.h       |  2 ++
 net/ipv4/ip_sockglue.c |  6 ++++++
 net/ipv4/tcp_ipv4.c    | 20 ++++++++++++--------
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index cf229a5311942..b71e88507c4a0 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -24,6 +24,7 @@
 #include <linux/skbuff.h>
 #include <linux/jhash.h>
 #include <linux/sockptr.h>
+#include <linux/static_key.h>
 
 #include <net/inet_sock.h>
 #include <net/route.h>
@@ -750,6 +751,7 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
 			 struct sk_buff *skb, int tlen, int offset);
 int ip_cmsg_send(struct sock *sk, struct msghdr *msg,
 		 struct ipcm_cookie *ipc, bool allow_ipv6);
+DECLARE_STATIC_KEY_FALSE(ip4_min_ttl);
 int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
 		  unsigned int optlen);
 int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index d5487c8580674..38d29b175ca66 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -886,6 +886,8 @@ static int compat_ip_mcast_join_leave(struct sock *sk, int optname,
 	return ip_mc_leave_group(sk, &mreq);
 }
 
+DEFINE_STATIC_KEY_FALSE(ip4_min_ttl);
+
 static int do_ip_setsockopt(struct sock *sk, int level, int optname,
 		sockptr_t optval, unsigned int optlen)
 {
@@ -1352,6 +1354,10 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname,
 			goto e_inval;
 		if (val < 0 || val > 255)
 			goto e_inval;
+
+		if (val)
+			static_branch_enable(&ip4_min_ttl);
+
 		/* tcp_v4_err() and tcp_v4_rcv() might read min_ttl
 		 * while we are changing it.
 		 */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a9cbc8e6b7962..13d868c432845 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -508,10 +508,12 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
 	if (sk->sk_state == TCP_CLOSE)
 		goto out;
 
-	/* min_ttl can be changed concurrently from do_ip_setsockopt() */
-	if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
-		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
-		goto out;
+	if (static_branch_unlikely(&ip4_min_ttl)) {
+		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
+		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
+			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+			goto out;
+		}
 	}
 
 	tp = tcp_sk(sk);
@@ -2070,10 +2072,12 @@ int tcp_v4_rcv(struct sk_buff *skb)
 		}
 	}
 
-	/* min_ttl can be changed concurrently from do_ip_setsockopt() */
-	if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
-		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
-		goto discard_and_relse;
+	if (static_branch_unlikely(&ip4_min_ttl)) {
+		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
+		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
+			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+			goto discard_and_relse;
+		}
 	}
 
 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
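
For reference, this is the user-space side that flips the new static keys: the RFC 5082 socket options themselves are long-standing API. A small hedged example for the IPv4 option follows (the IPv6 equivalent is IPV6_MINHOPCOUNT at the IPPROTO_IPV6 level); the function name and TTL value are illustrative.

#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

/* Example: a BGP-style daemon protecting a session with GTSM
 * (RFC 5082). Setting a non-zero value is also what enables the
 * ip4_min_ttl / ip6_min_hopcount static keys introduced above.
 */
int enable_gtsm(int fd)
{
	int minttl = 255;	/* accept only directly connected peers */

	if (setsockopt(fd, IPPROTO_IP, IP_MINTTL,
		       &minttl, sizeof(minttl)) < 0) {
		perror("setsockopt(IP_MINTTL)");
		return -1;
	}
	return 0;
}
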
From 12c8691de30734a29b84cae35a4bf1cccca6ad0a Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 25 Oct 2021 09:48:25 -0700
Subject: [PATCH 10/10] ipv6/tcp: small drop monitor changes

Two kfree_skb() calls must be replaced by consume_skb() for skbs that
are not technically dropped.

Signed-off-by: Eric Dumazet
Acked-by: Soheil Hassas Yeganeh
Signed-off-by: Jakub Kicinski
---
 net/ipv6/tcp_ipv6.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6945583c0fa48..c678e778c1fb8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -572,7 +572,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
 static void tcp_v6_reqsk_destructor(struct request_sock *req)
 {
 	kfree(inet_rsk(req)->ipv6_opt);
-	kfree_skb(inet_rsk(req)->pktopts);
+	consume_skb(inet_rsk(req)->pktopts);
 }
 
 #ifdef CONFIG_TCP_MD5SIG
@@ -1594,7 +1594,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 		}
 	}
 
-	kfree_skb(opt_skb);
+	consume_skb(opt_skb);
 	return 0;
 }
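
The distinction patch 10 relies on: both functions free the skb identically, but kfree_skb() fires the tracepoint that drop-monitoring tools such as dropwatch and perf watch, while consume_skb() signals a normal end of life. The two converted call sites free skbs that were successfully handled, so they had been showing up as false drops. The rule of thumb, as a kernel-style sketch (free_skb_hinted() is a made-up helper for illustration, not an API in the tree):

/* Both paths free the skb; the difference is only which tracepoint
 * fires, and therefore what drop-monitoring tools report.
 */
static void free_skb_hinted(struct sk_buff *skb, bool was_delivered)
{
	if (was_delivered)
		consume_skb(skb);	/* normal lifecycle: no drop event */
	else
		kfree_skb(skb);		/* counted by drop monitor */
}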