From b0adfba7ee770fef20b1b6d86706c28f7fccfb07 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Sep 2023 16:01:59 +0000 Subject: [PATCH 01/14] ipv6: lockless IPV6_UNICAST_HOPS implementation Some np->hop_limit accesses are racy, when socket lock is not held. Add missing annotations and switch to full lockless implementation. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/ipv6.h | 12 +----------- include/net/ipv6.h | 2 +- net/ipv6/ip6_output.c | 2 +- net/ipv6/ipv6_sockglue.c | 20 +++++++++++--------- net/ipv6/mcast.c | 2 +- net/ipv6/ndisc.c | 2 +- 6 files changed, 16 insertions(+), 24 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index af8a771a053c5..c2e0870713849 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -213,17 +213,7 @@ struct ipv6_pinfo { __be32 flow_label; __u32 frag_size; - /* - * Packed in 16bits. - * Omit one shift by putting the signed field at MSB. - */ -#if defined(__BIG_ENDIAN_BITFIELD) - __s16 hop_limit:9; - __u16 __unused_1:7; -#else - __u16 __unused_1:7; - __s16 hop_limit:9; -#endif + s16 hop_limit; #if defined(__BIG_ENDIAN_BITFIELD) /* Packed in 16bits. */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index c6932d1a3fa80..2e8e7e31e02ea 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -916,7 +916,7 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, if (ipv6_addr_is_multicast(&fl6->daddr)) hlimit = np->mcast_hops; else - hlimit = np->hop_limit; + hlimit = READ_ONCE(np->hop_limit); if (hlimit < 0) hlimit = ip6_dst_hoplimit(dst); return hlimit; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 54fc4c711f2c5..1e16d56d8c38a 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -309,7 +309,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, * Fill in the IPv6 header */ if (np) - hlimit = np->hop_limit; + hlimit = READ_ONCE(np->hop_limit); if (hlimit < 0) hlimit = ip6_dst_hoplimit(dst); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 0e2a0847b387f..f27993a1470dd 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -415,6 +415,16 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (ip6_mroute_opt(optname)) return ip6_mroute_setsockopt(sk, optname, optval, optlen); + /* Handle options that can be set without locking the socket. */ + switch (optname) { + case IPV6_UNICAST_HOPS: + if (optlen < sizeof(int)) + return -EINVAL; + if (val > 255 || val < -1) + return -EINVAL; + WRITE_ONCE(np->hop_limit, val); + return 0; + } if (needs_rtnl) rtnl_lock(); sockopt_lock_sock(sk); @@ -733,14 +743,6 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, } break; } - case IPV6_UNICAST_HOPS: - if (optlen < sizeof(int)) - goto e_inval; - if (val > 255 || val < -1) - goto e_inval; - np->hop_limit = val; - retv = 0; - break; case IPV6_MULTICAST_HOPS: if (sk->sk_type == SOCK_STREAM) @@ -1347,7 +1349,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, struct dst_entry *dst; if (optname == IPV6_UNICAST_HOPS) - val = np->hop_limit; + val = READ_ONCE(np->hop_limit); else val = np->mcast_hops; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 421264a69e972..4a79676239094 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1716,7 +1716,7 @@ static void ip6_mc_hdr(const struct sock *sk, struct sk_buff *skb, hdr->payload_len = htons(len); hdr->nexthdr = proto; - hdr->hop_limit = inet6_sk(sk)->hop_limit; + hdr->hop_limit = READ_ONCE(inet6_sk(sk)->hop_limit); hdr->saddr = *saddr; hdr->daddr = *daddr; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 553c8664e0a7a..b554fd40bdc37 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -500,7 +500,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, csum_partial(icmp6h, skb->len, 0)); - ip6_nd_hdr(skb, saddr, daddr, inet6_sk(sk)->hop_limit, skb->len); + ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len); rcu_read_lock(); idev = __in6_dev_get(dst->dev); From d986f52124e062753e33b6fe303be5904a997eac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Sep 2023 16:02:00 +0000 Subject: [PATCH 02/14] ipv6: lockless IPV6_MULTICAST_LOOP implementation Add inet6_{test|set|clear|assign}_bit() helpers. Note that I am using bits from inet->inet_flags, this might change in the future if we need more flags. While solving data-races accessing np->mc_loop, this patch also allows to implement lockless accesses to np->mcast_hops in the following patch. Also constify sk_mc_loop() argument. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/ipv6.h | 18 ++++++++++++++---- include/net/inet_sock.h | 1 + include/net/sock.h | 2 +- net/core/sock.c | 4 ++-- net/ipv6/af_inet6.c | 2 +- net/ipv6/ipv6_sockglue.c | 18 ++++++++---------- net/ipv6/ndisc.c | 2 +- net/netfilter/ipvs/ip_vs_sync.c | 8 ++------ 8 files changed, 30 insertions(+), 25 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index c2e0870713849..68cf1ca949141 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -218,11 +218,9 @@ struct ipv6_pinfo { #if defined(__BIG_ENDIAN_BITFIELD) /* Packed in 16bits. */ __s16 mcast_hops:9; - __u16 __unused_2:6, - mc_loop:1; + __u16 __unused_2:7, #else - __u16 mc_loop:1, - __unused_2:6; + __u16 __unused_2:7; __s16 mcast_hops:9; #endif int ucast_oif; @@ -283,6 +281,18 @@ struct ipv6_pinfo { struct inet6_cork cork; }; +/* We currently use available bits from inet_sk(sk)->inet_flags, + * this could change in the future. + */ +#define inet6_test_bit(nr, sk) \ + test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) +#define inet6_set_bit(nr, sk) \ + set_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) +#define inet6_clear_bit(nr, sk) \ + clear_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) +#define inet6_assign_bit(nr, sk, val) \ + assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val) + /* WARNING: don't change the layout of the members in {raw,udp,tcp}6_sock! */ struct raw6_sock { /* inet_sock has to be the first member of raw6_sock */ diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 2de0e4d4a0278..b5a9dca92fb45 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -268,6 +268,7 @@ enum { INET_FLAGS_NODEFRAG = 17, INET_FLAGS_BIND_ADDRESS_NO_PORT = 18, INET_FLAGS_DEFER_CONNECT = 19, + INET_FLAGS_MC6_LOOP = 20, }; /* cmsg flags for inet */ diff --git a/include/net/sock.h b/include/net/sock.h index 676146e9d1811..56ac1abadea59 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2238,7 +2238,7 @@ static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n) } } -bool sk_mc_loop(struct sock *sk); +bool sk_mc_loop(const struct sock *sk); static inline bool sk_can_gso(const struct sock *sk) { diff --git a/net/core/sock.c b/net/core/sock.c index bb89b88bc1e8a..213a62ac13f2e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -759,7 +759,7 @@ static int sock_getbindtodevice(struct sock *sk, sockptr_t optval, return ret; } -bool sk_mc_loop(struct sock *sk) +bool sk_mc_loop(const struct sock *sk) { if (dev_recursion_level()) return false; @@ -771,7 +771,7 @@ bool sk_mc_loop(struct sock *sk) return inet_test_bit(MC_LOOP, sk); #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - return inet6_sk(sk)->mc_loop; + return inet6_test_bit(MC6_LOOP, sk); #endif } WARN_ON_ONCE(1); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 368824fe9719f..bbd4aa1b96d09 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -217,7 +217,7 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol, inet_sk(sk)->pinet6 = np = inet6_sk_generic(sk); np->hop_limit = -1; np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; - np->mc_loop = 1; + inet6_set_bit(MC6_LOOP, sk); np->mc_all = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; np->repflow = net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index f27993a1470dd..755fac85a120d 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -424,6 +424,13 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; WRITE_ONCE(np->hop_limit, val); return 0; + case IPV6_MULTICAST_LOOP: + if (optlen < sizeof(int)) + return -EINVAL; + if (val != valbool) + return -EINVAL; + inet6_assign_bit(MC6_LOOP, sk, valbool); + return 0; } if (needs_rtnl) rtnl_lock(); @@ -755,15 +762,6 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = 0; break; - case IPV6_MULTICAST_LOOP: - if (optlen < sizeof(int)) - goto e_inval; - if (val != valbool) - goto e_inval; - np->mc_loop = valbool; - retv = 0; - break; - case IPV6_UNICAST_IF: { struct net_device *dev = NULL; @@ -1367,7 +1365,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, } case IPV6_MULTICAST_LOOP: - val = np->mc_loop; + val = inet6_test_bit(MC6_LOOP, sk); break; case IPV6_MULTICAST_IF: diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index b554fd40bdc37..679443d7ecb58 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1996,7 +1996,7 @@ static int __net_init ndisc_net_init(struct net *net) np = inet6_sk(sk); np->hop_limit = 255; /* Do not loopback ndisc messages */ - np->mc_loop = 0; + inet6_clear_bit(MC6_LOOP, sk); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index da5af28ff57b5..3c2251cabd043 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1298,17 +1298,13 @@ static void set_sock_size(struct sock *sk, int mode, int val) static void set_mcast_loop(struct sock *sk, u_char loop) { /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ - lock_sock(sk); inet_assign_bit(MC_LOOP, sk, loop); #ifdef CONFIG_IP_VS_IPV6 - if (sk->sk_family == AF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); - + if (READ_ONCE(sk->sk_family) == AF_INET6) { /* IPV6_MULTICAST_LOOP */ - np->mc_loop = loop ? 1 : 0; + inet6_assign_bit(MC6_LOOP, sk, loop); } #endif - release_sock(sk); } /* From 2da23eb07c91241d962f3ff05565065484cd8929 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Sep 2023 16:02:01 +0000 Subject: [PATCH 03/14] ipv6: lockless IPV6_MULTICAST_HOPS implementation This fixes data-races around np->mcast_hops, and make IPV6_MULTICAST_HOPS lockless. Note that np->mcast_hops is never negative, thus can fit an u8 field instead of s16. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/ipv6.h | 9 +-------- include/net/ipv6.h | 2 +- net/dccp/ipv6.c | 2 +- net/ipv6/ipv6_sockglue.c | 28 +++++++++++++++------------- net/ipv6/tcp_ipv6.c | 3 ++- net/netfilter/ipvs/ip_vs_sync.c | 2 +- 6 files changed, 21 insertions(+), 25 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 68cf1ca949141..9cc278b5e4f42 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -214,15 +214,8 @@ struct ipv6_pinfo { __u32 frag_size; s16 hop_limit; + u8 mcast_hops; -#if defined(__BIG_ENDIAN_BITFIELD) - /* Packed in 16bits. */ - __s16 mcast_hops:9; - __u16 __unused_2:7, -#else - __u16 __unused_2:7; - __s16 mcast_hops:9; -#endif int ucast_oif; int mcast_oif; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 2e8e7e31e02ea..8a04a89853367 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -914,7 +914,7 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, int hlimit; if (ipv6_addr_is_multicast(&fl6->daddr)) - hlimit = np->mcast_hops; + hlimit = READ_ONCE(np->mcast_hops); else hlimit = READ_ONCE(np->hop_limit); if (hlimit < 0) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 33f6ccf6ba77b..83617a16b98e7 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -676,7 +676,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo) np->mcast_oif = inet6_iif(opt_skb); if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) - np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit; + WRITE_ONCE(np->mcast_hops, ipv6_hdr(opt_skb)->hop_limit); if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass) np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); if (np->repflow) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 755fac85a120d..5fff19a87c755 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -431,6 +431,16 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; inet6_assign_bit(MC6_LOOP, sk, valbool); return 0; + case IPV6_MULTICAST_HOPS: + if (sk->sk_type == SOCK_STREAM) + return retv; + if (optlen < sizeof(int)) + return -EINVAL; + if (val > 255 || val < -1) + return -EINVAL; + WRITE_ONCE(np->mcast_hops, + val == -1 ? IPV6_DEFAULT_MCASTHOPS : val); + return 0; } if (needs_rtnl) rtnl_lock(); @@ -751,16 +761,6 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, break; } - case IPV6_MULTICAST_HOPS: - if (sk->sk_type == SOCK_STREAM) - break; - if (optlen < sizeof(int)) - goto e_inval; - if (val > 255 || val < -1) - goto e_inval; - np->mcast_hops = (val == -1 ? IPV6_DEFAULT_MCASTHOPS : val); - retv = 0; - break; case IPV6_UNICAST_IF: { @@ -1180,7 +1180,8 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxhlim) { - int hlim = np->mcast_hops; + int hlim = READ_ONCE(np->mcast_hops); + put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxtclass) { @@ -1197,7 +1198,8 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { - int hlim = np->mcast_hops; + int hlim = READ_ONCE(np->mcast_hops); + put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxflow) { @@ -1349,7 +1351,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, if (optname == IPV6_UNICAST_HOPS) val = READ_ONCE(np->hop_limit); else - val = np->mcast_hops; + val = READ_ONCE(np->mcast_hops); if (val < 0) { rcu_read_lock(); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 3a88545a265d6..54db5fab318bc 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1542,7 +1542,8 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo) np->mcast_oif = tcp_v6_iif(opt_skb); if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) - np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit; + WRITE_ONCE(np->mcast_hops, + ipv6_hdr(opt_skb)->hop_limit); if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass) np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); if (np->repflow) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 3c2251cabd043..df1b33b61059e 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1322,7 +1322,7 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl) struct ipv6_pinfo *np = inet6_sk(sk); /* IPV6_MULTICAST_HOPS */ - np->mcast_hops = ttl; + WRITE_ONCE(np->mcast_hops, ttl); } #endif release_sock(sk); From 15f926c4457aa65b1ac83bda1bbdcaad3f48e4e7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Sep 2023 16:02:02 +0000 Subject: [PATCH 04/14] ipv6: lockless IPV6_MTU implementation np->frag_size can be read/written without holding socket lock. Add missing annotations and make IPV6_MTU setsockopt() lockless. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 19 +++++++++++-------- net/ipv6/ipv6_sockglue.c | 15 +++++++-------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1e16d56d8c38a..ab7ede4a731a9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -881,9 +881,11 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, mtu = IPV6_MIN_MTU; } - if (np && np->frag_size < mtu) { - if (np->frag_size) - mtu = np->frag_size; + if (np) { + u32 frag_size = READ_ONCE(np->frag_size); + + if (frag_size && frag_size < mtu) + mtu = frag_size; } if (mtu < hlen + sizeof(struct frag_hdr) + 8) goto fail_toobig; @@ -1392,7 +1394,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, struct rt6_info *rt) { struct ipv6_pinfo *np = inet6_sk(sk); - unsigned int mtu; + unsigned int mtu, frag_size; struct ipv6_txoptions *nopt, *opt = ipc6->opt; /* callers pass dst together with a reference, set it first so @@ -1441,10 +1443,11 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, else mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst)); - if (np->frag_size < mtu) { - if (np->frag_size) - mtu = np->frag_size; - } + + frag_size = READ_ONCE(np->frag_size); + if (frag_size && frag_size < mtu) + mtu = frag_size; + cork->base.fragsize = mtu; cork->base.gso_size = ipc6->gso_size; cork->base.tx_flags = 0; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 5fff19a87c755..3b2a34828daab 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -441,6 +441,13 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, WRITE_ONCE(np->mcast_hops, val == -1 ? IPV6_DEFAULT_MCASTHOPS : val); return 0; + case IPV6_MTU: + if (optlen < sizeof(int)) + return -EINVAL; + if (val && val < IPV6_MIN_MTU) + return -EINVAL; + WRITE_ONCE(np->frag_size, val); + return 0; } if (needs_rtnl) rtnl_lock(); @@ -910,14 +917,6 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, np->pmtudisc = val; retv = 0; break; - case IPV6_MTU: - if (optlen < sizeof(int)) - goto e_inval; - if (val && val < IPV6_MIN_MTU) - goto e_inval; - np->frag_size = val; - retv = 0; - break; case IPV6_RECVERR: if (optlen < sizeof(int)) goto e_inval; From 273784d3c5741522199011772651dbb50db8c810 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Sep 2023 16:02:03 +0000 Subject: [PATCH 05/14] ipv6: lockless IPV6_MINHOPCOUNT implementation Add one missing READ_ONCE() annotation in do_ipv6_getsockopt() and make IPV6_MINHOPCOUNT setsockopt() lockless. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/ipv6_sockglue.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 3b2a34828daab..bbc8a009e05d3 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -448,6 +448,20 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; WRITE_ONCE(np->frag_size, val); return 0; + case IPV6_MINHOPCOUNT: + if (optlen < sizeof(int)) + return -EINVAL; + if (val < 0 || val > 255) + return -EINVAL; + + if (val) + static_branch_enable(&ip6_min_hopcount); + + /* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount + * while we are changing it. + */ + WRITE_ONCE(np->min_hopcount, val); + return 0; } if (needs_rtnl) rtnl_lock(); @@ -947,21 +961,6 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, goto e_inval; retv = __ip6_sock_set_addr_preferences(sk, val); break; - case IPV6_MINHOPCOUNT: - if (optlen < sizeof(int)) - goto e_inval; - if (val < 0 || val > 255) - goto e_inval; - - if (val) - static_branch_enable(&ip6_min_hopcount); - - /* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount - * while we are changing it. - */ - WRITE_ONCE(np->min_hopcount, val); - retv = 0; - break; case IPV6_DONTFRAG: np->dontfrag = valbool; retv = 0; @@ -1443,7 +1442,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case IPV6_MINHOPCOUNT: - val = np->min_hopcount; + val = READ_ONCE(np->min_hopcount); break; case IPV6_DONTFRAG: From dcae74622c051b219ee628669a31716473efda2c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Sep 2023 16:02:04 +0000 Subject: [PATCH 06/14] ipv6: lockless IPV6_RECVERR_RFC4884 implementation Move np->recverr_rfc4884 to an atomic flag to fix data-races. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 - include/net/inet_sock.h | 1 + net/ipv6/datagram.c | 2 +- net/ipv6/ipv6_sockglue.c | 17 ++++++++--------- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 9cc278b5e4f42..0d2b0a1b2daea 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -256,7 +256,6 @@ struct ipv6_pinfo { autoflowlabel:1, autoflowlabel_set:1, mc_all:1, - recverr_rfc4884:1, rtalert_isolate:1; __u8 min_hopcount; __u8 tclass; diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index b5a9dca92fb45..8cf1f7b442348 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -269,6 +269,7 @@ enum { INET_FLAGS_BIND_ADDRESS_NO_PORT = 18, INET_FLAGS_DEFER_CONNECT = 19, INET_FLAGS_MC6_LOOP = 20, + INET_FLAGS_RECVERR6_RFC4884 = 21, }; /* cmsg flags for inet */ diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 41ebc4e574734..e81892814935f 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -332,7 +332,7 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __skb_pull(skb, payload - skb->data); - if (inet6_sk(sk)->recverr_rfc4884) + if (inet6_test_bit(RECVERR6_RFC4884, sk)) ipv6_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884); skb_reset_transport_header(skb); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index bbc8a009e05d3..b65e73ac2ccde 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -462,6 +462,13 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, */ WRITE_ONCE(np->min_hopcount, val); return 0; + case IPV6_RECVERR_RFC4884: + if (optlen < sizeof(int)) + return -EINVAL; + if (val < 0 || val > 1) + return -EINVAL; + inet6_assign_bit(RECVERR6_RFC4884, sk, valbool); + return 0; } if (needs_rtnl) rtnl_lock(); @@ -974,14 +981,6 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, np->rxopt.bits.recvfragsize = valbool; retv = 0; break; - case IPV6_RECVERR_RFC4884: - if (optlen < sizeof(int)) - goto e_inval; - if (val < 0 || val > 1) - goto e_inval; - np->recverr_rfc4884 = valbool; - retv = 0; - break; } unlock: @@ -1462,7 +1461,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case IPV6_RECVERR_RFC4884: - val = np->recverr_rfc4884; + val = inet6_test_bit(RECVERR6_RFC4884, sk); break; default: From 6559c0ff3bc27d7e4d447d31c1d7e8eae0e959f5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Sep 2023 16:02:05 +0000 Subject: [PATCH 07/14] ipv6: lockless IPV6_MULTICAST_ALL implementation Move np->mc_all to an atomic flags to fix data-races. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 - include/net/inet_sock.h | 1 + net/ipv6/af_inet6.c | 2 +- net/ipv6/ipv6_sockglue.c | 14 ++++++-------- net/ipv6/mcast.c | 2 +- 5 files changed, 9 insertions(+), 11 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 0d2b0a1b2daea..d88e91b7f0a31 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -255,7 +255,6 @@ struct ipv6_pinfo { dontfrag:1, autoflowlabel:1, autoflowlabel_set:1, - mc_all:1, rtalert_isolate:1; __u8 min_hopcount; __u8 tclass; diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 8cf1f7b442348..97e70a97dae88 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -270,6 +270,7 @@ enum { INET_FLAGS_DEFER_CONNECT = 19, INET_FLAGS_MC6_LOOP = 20, INET_FLAGS_RECVERR6_RFC4884 = 21, + INET_FLAGS_MC6_ALL = 22, }; /* cmsg flags for inet */ diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index bbd4aa1b96d09..372fb7b9112c8 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -218,7 +218,7 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol, np->hop_limit = -1; np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; inet6_set_bit(MC6_LOOP, sk); - np->mc_all = 1; + inet6_set_bit(MC6_ALL, sk); np->pmtudisc = IPV6_PMTUDISC_WANT; np->repflow = net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index b65e73ac2ccde..7a181831f226c 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -469,6 +469,11 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; inet6_assign_bit(RECVERR6_RFC4884, sk, valbool); return 0; + case IPV6_MULTICAST_ALL: + if (optlen < sizeof(int)) + return -EINVAL; + inet6_assign_bit(MC6_ALL, sk, valbool); + return 0; } if (needs_rtnl) rtnl_lock(); @@ -890,13 +895,6 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); break; } - case IPV6_MULTICAST_ALL: - if (optlen < sizeof(int)) - goto e_inval; - np->mc_all = valbool; - retv = 0; - break; - case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: if (in_compat_syscall()) @@ -1372,7 +1370,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case IPV6_MULTICAST_ALL: - val = np->mc_all; + val = inet6_test_bit(MC6_ALL, sk); break; case IPV6_UNICAST_IF: diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 4a79676239094..99e28b444a4c7 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -642,7 +642,7 @@ bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr, } if (!mc) { rcu_read_unlock(); - return np->mc_all; + return inet6_test_bit(MC6_ALL, sk); } psl = rcu_dereference(mc->sflist); if (!psl) { From 5121516b0c4736b7977d977b239e36d23ec64401 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Sep 2023 16:02:06 +0000 Subject: [PATCH 08/14] ipv6: lockless IPV6_AUTOFLOWLABEL implementation Move np->autoflowlabel and np->autoflowlabel_set in inet->inet_flags, to fix data-races. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/ipv6.h | 2 -- include/net/inet_sock.h | 2 ++ include/net/ipv6.h | 2 +- net/ipv6/ip6_output.c | 12 +++++------- net/ipv6/ipv6_sockglue.c | 11 +++++------ 5 files changed, 13 insertions(+), 16 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index d88e91b7f0a31..e3be5dc21b7d2 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -253,8 +253,6 @@ struct ipv6_pinfo { * 100: prefer care-of address */ dontfrag:1, - autoflowlabel:1, - autoflowlabel_set:1, rtalert_isolate:1; __u8 min_hopcount; __u8 tclass; diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 97e70a97dae88..f1af64a406731 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -271,6 +271,8 @@ enum { INET_FLAGS_MC6_LOOP = 20, INET_FLAGS_RECVERR6_RFC4884 = 21, INET_FLAGS_MC6_ALL = 22, + INET_FLAGS_AUTOFLOWLABEL_SET = 23, + INET_FLAGS_AUTOFLOWLABEL = 24, }; /* cmsg flags for inet */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 8a04a89853367..4b6cbec059e25 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -428,7 +428,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int flags); int ip6_flowlabel_init(void); void ip6_flowlabel_cleanup(void); -bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np); +bool ip6_autoflowlabel(struct net *net, const struct sock *sk); static inline void fl6_sock_release(struct ip6_flowlabel *fl) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ab7ede4a731a9..47aa42f93ccda 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -232,12 +232,11 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(ip6_output); -bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) +bool ip6_autoflowlabel(struct net *net, const struct sock *sk) { - if (!np->autoflowlabel_set) + if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk)) return ip6_default_np_autolabel(net); - else - return np->autoflowlabel; + return inet6_test_bit(AUTOFLOWLABEL, sk); } /* @@ -314,7 +313,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, hlimit = ip6_dst_hoplimit(dst); ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, - ip6_autoflowlabel(net, np), fl6)); + ip6_autoflowlabel(net, sk), fl6)); hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; @@ -1938,7 +1937,6 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct in6_addr *final_dst; - struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); struct ipv6hdr *hdr; struct ipv6_txoptions *opt = v6_cork->opt; @@ -1981,7 +1979,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, ip6_flow_hdr(hdr, v6_cork->tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, - ip6_autoflowlabel(net, np), fl6)); + ip6_autoflowlabel(net, sk), fl6)); hdr->hop_limit = v6_cork->hop_limit; hdr->nexthdr = proto; hdr->saddr = fl6->saddr; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 7a181831f226c..d5d428a695f72 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -474,6 +474,10 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; inet6_assign_bit(MC6_ALL, sk, valbool); return 0; + case IPV6_AUTOFLOWLABEL: + inet6_assign_bit(AUTOFLOWLABEL, sk, valbool); + inet6_set_bit(AUTOFLOWLABEL_SET, sk); + return 0; } if (needs_rtnl) rtnl_lock(); @@ -970,11 +974,6 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, np->dontfrag = valbool; retv = 0; break; - case IPV6_AUTOFLOWLABEL: - np->autoflowlabel = valbool; - np->autoflowlabel_set = 1; - retv = 0; - break; case IPV6_RECVFRAGSIZE: np->rxopt.bits.recvfragsize = valbool; retv = 0; @@ -1447,7 +1446,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case IPV6_AUTOFLOWLABEL: - val = ip6_autoflowlabel(sock_net(sk), np); + val = ip6_autoflowlabel(sock_net(sk), sk); break; case IPV6_RECVFRAGSIZE: From 1086ca7cce292bb498d7f8f85f4593c9ef4902b7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Sep 2023 16:02:07 +0000 Subject: [PATCH 09/14] ipv6: lockless IPV6_DONTFRAG implementation Move np->dontfrag flag to inet->inet_flags to fix data-races. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 - include/net/inet_sock.h | 1 + include/net/ipv6.h | 6 +++--- include/net/xfrm.h | 2 +- net/ipv6/icmp.c | 4 ++-- net/ipv6/ip6_output.c | 2 +- net/ipv6/ipv6_sockglue.c | 9 ++++----- net/ipv6/ping.c | 2 +- net/ipv6/raw.c | 2 +- net/ipv6/udp.c | 2 +- net/l2tp/l2tp_ip6.c | 2 +- 11 files changed, 16 insertions(+), 17 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index e3be5dc21b7d2..57d563f1d4b17 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -252,7 +252,6 @@ struct ipv6_pinfo { * 010: prefer public address * 100: prefer care-of address */ - dontfrag:1, rtalert_isolate:1; __u8 min_hopcount; __u8 tclass; diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index f1af64a406731..ac75324e9e1ea 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -273,6 +273,7 @@ enum { INET_FLAGS_MC6_ALL = 22, INET_FLAGS_AUTOFLOWLABEL_SET = 23, INET_FLAGS_AUTOFLOWLABEL = 24, + INET_FLAGS_DONTFRAG = 25, }; /* cmsg flags for inet */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 4b6cbec059e25..5a1f2993680da 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -373,12 +373,12 @@ static inline void ipcm6_init(struct ipcm6_cookie *ipc6) } static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6, - const struct ipv6_pinfo *np) + const struct sock *sk) { *ipc6 = (struct ipcm6_cookie) { .hlimit = -1, - .tclass = np->tclass, - .dontfrag = np->dontfrag, + .tclass = inet6_sk(sk)->tclass, + .dontfrag = inet6_test_bit(DONTFRAG, sk), }; } diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 363c7d5105542..98d7aa78addaa 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2166,7 +2166,7 @@ static inline bool xfrm6_local_dontfrag(const struct sock *sk) proto = sk->sk_protocol; if (proto == IPPROTO_UDP || proto == IPPROTO_RAW) - return inet6_sk(sk)->dontfrag; + return inet6_test_bit(DONTFRAG, sk); return false; } diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 93a594a901d12..8fb4a791881a4 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -588,7 +588,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - ipcm6_init_sk(&ipc6, np); + ipcm6_init_sk(&ipc6, sk); ipc6.sockc.mark = mark; fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); @@ -791,7 +791,7 @@ static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) msg.offset = 0; msg.type = type; - ipcm6_init_sk(&ipc6, np); + ipcm6_init_sk(&ipc6, sk); ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb)); ipc6.sockc.mark = mark; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 47aa42f93ccda..8851fe5d45a07 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -2092,7 +2092,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, return ERR_PTR(err); } if (ipc6->dontfrag < 0) - ipc6->dontfrag = inet6_sk(sk)->dontfrag; + ipc6->dontfrag = inet6_test_bit(DONTFRAG, sk); err = __ip6_append_data(sk, &queue, cork, &v6_cork, ¤t->task_frag, getfrag, from, diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index d5d428a695f72..33dd4dd872e6b 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -478,6 +478,9 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, inet6_assign_bit(AUTOFLOWLABEL, sk, valbool); inet6_set_bit(AUTOFLOWLABEL_SET, sk); return 0; + case IPV6_DONTFRAG: + inet6_assign_bit(DONTFRAG, sk, valbool); + return 0; } if (needs_rtnl) rtnl_lock(); @@ -970,10 +973,6 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, goto e_inval; retv = __ip6_sock_set_addr_preferences(sk, val); break; - case IPV6_DONTFRAG: - np->dontfrag = valbool; - retv = 0; - break; case IPV6_RECVFRAGSIZE: np->rxopt.bits.recvfragsize = valbool; retv = 0; @@ -1442,7 +1441,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case IPV6_DONTFRAG: - val = np->dontfrag; + val = inet6_test_bit(DONTFRAG, sk); break; case IPV6_AUTOFLOWLABEL: diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 5831aaa53d75e..4444b61eb23bb 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -118,7 +118,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) l3mdev_master_ifindex_by_index(sock_net(sk), oif) != sk->sk_bound_dev_if)) return -EINVAL; - ipcm6_init_sk(&ipc6, np); + ipcm6_init_sk(&ipc6, sk); ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = READ_ONCE(sk->sk_mark); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 42fcec3ecf5e1..cc9673c1809fb 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -898,7 +898,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); if (ipc6.dontfrag < 0) - ipc6.dontfrag = np->dontfrag; + ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index f60ba42954352..e4301500741a0 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1595,7 +1595,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) do_append_data: if (ipc6.dontfrag < 0) - ipc6.dontfrag = np->dontfrag; + ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); up->len += ulen; err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr), &ipc6, fl6, (struct rt6_info *)dst, diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index ed8ebb6f59097..40af2431e73aa 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -621,7 +621,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); if (ipc6.dontfrag < 0) - ipc6.dontfrag = np->dontfrag; + ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); if (msg->msg_flags & MSG_CONFIRM) goto do_confirm; From 3fa29971c69519629370b119b0b618ee88ade6b9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Sep 2023 16:02:08 +0000 Subject: [PATCH 10/14] ipv6: lockless IPV6_RECVERR implemetation np->recverr is moved to inet->inet_flags to fix data-races. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/ipv6.h | 3 +-- include/net/inet_sock.h | 1 + include/net/ipv6.h | 4 +--- net/dccp/ipv6.c | 2 +- net/ipv4/ping.c | 2 +- net/ipv6/datagram.c | 6 ++---- net/ipv6/ipv6_sockglue.c | 17 ++++++++--------- net/ipv6/raw.c | 10 +++++----- net/ipv6/tcp_ipv6.c | 2 +- net/ipv6/udp.c | 6 +++--- net/sctp/ipv6.c | 4 +--- 11 files changed, 25 insertions(+), 32 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 57d563f1d4b17..53f4f1b97a787 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -243,8 +243,7 @@ struct ipv6_pinfo { } rxopt; /* sockopt flags */ - __u16 recverr:1, - sndflow:1, + __u16 sndflow:1, repflow:1, pmtudisc:3, padding:1, /* 1 bit hole */ diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index ac75324e9e1ea..3b79bc759ff47 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -274,6 +274,7 @@ enum { INET_FLAGS_AUTOFLOWLABEL_SET = 23, INET_FLAGS_AUTOFLOWLABEL = 24, INET_FLAGS_DONTFRAG = 25, + INET_FLAGS_RECVERR6 = 26, }; /* cmsg flags for inet */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 5a1f2993680da..bd115980809f3 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1303,9 +1303,7 @@ static inline int ip6_sock_set_v6only(struct sock *sk) static inline void ip6_sock_set_recverr(struct sock *sk) { - lock_sock(sk); - inet6_sk(sk)->recverr = true; - release_sock(sk); + inet6_set_bit(RECVERR6, sk); } static inline int __ip6_sock_set_addr_preferences(struct sock *sk, int val) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 83617a16b98e7..e6c3d84c2b9ec 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -185,7 +185,7 @@ static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; } - if (!sock_owned_by_user(sk) && np->recverr) { + if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) { sk->sk_err = err; sk_error_report(sk); } else { diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 75e0aee35eb78..bc01ad5fc01ab 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -581,7 +581,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info) * 4.1.3.3. */ if ((family == AF_INET && !inet_test_bit(RECVERR, sk)) || - (family == AF_INET6 && !inet6_sk(sk)->recverr)) { + (family == AF_INET6 && !inet6_test_bit(RECVERR6, sk))) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; } else { diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index e81892814935f..74673a5eff319 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -305,11 +305,10 @@ static void ipv6_icmp_error_rfc4884(const struct sk_buff *skb, void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { - struct ipv6_pinfo *np = inet6_sk(sk); struct icmp6hdr *icmph = icmp6_hdr(skb); struct sock_exterr_skb *serr; - if (!np->recverr) + if (!inet6_test_bit(RECVERR6, sk)) return; skb = skb_clone(skb, GFP_ATOMIC); @@ -344,12 +343,11 @@ EXPORT_SYMBOL_GPL(ipv6_icmp_error); void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info) { - const struct ipv6_pinfo *np = inet6_sk(sk); struct sock_exterr_skb *serr; struct ipv6hdr *iph; struct sk_buff *skb; - if (!np->recverr) + if (!inet6_test_bit(RECVERR6, sk)) return; skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 33dd4dd872e6b..ec10b45c49c15 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -481,6 +481,13 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, case IPV6_DONTFRAG: inet6_assign_bit(DONTFRAG, sk, valbool); return 0; + case IPV6_RECVERR: + if (optlen < sizeof(int)) + return -EINVAL; + inet6_assign_bit(RECVERR6, sk, valbool); + if (!val) + skb_errqueue_purge(&sk->sk_error_queue); + return 0; } if (needs_rtnl) rtnl_lock(); @@ -943,14 +950,6 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, np->pmtudisc = val; retv = 0; break; - case IPV6_RECVERR: - if (optlen < sizeof(int)) - goto e_inval; - np->recverr = valbool; - if (!val) - skb_errqueue_purge(&sk->sk_error_queue); - retv = 0; - break; case IPV6_FLOWINFO_SEND: if (optlen < sizeof(int)) goto e_inval; @@ -1380,7 +1379,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case IPV6_RECVERR: - val = np->recverr; + val = inet6_test_bit(RECVERR6, sk); break; case IPV6_FLOWINFO_SEND: diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index cc9673c1809fb..71f6bdccfa1f3 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -291,6 +291,7 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { + bool recverr = inet6_test_bit(RECVERR6, sk); struct ipv6_pinfo *np = inet6_sk(sk); int err; int harderr; @@ -300,7 +301,7 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb, 2. Socket is connected (otherwise the error indication is useless without recverr and error is hard. */ - if (!np->recverr && sk->sk_state != TCP_ESTABLISHED) + if (!recverr && sk->sk_state != TCP_ESTABLISHED) return; harderr = icmpv6_err_convert(type, code, &err); @@ -312,14 +313,14 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb, ip6_sk_redirect(skb, sk); return; } - if (np->recverr) { + if (recverr) { u8 *payload = skb->data; if (!inet_test_bit(HDRINCL, sk)) payload += offset; ipv6_icmp_error(sk, skb, err, 0, ntohl(info), payload); } - if (np->recverr || harderr) { + if (recverr || harderr) { sk->sk_err = err; sk_error_report(sk); } @@ -587,7 +588,6 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, struct flowi6 *fl6, struct dst_entry **dstp, unsigned int flags, const struct sockcm_cookie *sockc) { - struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); struct ipv6hdr *iph; struct sk_buff *skb; @@ -668,7 +668,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, error: IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); error_check: - if (err == -ENOBUFS && !np->recverr) + if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk)) err = 0; return err; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 54db5fab318bc..b5954b136b573 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -508,7 +508,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, tcp_ld_RTO_revert(sk, seq); } - if (!sock_owned_by_user(sk) && np->recverr) { + if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) { WRITE_ONCE(sk->sk_err, err); sk_error_report(sk); } else { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e4301500741a0..90e873689b885 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -619,7 +619,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; } - if (!np->recverr) { + if (!inet6_test_bit(RECVERR6, sk)) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; } else { @@ -1283,7 +1283,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, send: err = ip6_send_skb(skb); if (err) { - if (err == -ENOBUFS && !inet6_sk(sk)->recverr) { + if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk)) { UDP6_INC_STATS(sock_net(sk), UDP_MIB_SNDBUFERRORS, is_udplite); err = 0; @@ -1608,7 +1608,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) up->pending = 0; if (err > 0) - err = np->recverr ? net_xmit_errno(err) : 0; + err = inet6_test_bit(RECVERR6, sk) ? net_xmit_errno(err) : 0; release_sock(sk); out: diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 43f2731bf590e..42b5b853ea01c 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -128,7 +128,6 @@ static void sctp_v6_err_handle(struct sctp_transport *t, struct sk_buff *skb, { struct sctp_association *asoc = t->asoc; struct sock *sk = asoc->base.sk; - struct ipv6_pinfo *np; int err = 0; switch (type) { @@ -149,9 +148,8 @@ static void sctp_v6_err_handle(struct sctp_transport *t, struct sk_buff *skb, break; } - np = inet6_sk(sk); icmpv6_err_convert(type, code, &err); - if (!sock_owned_by_user(sk) && np->recverr) { + if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) { sk->sk_err = err; sk_error_report(sk); } else { From 3cccda8db2cf2f2a224d55d5b6e2251d478c58ca Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Sep 2023 16:02:09 +0000 Subject: [PATCH 11/14] ipv6: move np->repflow to atomic flags Move np->repflow to inet->inet_flags to fix data-races. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 - include/net/inet_sock.h | 1 + net/dccp/ipv6.c | 2 +- net/ipv6/af_inet6.c | 3 ++- net/ipv6/ip6_flowlabel.c | 8 ++++---- net/ipv6/tcp_ipv6.c | 14 ++++++-------- 6 files changed, 14 insertions(+), 15 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 53f4f1b97a787..e62413371ea40 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -244,7 +244,6 @@ struct ipv6_pinfo { /* sockopt flags */ __u16 sndflow:1, - repflow:1, pmtudisc:3, padding:1, /* 1 bit hole */ srcprefs:3, /* 001: prefer temporary address diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 3b79bc759ff47..5d61c7dc65778 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -275,6 +275,7 @@ enum { INET_FLAGS_AUTOFLOWLABEL = 24, INET_FLAGS_DONTFRAG = 25, INET_FLAGS_RECVERR6 = 26, + INET_FLAGS_REPFLOW = 27, }; /* cmsg flags for inet */ diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index e6c3d84c2b9ec..d7e63eea705df 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -679,7 +679,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) WRITE_ONCE(np->mcast_hops, ipv6_hdr(opt_skb)->hop_limit); if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass) np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); - if (np->repflow) + if (inet6_test_bit(REPFLOW, sk)) np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb, &DCCP_SKB_CB(opt_skb)->header.h6)) { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 372fb7b9112c8..48737363377fe 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -220,7 +220,8 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol, inet6_set_bit(MC6_LOOP, sk); inet6_set_bit(MC6_ALL, sk); np->pmtudisc = IPV6_PMTUDISC_WANT; - np->repflow = net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED; + inet6_assign_bit(REPFLOW, sk, net->ipv6.sysctl.flowlabel_reflect & + FLOWLABEL_REFLECT_ESTABLISHED); sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash); diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index b3ca4beb4405a..eca07e10e21fc 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -513,7 +513,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, return 0; } - if (np->repflow) { + if (inet6_test_bit(REPFLOW, sk)) { freq->flr_label = np->flow_label; return 0; } @@ -551,10 +551,10 @@ static int ipv6_flowlabel_put(struct sock *sk, struct in6_flowlabel_req *freq) if (freq->flr_flags & IPV6_FL_F_REFLECT) { if (sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; - if (!np->repflow) + if (!inet6_test_bit(REPFLOW, sk)) return -ESRCH; np->flow_label = 0; - np->repflow = 0; + inet6_clear_bit(REPFLOW, sk); return 0; } @@ -626,7 +626,7 @@ static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq, if (sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; - np->repflow = 1; + inet6_set_bit(REPFLOW, sk); return 0; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b5954b136b573..201caf88bb99e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -548,7 +548,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, &ireq->ir_v6_rmt_addr); fl6->daddr = ireq->ir_v6_rmt_addr; - if (np->repflow && ireq->pktopts) + if (inet6_test_bit(REPFLOW, sk) && ireq->pktopts) fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); tclass = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ? @@ -797,7 +797,7 @@ static void tcp_v6_init_req(struct request_sock *req, (ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || - np->rxopt.bits.rxohlim || np->repflow)) { + np->rxopt.bits.rxohlim || inet6_test_bit(REPFLOW, sk_listener))) { refcount_inc(&skb->users); ireq->pktopts = skb; } @@ -1055,10 +1055,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) if (sk) { oif = sk->sk_bound_dev_if; if (sk_fullsock(sk)) { - const struct ipv6_pinfo *np = tcp_inet6_sk(sk); - trace_tcp_send_reset(sk, skb); - if (np->repflow) + if (inet6_test_bit(REPFLOW, sk)) label = ip6_flowlabel(ipv6h); priority = sk->sk_priority; txhash = sk->sk_txhash; @@ -1247,7 +1245,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * newnp->mcast_oif = inet_iif(skb); newnp->mcast_hops = ip_hdr(skb)->ttl; newnp->rcv_flowinfo = 0; - if (np->repflow) + if (inet6_test_bit(REPFLOW, sk)) newnp->flow_label = 0; /* @@ -1320,7 +1318,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * newnp->mcast_oif = tcp_v6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb)); - if (np->repflow) + if (inet6_test_bit(REPFLOW, sk)) newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb)); /* Set ToS of the new socket based upon the value of incoming SYN. @@ -1546,7 +1544,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) ipv6_hdr(opt_skb)->hop_limit); if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass) np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); - if (np->repflow) + if (inet6_test_bit(REPFLOW, sk)) np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) { tcp_v6_restore_cb(opt_skb); From 83cd5eb654b320c1972254f243531f3f3cebcccf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Sep 2023 16:02:10 +0000 Subject: [PATCH 12/14] ipv6: lockless IPV6_ROUTER_ALERT_ISOLATE implementation Reads from np->rtalert_isolate are racy. Move this flag to inet->inet_flags to fix data-races. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/ipv6.h | 3 +-- include/net/inet_sock.h | 1 + net/ipv6/ip6_output.c | 3 +-- net/ipv6/ipv6_sockglue.c | 13 ++++++------- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index e62413371ea40..f288a35f157f7 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -246,11 +246,10 @@ struct ipv6_pinfo { __u16 sndflow:1, pmtudisc:3, padding:1, /* 1 bit hole */ - srcprefs:3, /* 001: prefer temporary address + srcprefs:3; /* 001: prefer temporary address * 010: prefer public address * 100: prefer care-of address */ - rtalert_isolate:1; __u8 min_hopcount; __u8 tclass; __be32 rcv_flowinfo; diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 5d61c7dc65778..befee0f66c055 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -276,6 +276,7 @@ enum { INET_FLAGS_DONTFRAG = 25, INET_FLAGS_RECVERR6 = 26, INET_FLAGS_REPFLOW = 27, + INET_FLAGS_RTALERT_ISOLATE = 28, }; /* cmsg flags for inet */ diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 8851fe5d45a07..f87d8491d7e27 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -368,9 +368,8 @@ static int ip6_call_ra_chain(struct sk_buff *skb, int sel) if (sk && ra->sel == sel && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == skb->dev->ifindex)) { - struct ipv6_pinfo *np = inet6_sk(sk); - if (np && np->rtalert_isolate && + if (inet6_test_bit(RTALERT_ISOLATE, sk) && !net_eq(sock_net(sk), dev_net(skb->dev))) { continue; } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index ec10b45c49c15..c22a492e05360 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -488,6 +488,11 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (!val) skb_errqueue_purge(&sk->sk_error_queue); return 0; + case IPV6_ROUTER_ALERT_ISOLATE: + if (optlen < sizeof(int)) + return -EINVAL; + inet6_assign_bit(RTALERT_ISOLATE, sk, valbool); + return 0; } if (needs_rtnl) rtnl_lock(); @@ -936,12 +941,6 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, goto e_inval; retv = ip6_ra_control(sk, val); break; - case IPV6_ROUTER_ALERT_ISOLATE: - if (optlen < sizeof(int)) - goto e_inval; - np->rtalert_isolate = valbool; - retv = 0; - break; case IPV6_MTU_DISCOVER: if (optlen < sizeof(int)) goto e_inval; @@ -1452,7 +1451,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case IPV6_ROUTER_ALERT_ISOLATE: - val = np->rtalert_isolate; + val = inet6_test_bit(RTALERT_ISOLATE, sk); break; case IPV6_RECVERR_RFC4884: From 6b724bc4300b431443f3b99520994a5aece347cd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Sep 2023 16:02:11 +0000 Subject: [PATCH 13/14] ipv6: lockless IPV6_MTU_DISCOVER implementation Most np->pmtudisc reads are racy. Move this 3bit field on a full byte, add annotations and make IPV6_MTU_DISCOVER setsockopt() lockless. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/ipv6.h | 5 ++--- include/net/ip6_route.h | 14 +++++++++----- net/ipv6/ip6_output.c | 4 ++-- net/ipv6/ipv6_sockglue.c | 17 ++++++++--------- net/ipv6/raw.c | 2 +- net/ipv6/udp.c | 2 +- net/netfilter/ipvs/ip_vs_sync.c | 2 +- 7 files changed, 24 insertions(+), 22 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index f288a35f157f7..10f521a6a9c8a 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -243,13 +243,12 @@ struct ipv6_pinfo { } rxopt; /* sockopt flags */ - __u16 sndflow:1, - pmtudisc:3, - padding:1, /* 1 bit hole */ + __u8 sndflow:1, srcprefs:3; /* 001: prefer temporary address * 010: prefer public address * 100: prefer care-of address */ + __u8 pmtudisc; __u8 min_hopcount; __u8 tclass; __be32 rcv_flowinfo; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index b32539bb0fb05..b1ea49900b4ae 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -266,7 +266,7 @@ static inline unsigned int ip6_skb_dst_mtu(const struct sk_buff *skb) const struct dst_entry *dst = skb_dst(skb); unsigned int mtu; - if (np && np->pmtudisc >= IPV6_PMTUDISC_PROBE) { + if (np && READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE) { mtu = READ_ONCE(dst->dev->mtu); mtu -= lwtunnel_headroom(dst->lwtstate, mtu); } else { @@ -277,14 +277,18 @@ static inline unsigned int ip6_skb_dst_mtu(const struct sk_buff *skb) static inline bool ip6_sk_accept_pmtu(const struct sock *sk) { - return inet6_sk(sk)->pmtudisc != IPV6_PMTUDISC_INTERFACE && - inet6_sk(sk)->pmtudisc != IPV6_PMTUDISC_OMIT; + u8 pmtudisc = READ_ONCE(inet6_sk(sk)->pmtudisc); + + return pmtudisc != IPV6_PMTUDISC_INTERFACE && + pmtudisc != IPV6_PMTUDISC_OMIT; } static inline bool ip6_sk_ignore_df(const struct sock *sk) { - return inet6_sk(sk)->pmtudisc < IPV6_PMTUDISC_DO || - inet6_sk(sk)->pmtudisc == IPV6_PMTUDISC_OMIT; + u8 pmtudisc = READ_ONCE(inet6_sk(sk)->pmtudisc); + + return pmtudisc < IPV6_PMTUDISC_DO || + pmtudisc == IPV6_PMTUDISC_OMIT; } static inline const struct in6_addr *rt6_nexthop(const struct rt6_info *rt, diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f87d8491d7e27..7e5d9eeb990fd 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1436,10 +1436,10 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, v6_cork->hop_limit = ipc6->hlimit; v6_cork->tclass = ipc6->tclass; if (rt->dst.flags & DST_XFRM_TUNNEL) - mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? + mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); else - mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? + mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst)); frag_size = READ_ONCE(np->frag_size); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index c22a492e05360..85ea42644dcbb 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -493,6 +493,13 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; inet6_assign_bit(RTALERT_ISOLATE, sk, valbool); return 0; + case IPV6_MTU_DISCOVER: + if (optlen < sizeof(int)) + return -EINVAL; + if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT) + return -EINVAL; + WRITE_ONCE(np->pmtudisc, val); + return 0; } if (needs_rtnl) rtnl_lock(); @@ -941,14 +948,6 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, goto e_inval; retv = ip6_ra_control(sk, val); break; - case IPV6_MTU_DISCOVER: - if (optlen < sizeof(int)) - goto e_inval; - if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT) - goto e_inval; - np->pmtudisc = val; - retv = 0; - break; case IPV6_FLOWINFO_SEND: if (optlen < sizeof(int)) goto e_inval; @@ -1374,7 +1373,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case IPV6_MTU_DISCOVER: - val = np->pmtudisc; + val = READ_ONCE(np->pmtudisc); break; case IPV6_RECVERR: diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 71f6bdccfa1f3..47372cceb98f6 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -307,7 +307,7 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb, harderr = icmpv6_err_convert(type, code, &err); if (type == ICMPV6_PKT_TOOBIG) { ip6_sk_update_pmtu(skb, sk, info); - harderr = (np->pmtudisc == IPV6_PMTUDISC_DO); + harderr = (READ_ONCE(np->pmtudisc) == IPV6_PMTUDISC_DO); } if (type == NDISC_REDIRECT) { ip6_sk_redirect(skb, sk); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 90e873689b885..c17e19fece1b8 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -598,7 +598,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!ip6_sk_accept_pmtu(sk)) goto out; ip6_sk_update_pmtu(skb, sk, info); - if (np->pmtudisc != IPV6_PMTUDISC_DONT) + if (READ_ONCE(np->pmtudisc) != IPV6_PMTUDISC_DONT) harderr = 1; } if (type == NDISC_REDIRECT) { diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index df1b33b61059e..5820a8156c470 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1341,7 +1341,7 @@ static void set_mcast_pmtudisc(struct sock *sk, int val) struct ipv6_pinfo *np = inet6_sk(sk); /* IPV6_MTU_DISCOVER */ - np->pmtudisc = val; + WRITE_ONCE(np->pmtudisc, val); } #endif release_sock(sk); From 859f8b265fc2a11af0fb0c52b4087e0409250592 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Sep 2023 16:02:12 +0000 Subject: [PATCH 14/14] ipv6: lockless IPV6_FLOWINFO_SEND implementation np->sndflow reads are racy. Use one bit ftom atomic inet->inet_flags instead, IPV6_FLOWINFO_SEND setsockopt() can be lockless. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/ipv6.h | 3 +-- include/net/inet_sock.h | 1 + net/dccp/ipv6.c | 2 +- net/ipv4/ping.c | 3 +-- net/ipv6/af_inet6.c | 2 +- net/ipv6/datagram.c | 7 ++++--- net/ipv6/ipv6_sockglue.c | 13 ++++++------- net/ipv6/ping.c | 2 +- net/ipv6/raw.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- net/ipv6/udp.c | 2 +- net/l2tp/l2tp_ip6.c | 4 ++-- net/sctp/ipv6.c | 3 ++- 13 files changed, 23 insertions(+), 23 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 10f521a6a9c8a..09253825c99c7 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -243,8 +243,7 @@ struct ipv6_pinfo { } rxopt; /* sockopt flags */ - __u8 sndflow:1, - srcprefs:3; /* 001: prefer temporary address + __u8 srcprefs:3; /* 001: prefer temporary address * 010: prefer public address * 100: prefer care-of address */ diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index befee0f66c055..98e11958cdff6 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -277,6 +277,7 @@ enum { INET_FLAGS_RECVERR6 = 26, INET_FLAGS_REPFLOW = 27, INET_FLAGS_RTALERT_ISOLATE = 28, + INET_FLAGS_SNDFLOW = 29, }; /* cmsg flags for inet */ diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index d7e63eea705df..4803f06148488 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -844,7 +844,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, memset(&fl6, 0, sizeof(fl6)); - if (np->sndflow) { + if (inet6_test_bit(SNDFLOW, sk)) { fl6.flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; IP6_ECN_flow_init(fl6.flowlabel); if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) { diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index bc01ad5fc01ab..4dd809b7b1886 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -899,7 +899,6 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, #if IS_ENABLED(CONFIG_IPV6) } else if (family == AF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6hdr *ip6 = ipv6_hdr(skb); DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); @@ -908,7 +907,7 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, sin6->sin6_port = 0; sin6->sin6_addr = ip6->saddr; sin6->sin6_flowinfo = 0; - if (np->sndflow) + if (inet6_test_bit(SNDFLOW, sk)) sin6->sin6_flowinfo = ip6_flowinfo(ip6); sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 48737363377fe..c6ad0d6e99b5e 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -537,7 +537,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, } sin->sin6_port = inet->inet_dport; sin->sin6_addr = sk->sk_v6_daddr; - if (np->sndflow) + if (inet6_test_bit(SNDFLOW, sk)) sin->sin6_flowinfo = np->flow_label; BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, CGROUP_INET6_GETPEERNAME); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 74673a5eff319..cc6a502db39d2 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -80,7 +80,8 @@ int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr) struct flowi6 fl6; int err = 0; - if (np->sndflow && (np->flow_label & IPV6_FLOWLABEL_MASK)) { + if (inet6_test_bit(SNDFLOW, sk) && + (np->flow_label & IPV6_FLOWLABEL_MASK)) { flowlabel = fl6_sock_lookup(sk, np->flow_label); if (IS_ERR(flowlabel)) return -EINVAL; @@ -163,7 +164,7 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, if (usin->sin6_family != AF_INET6) return -EAFNOSUPPORT; - if (np->sndflow) + if (inet6_test_bit(SNDFLOW, sk)) fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; if (ipv6_addr_any(&usin->sin6_addr)) { @@ -491,7 +492,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) const struct ipv6hdr *ip6h = container_of((struct in6_addr *)(nh + serr->addr_offset), struct ipv6hdr, daddr); sin->sin6_addr = ip6h->daddr; - if (np->sndflow) + if (inet6_test_bit(SNDFLOW, sk)) sin->sin6_flowinfo = ip6_flowinfo(ip6h); sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 85ea42644dcbb..e9dc6f881bb92 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -500,6 +500,11 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; WRITE_ONCE(np->pmtudisc, val); return 0; + case IPV6_FLOWINFO_SEND: + if (optlen < sizeof(int)) + return -EINVAL; + inet6_assign_bit(SNDFLOW, sk, valbool); + return 0; } if (needs_rtnl) rtnl_lock(); @@ -948,12 +953,6 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, goto e_inval; retv = ip6_ra_control(sk, val); break; - case IPV6_FLOWINFO_SEND: - if (optlen < sizeof(int)) - goto e_inval; - np->sndflow = valbool; - retv = 0; - break; case IPV6_FLOWLABEL_MGR: retv = ipv6_flowlabel_opt(sk, optval, optlen); break; @@ -1381,7 +1380,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case IPV6_FLOWINFO_SEND: - val = np->sndflow; + val = inet6_test_bit(SNDFLOW, sk); break; case IPV6_FLOWLABEL_MGR: diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 4444b61eb23bb..e8fb0d275cc2d 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -89,7 +89,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return -EAFNOSUPPORT; } daddr = &(u->sin6_addr); - if (np->sndflow) + if (inet6_test_bit(SNDFLOW, sk)) fl6.flowlabel = u->sin6_flowinfo & IPV6_FLOWINFO_MASK; if (__ipv6_addr_needs_scope_id(ipv6_addr_type(daddr))) oif = u->sin6_scope_id; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 47372cceb98f6..a2aa54a2baaec 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -795,7 +795,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return -EINVAL; daddr = &sin6->sin6_addr; - if (np->sndflow) { + if (inet6_test_bit(SNDFLOW, sk)) { fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 201caf88bb99e..94afb8d0f2d0e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -163,7 +163,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, memset(&fl6, 0, sizeof(fl6)); - if (np->sndflow) { + if (inet6_test_bit(SNDFLOW, sk)) { fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; IP6_ECN_flow_init(fl6.flowlabel); if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c17e19fece1b8..5e9312eefed06 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1429,7 +1429,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6->fl6_dport = sin6->sin6_port; daddr = &sin6->sin6_addr; - if (np->sndflow) { + if (inet6_test_bit(SNDFLOW, sk)) { fl6->flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl6->flowlabel & IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6->flowlabel); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 40af2431e73aa..44cfb72bbd18a 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -431,7 +431,7 @@ static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr, return -ENOTCONN; lsa->l2tp_conn_id = lsk->peer_conn_id; lsa->l2tp_addr = sk->sk_v6_daddr; - if (np->sndflow) + if (inet6_test_bit(SNDFLOW, sk)) lsa->l2tp_flowinfo = np->flow_label; } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) @@ -529,7 +529,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return -EAFNOSUPPORT; daddr = &lsa->l2tp_addr; - if (np->sndflow) { + if (inet6_test_bit(SNDFLOW, sk)) { fl6.flowlabel = lsa->l2tp_flowinfo & IPV6_FLOWINFO_MASK; if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 42b5b853ea01c..5c0ed5909d85a 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -296,7 +296,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, if (t->flowlabel & SCTP_FLOWLABEL_SET_MASK) fl6->flowlabel = htonl(t->flowlabel & SCTP_FLOWLABEL_VAL_MASK); - if (np->sndflow && (fl6->flowlabel & IPV6_FLOWLABEL_MASK)) { + if (inet6_test_bit(SNDFLOW, sk) && + (fl6->flowlabel & IPV6_FLOWLABEL_MASK)) { struct ip6_flowlabel *flowlabel; flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);