From 54580ccdd8a9c6821fd6f72171d435480867e4c3 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 6 Mar 2025 22:34:08 -0500 Subject: [PATCH 1/3] ipv6: remove leftover ip6 cookie initializer As of the blamed commit ipc6.dontfrag is always initialized at the start of udpv6_sendmsg, by ipcm6_init_sk, to either 0 or 1. Later checks against -1 are no longer needed and the branches are now dead code. The blamed commit had removed those branches. But I had overlooked this one case. UDP has both a lockless fast path and a slower path for corked requests. This branch remained in the fast path. Fixes: 096208592b09 ("ipv6: replace ipcm6_init calls with ipcm6_init_sk") Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250307033620.411611-2-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_output.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index d577bf2f30538..d91da522c34e7 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -2054,8 +2054,6 @@ struct sk_buff *ip6_make_skb(struct sock *sk, ip6_cork_release(cork, &v6_cork); return ERR_PTR(err); } - if (ipc6->dontfrag < 0) - ipc6->dontfrag = inet6_test_bit(DONTFRAG, sk); err = __ip6_append_data(sk, &queue, cork, &v6_cork, ¤t->task_frag, getfrag, from, From a18dfa9925b9ef6107ea3aa5814ca3c704d34a8a Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 6 Mar 2025 22:34:09 -0500 Subject: [PATCH 2/3] ipv6: save dontfrag in cork When spanning datagram construction over multiple send calls using MSG_MORE, per datagram settings are configured on the first send. That is when ip(6)_setup_cork stores these settings for subsequent use in __ip(6)_append_data and others. The only flag that escaped this was dontfrag. As a result, a datagram could be constructed with df=0 on the first sendmsg, but df=1 on a next. Which is what cmsg_ip.sh does in an upcoming MSG_MORE test in the "diff" scenario. Changing datagram conditions in the middle of constructing an skb makes this already complex code path even more convoluted. It is here unintentional. Bring this flag in line with expected sockopt/cmsg behavior. And stop passing ipc6 to __ip6_append_data, to avoid such issues in the future. This is already the case for __ip_append_data. inet6_cork had a 6 byte hole, so the 1B flag has no impact. Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250307033620.411611-3-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/ipv6.h | 1 + net/ipv6/ip6_output.c | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index a6e2aadbb91bd..5aeeed22f35bf 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -207,6 +207,7 @@ struct inet6_cork { struct ipv6_txoptions *opt; u8 hop_limit; u8 tclass; + u8 dontfrag:1; }; /* struct ipv6_pinfo - ipv6 private area */ diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index d91da522c34e7..581bc62890818 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1386,6 +1386,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, } v6_cork->hop_limit = ipc6->hlimit; v6_cork->tclass = ipc6->tclass; + v6_cork->dontfrag = ipc6->dontfrag; if (rt->dst.flags & DST_XFRM_TUNNEL) mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); @@ -1421,7 +1422,7 @@ static int __ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, - unsigned int flags, struct ipcm6_cookie *ipc6) + unsigned int flags) { struct sk_buff *skb, *skb_prev = NULL; struct inet_cork *cork = &cork_full->base; @@ -1475,7 +1476,7 @@ static int __ip6_append_data(struct sock *sk, if (headersize + transhdrlen > mtu) goto emsgsize; - if (cork->length + length > mtu - headersize && ipc6->dontfrag && + if (cork->length + length > mtu - headersize && v6_cork->dontfrag && (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_ICMPV6 || sk->sk_protocol == IPPROTO_RAW)) { @@ -1855,7 +1856,7 @@ int ip6_append_data(struct sock *sk, return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork, &np->cork, sk_page_frag(sk), getfrag, - from, length, transhdrlen, flags, ipc6); + from, length, transhdrlen, flags); } EXPORT_SYMBOL_GPL(ip6_append_data); @@ -2058,7 +2059,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, err = __ip6_append_data(sk, &queue, cork, &v6_cork, ¤t->task_frag, getfrag, from, length + exthdrlen, transhdrlen + exthdrlen, - flags, ipc6); + flags); if (err) { __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork); return ERR_PTR(err); From 0922cb68edfde9e3920bb3aedea203d333af9f10 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 6 Mar 2025 22:34:10 -0500 Subject: [PATCH 3/3] selftests/net: expand cmsg_ip with MSG_MORE UDP send with MSG_MORE takes a slightly different path than the lockless fast path. For completeness, add coverage to this case too. Pass MSG_MORE on the initial sendmsg, then follow up with a zero byte write to unplug the cork. Unrelated: also add two missing endlines in usage(). Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250307033620.411611-4-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/cmsg_ip.sh | 11 +++++++---- tools/testing/selftests/net/cmsg_sender.c | 24 ++++++++++++++++++----- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/net/cmsg_ip.sh b/tools/testing/selftests/net/cmsg_ip.sh index 2a52520aca327..b55680e081ad7 100755 --- a/tools/testing/selftests/net/cmsg_ip.sh +++ b/tools/testing/selftests/net/cmsg_ip.sh @@ -50,8 +50,9 @@ check_result() { # IPV6_DONTFRAG for ovr in setsock cmsg both diff; do for df in 0 1; do - for p in u i r; do + for p in u U i r; do [ $p == "u" ] && prot=UDP + [ $p == "U" ] && prot=UDP [ $p == "i" ] && prot=ICMP [ $p == "r" ] && prot=RAW @@ -81,8 +82,9 @@ test_dscp() { ip $IPVER -netns $NS route add table 300 prohibit any for ovr in setsock cmsg both diff; do - for p in u i r; do + for p in u U i r; do [ $p == "u" ] && prot=UDP + [ $p == "U" ] && prot=UDP [ $p == "i" ] && prot=ICMP [ $p == "r" ] && prot=RAW @@ -134,8 +136,9 @@ test_ttl_hoplimit() { local -r LIM=4 for ovr in setsock cmsg both diff; do - for p in u i r; do + for p in u U i r; do [ $p == "u" ] && prot=UDP + [ $p == "U" ] && prot=UDP [ $p == "i" ] && prot=ICMP [ $p == "r" ] && prot=RAW @@ -166,7 +169,7 @@ test_ttl_hoplimit -4 $TGT4 ttl test_ttl_hoplimit -6 $TGT6 hlim # IPV6 exthdr -for p in u i r; do +for p in u U i r; do # Very basic "does it crash" test for h in h d r; do $NSEXE ./cmsg_sender -p $p -6 -H $h $TGT6 1234 diff --git a/tools/testing/selftests/net/cmsg_sender.c b/tools/testing/selftests/net/cmsg_sender.c index 19bd8499031bf..a825e628aee73 100644 --- a/tools/testing/selftests/net/cmsg_sender.c +++ b/tools/testing/selftests/net/cmsg_sender.c @@ -33,6 +33,7 @@ enum { ERN_RECVERR, ERN_CMSG_RD, ERN_CMSG_RCV, + ERN_SEND_MORE, }; struct option_cmsg_u32 { @@ -46,6 +47,7 @@ struct options { const char *service; unsigned int size; unsigned int num_pkt; + bool msg_more; struct { unsigned int mark; unsigned int dontfrag; @@ -94,7 +96,8 @@ static void __attribute__((noreturn)) cs_usage(const char *bin) "\t\t-S send() size\n" "\t\t-4/-6 Force IPv4 / IPv6 only\n" "\t\t-p prot Socket protocol\n" - "\t\t (u = UDP (default); i = ICMP; r = RAW)\n" + "\t\t (u = UDP (default); i = ICMP; r = RAW;\n" + "\t\t U = UDP with MSG_MORE)\n" "\n" "\t\t-m val Set SO_MARK with given value\n" "\t\t-M val Set SO_MARK via setsockopt\n" @@ -109,8 +112,8 @@ static void __attribute__((noreturn)) cs_usage(const char *bin) "\t\t-l val Set TTL/HOPLIMIT via cmsg\n" "\t\t-L val Set TTL/HOPLIMIT via setsockopt\n" "\t\t-H type Add an IPv6 header option\n" - "\t\t (h = HOP; d = DST; r = RTDST)" - ""); + "\t\t (h = HOP; d = DST; r = RTDST)\n" + "\n"); exit(ERN_HELP); } @@ -133,8 +136,11 @@ static void cs_parse_args(int argc, char *argv[]) opt.sock.family = AF_INET6; break; case 'p': - if (*optarg == 'u' || *optarg == 'U') { + if (*optarg == 'u') { opt.sock.proto = IPPROTO_UDP; + } else if (*optarg == 'U') { + opt.sock.proto = IPPROTO_UDP; + opt.msg_more = true; } else if (*optarg == 'i' || *optarg == 'I') { opt.sock.proto = IPPROTO_ICMP; } else if (*optarg == 'r') { @@ -531,7 +537,7 @@ int main(int argc, char *argv[]) cs_write_cmsg(fd, &msg, cbuf, sizeof(cbuf)); for (i = 0; i < opt.num_pkt; i++) { - err = sendmsg(fd, &msg, 0); + err = sendmsg(fd, &msg, opt.msg_more ? MSG_MORE : 0); if (err < 0) { if (!opt.silent_send) fprintf(stderr, "send failed: %s\n", strerror(errno)); @@ -542,6 +548,14 @@ int main(int argc, char *argv[]) err = ERN_SEND_SHORT; goto err_out; } + if (opt.msg_more) { + err = write(fd, NULL, 0); + if (err < 0) { + fprintf(stderr, "send more: %s\n", strerror(errno)); + err = ERN_SEND_MORE; + goto err_out; + } + } } err = ERN_SUCCESS;