From afcaf61be9d1dbdee5ec186d1dcc67b6b692180f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 10 Apr 2020 17:06:01 +0800 Subject: [PATCH 01/77] xfrm: allow to accept packets with ipv6 NEXTHDR_HOP in xfrm_input For beet mode, when it's ipv6 inner address with nexthdrs set, the packet format might be: ---------------------------------------------------- | outer | | dest | | | ESP | ESP | | IP hdr | ESP | opts.| TCP | Data | Trailer | ICV | ---------------------------------------------------- The nexthdr from ESP could be NEXTHDR_HOP(0), so it should continue processing the packet when nexthdr returns 0 in xfrm_input(). Otherwise, when ipv6 nexthdr is set, the packet will be dropped. I don't see any error cases that nexthdr may return 0. So fix it by removing the check for nexthdr == 0. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Xin Long Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index aa35f23c49129..8a202c44f89ae 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -644,7 +644,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) dev_put(skb->dev); spin_lock(&x->lock); - if (nexthdr <= 0) { + if (nexthdr < 0) { if (nexthdr == -EBADMSG) { xfrm_audit_state_icvfail(x, skb, x->type->proto); From 06a0afcfe2f551ff755849ea2549b0d8409fd9a0 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 10 Apr 2020 17:06:31 +0800 Subject: [PATCH 02/77] xfrm: do pskb_pull properly in __xfrm_transport_prep For transport mode, when ipv6 nexthdr is set, the packet format might be like: ---------------------------------------------------- | | dest | | | | ESP | ESP | | IP6 hdr| opts.| ESP | TCP | Data | Trailer | ICV | ---------------------------------------------------- and in __xfrm_transport_prep(): pskb_pull(skb, skb->mac_len + sizeof(ip6hdr) + x->props.header_len); it will pull the data pointer to the wrong position, as it missed the nexthdrs/dest opts. This patch is to fix it by using: pskb_pull(skb, skb_transport_offset(skb) + x->props.header_len); as we can be sure transport_header points to ESP header at that moment. It also fixes a panic when packets with ipv6 nexthdr are sent over esp6 transport mode: [ 100.473845] kernel BUG at net/core/skbuff.c:4325! [ 100.478517] RIP: 0010:__skb_to_sgvec+0x252/0x260 [ 100.494355] Call Trace: [ 100.494829] skb_to_sgvec+0x11/0x40 [ 100.495492] esp6_output_tail+0x12e/0x550 [esp6] [ 100.496358] esp6_xmit+0x1d5/0x260 [esp6_offload] [ 100.498029] validate_xmit_xfrm+0x22f/0x2e0 [ 100.499604] __dev_queue_xmit+0x589/0x910 [ 100.502928] ip6_finish_output2+0x2a5/0x5a0 [ 100.503718] ip6_output+0x6c/0x120 [ 100.505198] xfrm_output_resume+0x4bf/0x530 [ 100.508683] xfrm6_output+0x3a/0xc0 [ 100.513446] inet6_csk_xmit+0xa1/0xf0 [ 100.517335] tcp_sendmsg+0x27/0x40 [ 100.517977] sock_sendmsg+0x3e/0x60 [ 100.518648] __sys_sendto+0xee/0x160 Fixes: c35fe4106b92 ("xfrm: Add mode handlers for IPsec on layer 2") Signed-off-by: Xin Long Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_device.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 6cc7f7f1dd68c..f50d1f97cf8ec 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -25,12 +25,10 @@ static void __xfrm_transport_prep(struct xfrm_state *x, struct sk_buff *skb, struct xfrm_offload *xo = xfrm_offload(skb); skb_reset_mac_len(skb); - pskb_pull(skb, skb->mac_len + hsize + x->props.header_len); - - if (xo->flags & XFRM_GSO_SEGMENT) { - skb_reset_transport_header(skb); + if (xo->flags & XFRM_GSO_SEGMENT) skb->transport_header -= x->props.header_len; - } + + pskb_pull(skb, skb_transport_offset(skb) + x->props.header_len); } static void __xfrm_mode_tunnel_prep(struct xfrm_state *x, struct sk_buff *skb, From 3c96ec56828922e3fe5477f75eb3fc02f98f98b5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 10 Apr 2020 17:06:56 +0800 Subject: [PATCH 03/77] esp6: get the right proto for transport mode in esp6_gso_encap For transport mode, when ipv6 nexthdr is set, the packet format might be like: ---------------------------------------------------- | | dest | | | | ESP | ESP | | IP6 hdr| opts.| ESP | TCP | Data | Trailer | ICV | ---------------------------------------------------- What it wants to get for x-proto in esp6_gso_encap() is the proto that will be set in ESP nexthdr. So it should skip all ipv6 nexthdrs and get the real transport protocol. Othersize, the wrong proto number will be set into ESP nexthdr. This patch is to skip all ipv6 nexthdrs by calling ipv6_skip_exthdr() in esp6_gso_encap(). Fixes: 7862b4058b9f ("esp: Add gso handlers for esp4 and esp6") Signed-off-by: Xin Long Signed-off-by: Steffen Klassert --- net/ipv6/esp6_offload.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 8eab2c869d615..b828508865743 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -123,9 +123,16 @@ static void esp6_gso_encap(struct xfrm_state *x, struct sk_buff *skb) struct ip_esp_hdr *esph; struct ipv6hdr *iph = ipv6_hdr(skb); struct xfrm_offload *xo = xfrm_offload(skb); - int proto = iph->nexthdr; + u8 proto = iph->nexthdr; skb_push(skb, -skb_network_offset(skb)); + + if (x->outer_mode.encap == XFRM_MODE_TRANSPORT) { + __be16 frag; + + ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto, &frag); + } + esph = ip_esp_hdr(skb); *skb_mac_header(skb) = IPPROTO_ESP; From db87668ad1e4917cfe04e217307ba6ed9390716e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 10 Apr 2020 17:08:24 +0800 Subject: [PATCH 04/77] xfrm: remove the xfrm_state_put call becofe going to out_reset This xfrm_state_put call in esp4/6_gro_receive() will cause double put for state, as in out_reset path secpath_reset() will put all states set in skb sec_path. So fix it by simply remove the xfrm_state_put call. Fixes: 6ed69184ed9c ("xfrm: Reset secpath in xfrm failure") Signed-off-by: Xin Long Signed-off-by: Steffen Klassert --- net/ipv4/esp4_offload.c | 4 +--- net/ipv6/esp6_offload.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 731022cff6006..231edcb84c081 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -63,10 +63,8 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, sp->olen++; xo = xfrm_offload(skb); - if (!xo) { - xfrm_state_put(x); + if (!xo) goto out_reset; - } } xo->flags |= XFRM_GRO; diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index b828508865743..b92372b104ba9 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -85,10 +85,8 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, sp->olen++; xo = xfrm_offload(skb); - if (!xo) { - xfrm_state_put(x); + if (!xo) goto out_reset; - } } xo->flags |= XFRM_GRO; From 29e4276667e24ee6b91d9f91064d8fda9a210ea1 Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Wed, 15 Apr 2020 21:47:10 +0200 Subject: [PATCH 05/77] xfrm: fix error in comment s/xfrm_state_offload/xfrm_user_offload/ Fixes: d77e38e612a ("xfrm: Add an IPsec hardware offloading API") Signed-off-by: Antony Antony Signed-off-by: Steffen Klassert --- include/uapi/linux/xfrm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 5f3b9fec7b5f4..ff7cfdc6cb44d 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -304,7 +304,7 @@ enum xfrm_attr_type_t { XFRMA_PROTO, /* __u8 */ XFRMA_ADDRESS_FILTER, /* struct xfrm_address_filter */ XFRMA_PAD, - XFRMA_OFFLOAD_DEV, /* struct xfrm_state_offload */ + XFRMA_OFFLOAD_DEV, /* struct xfrm_user_offload */ XFRMA_SET_MARK, /* __u32 */ XFRMA_SET_MARK_MASK, /* __u32 */ XFRMA_IF_ID, /* __u32 */ From 9f0cadc32d738f0f0c8e30be83be7087c7b85ee5 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 16 Apr 2020 17:45:44 +0200 Subject: [PATCH 06/77] xfrm: espintcp: save and call old ->sk_destruct When ESP encapsulation is enabled on a TCP socket, I'm replacing the existing ->sk_destruct callback with espintcp_destruct. We still need to call the old callback to perform the other cleanups when the socket is destroyed. Save the old callback, and call it from espintcp_destruct. Fixes: e27cca96cd68 ("xfrm: add espintcp (RFC 8229)") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/espintcp.h | 1 + net/xfrm/espintcp.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/include/net/espintcp.h b/include/net/espintcp.h index dd7026a000660..0335bbd76552a 100644 --- a/include/net/espintcp.h +++ b/include/net/espintcp.h @@ -25,6 +25,7 @@ struct espintcp_ctx { struct espintcp_msg partial; void (*saved_data_ready)(struct sock *sk); void (*saved_write_space)(struct sock *sk); + void (*saved_destruct)(struct sock *sk); struct work_struct work; bool tx_running; }; diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index 037ea156d2f93..5a0ff665b71a8 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -379,6 +379,7 @@ static void espintcp_destruct(struct sock *sk) { struct espintcp_ctx *ctx = espintcp_getctx(sk); + ctx->saved_destruct(sk); kfree(ctx); } @@ -419,6 +420,7 @@ static int espintcp_init_sk(struct sock *sk) sk->sk_socket->ops = &espintcp_ops; ctx->saved_data_ready = sk->sk_data_ready; ctx->saved_write_space = sk->sk_write_space; + ctx->saved_destruct = sk->sk_destruct; sk->sk_data_ready = espintcp_data_ready; sk->sk_write_space = espintcp_write_space; sk->sk_destruct = espintcp_destruct; From 25a44ae93d1a490f36d88a180f11aa2650bef074 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 19 Apr 2020 16:10:00 +0800 Subject: [PATCH 07/77] esp6: support ipv6 nexthdrs process for beet gso segment For beet mode, when it's ipv6 inner address with nexthdrs set, the packet format might be: ---------------------------------------------------- | outer | | dest | | | ESP | ESP | | IP6 hdr| ESP | opts.| TCP | Data | Trailer | ICV | ---------------------------------------------------- Before doing gso segment in xfrm6_beet_gso_segment(), it should skip all nexthdrs and get the real transport proto, and set transport_header properly. This patch is to fix it by simply calling ipv6_skip_exthdr() in xfrm6_beet_gso_segment(). v1->v2: - remove skb_transport_offset(), as it will always return 0 in xfrm6_beet_gso_segment(), thank Sabrina's check. Fixes: 7f9e40eb18a9 ("esp6: add gso_segment for esp6 beet mode") Signed-off-by: Xin Long Signed-off-by: Steffen Klassert --- net/ipv6/esp6_offload.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index b92372b104ba9..9c03460b2760a 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -171,7 +171,7 @@ static struct sk_buff *xfrm6_beet_gso_segment(struct xfrm_state *x, struct xfrm_offload *xo = xfrm_offload(skb); struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; - int proto = xo->proto; + u8 proto = xo->proto; skb->transport_header += x->props.header_len; @@ -182,7 +182,12 @@ static struct sk_buff *xfrm6_beet_gso_segment(struct xfrm_state *x, proto = ph->nexthdr; } - if (x->sel.family != AF_INET6) { + if (x->sel.family == AF_INET6) { + __be16 frag; + + skb->transport_header += + ipv6_skip_exthdr(skb, 0, &proto, &frag); + } else { skb->transport_header -= (sizeof(struct ipv6hdr) - sizeof(struct iphdr)); From 6f297068a0696eafecef13852408b05a81edb560 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 19 Apr 2020 16:11:02 +0800 Subject: [PATCH 08/77] esp4: support ipv6 nexthdrs process for beet gso segment For beet mode, when it's ipv6 inner address with nexthdrs set, the packet format might be: ---------------------------------------------------- | outer | | dest | | | ESP | ESP | | IP hdr | ESP | opts.| TCP | Data | Trailer | ICV | ---------------------------------------------------- Before doing gso segment in xfrm4_beet_gso_segment(), the same thing is needed as it does in xfrm6_beet_gso_segment() in last patch 'esp6: support ipv6 nexthdrs process for beet gso segment'. v1->v2: - remove skb_transport_offset(), as it will always return 0 in xfrm6_beet_gso_segment(), thank Sabrina's check. Fixes: 384a46ea7bdc ("esp4: add gso_segment for esp4 beet mode") Signed-off-by: Xin Long Signed-off-by: Steffen Klassert --- net/ipv4/esp4_offload.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 231edcb84c081..9b1d451edae0b 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -137,7 +137,7 @@ static struct sk_buff *xfrm4_beet_gso_segment(struct xfrm_state *x, struct xfrm_offload *xo = xfrm_offload(skb); struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; - int proto = xo->proto; + u8 proto = xo->proto; skb->transport_header += x->props.header_len; @@ -146,10 +146,15 @@ static struct sk_buff *xfrm4_beet_gso_segment(struct xfrm_state *x, skb->transport_header += ph->hdrlen * 8; proto = ph->nexthdr; - } else if (x->sel.family != AF_INET6) { + } else if (x->sel.family == AF_INET6) { + __be16 frag; + + skb->transport_header += + ipv6_skip_exthdr(skb, 0, &proto, &frag); + if (proto == IPPROTO_TCP) + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; + } else { skb->transport_header -= IPV4_BEET_PHMAXLEN; - } else if (proto == IPPROTO_TCP) { - skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; } __skb_pull(skb, skb_transport_offset(skb)); From a204aef9fd77dce1efd9066ca4e44eede99cd858 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 20 Apr 2020 21:51:09 +0800 Subject: [PATCH 09/77] xfrm: call xfrm_output_gso when inner_protocol is set in xfrm_output An use-after-free crash can be triggered when sending big packets over vxlan over esp with esp offload enabled: [] BUG: KASAN: use-after-free in ipv6_gso_pull_exthdrs.part.8+0x32c/0x4e0 [] Call Trace: [] dump_stack+0x75/0xa0 [] kasan_report+0x37/0x50 [] ipv6_gso_pull_exthdrs.part.8+0x32c/0x4e0 [] ipv6_gso_segment+0x2c8/0x13c0 [] skb_mac_gso_segment+0x1cb/0x420 [] skb_udp_tunnel_segment+0x6b5/0x1c90 [] inet_gso_segment+0x440/0x1380 [] skb_mac_gso_segment+0x1cb/0x420 [] esp4_gso_segment+0xae8/0x1709 [esp4_offload] [] inet_gso_segment+0x440/0x1380 [] skb_mac_gso_segment+0x1cb/0x420 [] __skb_gso_segment+0x2d7/0x5f0 [] validate_xmit_skb+0x527/0xb10 [] __dev_queue_xmit+0x10f8/0x2320 <--- [] ip_finish_output2+0xa2e/0x1b50 [] ip_output+0x1a8/0x2f0 [] xfrm_output_resume+0x110e/0x15f0 [] __xfrm4_output+0xe1/0x1b0 [] xfrm4_output+0xa0/0x200 [] iptunnel_xmit+0x5a7/0x920 [] vxlan_xmit_one+0x1658/0x37a0 [vxlan] [] vxlan_xmit+0x5e4/0x3ec8 [vxlan] [] dev_hard_start_xmit+0x125/0x540 [] __dev_queue_xmit+0x17bd/0x2320 <--- [] ip6_finish_output2+0xb20/0x1b80 [] ip6_output+0x1b3/0x390 [] ip6_xmit+0xb82/0x17e0 [] inet6_csk_xmit+0x225/0x3d0 [] __tcp_transmit_skb+0x1763/0x3520 [] tcp_write_xmit+0xd64/0x5fe0 [] __tcp_push_pending_frames+0x8c/0x320 [] tcp_sendmsg_locked+0x2245/0x3500 [] tcp_sendmsg+0x27/0x40 As on the tx path of vxlan over esp, skb->inner_network_header would be set on vxlan_xmit() and xfrm4_tunnel_encap_add(), and the later one can overwrite the former one. It causes skb_udp_tunnel_segment() to use a wrong skb->inner_network_header, then the issue occurs. This patch is to fix it by calling xfrm_output_gso() instead when the inner_protocol is set, in which gso_segment of inner_protocol will be done first. While at it, also improve some code around. Fixes: 7862b4058b9f ("esp: Add gso handlers for esp4 and esp6") Reported-by: Xiumei Mu Signed-off-by: Xin Long Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_output.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 2fd3d990d992f..69c33cab8f49a 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -583,18 +583,20 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) xfrm_state_hold(x); if (skb_is_gso(skb)) { - skb_shinfo(skb)->gso_type |= SKB_GSO_ESP; + if (skb->inner_protocol) + return xfrm_output_gso(net, sk, skb); - return xfrm_output2(net, sk, skb); + skb_shinfo(skb)->gso_type |= SKB_GSO_ESP; + goto out; } if (x->xso.dev && x->xso.dev->features & NETIF_F_HW_ESP_TX_CSUM) goto out; + } else { + if (skb_is_gso(skb)) + return xfrm_output_gso(net, sk, skb); } - if (skb_is_gso(skb)) - return xfrm_output_gso(net, sk, skb); - if (skb->ip_summed == CHECKSUM_PARTIAL) { err = skb_checksum_help(skb); if (err) { From 976eba8ab596bab94b9714cd46d38d5c6a2c660d Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 21 Apr 2020 20:46:11 +0800 Subject: [PATCH 10/77] ip_vti: receive ipip packet by calling ip_tunnel_rcv In Commit dd9ee3444014 ("vti4: Fix a ipip packet processing bug in 'IPCOMP' virtual tunnel"), it tries to receive IPIP packets in vti by calling xfrm_input(). This case happens when a small packet or frag sent by peer is too small to get compressed. However, xfrm_input() will still get to the IPCOMP path where skb sec_path is set, but never dropped while it should have been done in vti_ipcomp4_protocol.cb_handler(vti_rcv_cb), as it's not an ipcomp4 packet. This will cause that the packet can never pass xfrm4_policy_check() in the upper protocol rcv functions. So this patch is to call ip_tunnel_rcv() to process IPIP packets instead. Fixes: dd9ee3444014 ("vti4: Fix a ipip packet processing bug in 'IPCOMP' virtual tunnel") Reported-by: Xiumei Mu Signed-off-by: Xin Long Signed-off-by: Steffen Klassert --- net/ipv4/ip_vti.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 1b4e6f298648d..1dda7c155c484 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -93,7 +93,28 @@ static int vti_rcv_proto(struct sk_buff *skb) static int vti_rcv_tunnel(struct sk_buff *skb) { - return vti_rcv(skb, ip_hdr(skb)->saddr, true); + struct ip_tunnel_net *itn = net_generic(dev_net(skb->dev), vti_net_id); + const struct iphdr *iph = ip_hdr(skb); + struct ip_tunnel *tunnel; + + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + iph->saddr, iph->daddr, 0); + if (tunnel) { + struct tnl_ptk_info tpi = { + .proto = htons(ETH_P_IP), + }; + + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto drop; + if (iptunnel_pull_header(skb, 0, tpi.proto, false)) + goto drop; + return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, false); + } + + return -EINVAL; +drop: + kfree_skb(skb); + return 0; } static int vti_rcv_cb(struct sk_buff *skb, int err) From c95c5f58b35ef995f66cb55547eee6093ab5fcb8 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 23 Apr 2020 00:06:45 +0200 Subject: [PATCH 11/77] xfrm interface: fix oops when deleting a x-netns interface Here is the steps to reproduce the problem: ip netns add foo ip netns add bar ip -n foo link add xfrmi0 type xfrm dev lo if_id 42 ip -n foo link set xfrmi0 netns bar ip netns del foo ip netns del bar Which results to: [ 186.686395] general protection fault, probably for non-canonical address 0x6b6b6b6b6b6b6bd3: 0000 [#1] SMP PTI [ 186.687665] CPU: 7 PID: 232 Comm: kworker/u16:2 Not tainted 5.6.0+ #1 [ 186.688430] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 [ 186.689420] Workqueue: netns cleanup_net [ 186.689903] RIP: 0010:xfrmi_dev_uninit+0x1b/0x4b [xfrm_interface] [ 186.690657] Code: 44 f6 ff ff 31 c0 5b 5d 41 5c 41 5d 41 5e c3 48 8d 8f c0 08 00 00 8b 05 ce 14 00 00 48 8b 97 d0 08 00 00 48 8b 92 c0 0e 00 00 <48> 8b 14 c2 48 8b 02 48 85 c0 74 19 48 39 c1 75 0c 48 8b 87 c0 08 [ 186.692838] RSP: 0018:ffffc900003b7d68 EFLAGS: 00010286 [ 186.693435] RAX: 000000000000000d RBX: ffff8881b0f31000 RCX: ffff8881b0f318c0 [ 186.694334] RDX: 6b6b6b6b6b6b6b6b RSI: 0000000000000246 RDI: ffff8881b0f31000 [ 186.695190] RBP: ffffc900003b7df0 R08: ffff888236c07740 R09: 0000000000000040 [ 186.696024] R10: ffffffff81fce1b8 R11: 0000000000000002 R12: ffffc900003b7d80 [ 186.696859] R13: ffff8881edcc6a40 R14: ffff8881a1b6e780 R15: ffffffff81ed47c8 [ 186.697738] FS: 0000000000000000(0000) GS:ffff888237dc0000(0000) knlGS:0000000000000000 [ 186.698705] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 186.699408] CR2: 00007f2129e93148 CR3: 0000000001e0a000 CR4: 00000000000006e0 [ 186.700221] Call Trace: [ 186.700508] rollback_registered_many+0x32b/0x3fd [ 186.701058] ? __rtnl_unlock+0x20/0x3d [ 186.701494] ? arch_local_irq_save+0x11/0x17 [ 186.702012] unregister_netdevice_many+0x12/0x55 [ 186.702594] default_device_exit_batch+0x12b/0x150 [ 186.703160] ? prepare_to_wait_exclusive+0x60/0x60 [ 186.703719] cleanup_net+0x17d/0x234 [ 186.704138] process_one_work+0x196/0x2e8 [ 186.704652] worker_thread+0x1a4/0x249 [ 186.705087] ? cancel_delayed_work+0x92/0x92 [ 186.705620] kthread+0x105/0x10f [ 186.706000] ? __kthread_bind_mask+0x57/0x57 [ 186.706501] ret_from_fork+0x35/0x40 [ 186.706978] Modules linked in: xfrm_interface nfsv3 nfs_acl auth_rpcgss nfsv4 nfs lockd grace fscache sunrpc button parport_pc parport serio_raw evdev pcspkr loop ext4 crc16 mbcache jbd2 crc32c_generic 8139too ide_cd_mod cdrom ide_gd_mod ata_generic ata_piix libata scsi_mod piix psmouse i2c_piix4 ide_core 8139cp i2c_core mii floppy [ 186.710423] ---[ end trace 463bba18105537e5 ]--- The problem is that x-netns xfrm interface are not removed when the link netns is removed. This causes later this oops when thoses interfaces are removed. Let's add a handler to remove all interfaces related to a netns when this netns is removed. Fixes: f203b76d7809 ("xfrm: Add virtual xfrm interfaces") Reported-by: Christophe Gouault Signed-off-by: Nicolas Dichtel Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_interface.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 3361e3ac5714c..1e115cbf21d3b 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -750,7 +750,28 @@ static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { .get_link_net = xfrmi_get_link_net, }; +static void __net_exit xfrmi_exit_batch_net(struct list_head *net_exit_list) +{ + struct net *net; + LIST_HEAD(list); + + rtnl_lock(); + list_for_each_entry(net, net_exit_list, exit_list) { + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + struct xfrm_if __rcu **xip; + struct xfrm_if *xi; + + for (xip = &xfrmn->xfrmi[0]; + (xi = rtnl_dereference(*xip)) != NULL; + xip = &xi->next) + unregister_netdevice_queue(xi->dev, &list); + } + unregister_netdevice_many(&list); + rtnl_unlock(); +} + static struct pernet_operations xfrmi_net_ops = { + .exit_batch = xfrmi_exit_batch_net, .id = &xfrmi_net_id, .size = sizeof(struct xfrmi_net), }; From 56b1b7c667fbb9ee395f7506dfef3c04571e024a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 13 May 2020 18:38:54 +0800 Subject: [PATCH 12/77] esp6: calculate transport_header correctly when sel.family != AF_INET6 In esp6_init_state() for beet mode when x->sel.family != AF_INET6: x->props.header_len = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead) + IPV4_BEET_PHMAXLEN + (sizeof(struct ipv6hdr) - sizeof(struct iphdr)) In xfrm6_beet_gso_segment() skb->transport_header is supposed to move to the end of the ph header for IPPROTO_BEETPH, so if x->sel.family != AF_INET6 and it's IPPROTO_BEETPH, it should do: skb->transport_header -= (sizeof(struct ipv6hdr) - sizeof(struct iphdr)); skb->transport_header += ph->hdrlen * 8; And IPV4_BEET_PHMAXLEN is only reserved for PH header, so if x->sel.family != AF_INET6 and it's not IPPROTO_BEETPH, it should do: skb->transport_header -= (sizeof(struct ipv6hdr) - sizeof(struct iphdr)); skb->transport_header -= IPV4_BEET_PHMAXLEN; Thanks Sabrina for looking deep into this issue. Fixes: 7f9e40eb18a9 ("esp6: add gso_segment for esp6 beet mode") Reported-by: Sabrina Dubroca Signed-off-by: Xin Long Signed-off-by: Steffen Klassert --- net/ipv6/esp6_offload.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 9c03460b2760a..ab0eea336c70d 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -175,24 +175,27 @@ static struct sk_buff *xfrm6_beet_gso_segment(struct xfrm_state *x, skb->transport_header += x->props.header_len; - if (proto == IPPROTO_BEETPH) { - struct ip_beet_phdr *ph = (struct ip_beet_phdr *)skb->data; + if (x->sel.family != AF_INET6) { + skb->transport_header -= + (sizeof(struct ipv6hdr) - sizeof(struct iphdr)); - skb->transport_header += ph->hdrlen * 8; - proto = ph->nexthdr; - } + if (proto == IPPROTO_BEETPH) { + struct ip_beet_phdr *ph = + (struct ip_beet_phdr *)skb->data; + + skb->transport_header += ph->hdrlen * 8; + proto = ph->nexthdr; + } else { + skb->transport_header -= IPV4_BEET_PHMAXLEN; + } - if (x->sel.family == AF_INET6) { + if (proto == IPPROTO_TCP) + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; + } else { __be16 frag; skb->transport_header += ipv6_skip_exthdr(skb, 0, &proto, &frag); - } else { - skb->transport_header -= - (sizeof(struct ipv6hdr) - sizeof(struct iphdr)); - - if (proto == IPPROTO_TCP) - skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; } __skb_pull(skb, skb_transport_offset(skb)); From 3ffb93ba326f40b47b17a4e8b3399c0fa2e8cee6 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 18 May 2020 13:35:19 +0800 Subject: [PATCH 13/77] esp4: improve xfrm4_beet_gso_segment() to be more readable This patch is to improve the code to make xfrm4_beet_gso_segment() more readable, and keep consistent with xfrm6_beet_gso_segment(). Signed-off-by: Xin Long Signed-off-by: Steffen Klassert --- net/ipv4/esp4_offload.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 9b1d451edae0b..d14133eac4763 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -141,20 +141,23 @@ static struct sk_buff *xfrm4_beet_gso_segment(struct xfrm_state *x, skb->transport_header += x->props.header_len; - if (proto == IPPROTO_BEETPH) { - struct ip_beet_phdr *ph = (struct ip_beet_phdr *)skb->data; - - skb->transport_header += ph->hdrlen * 8; - proto = ph->nexthdr; - } else if (x->sel.family == AF_INET6) { + if (x->sel.family != AF_INET6) { + if (proto == IPPROTO_BEETPH) { + struct ip_beet_phdr *ph = + (struct ip_beet_phdr *)skb->data; + + skb->transport_header += ph->hdrlen * 8; + proto = ph->nexthdr; + } else { + skb->transport_header -= IPV4_BEET_PHMAXLEN; + } + } else { __be16 frag; skb->transport_header += ipv6_skip_exthdr(skb, 0, &proto, &frag); if (proto == IPPROTO_TCP) skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; - } else { - skb->transport_header -= IPV4_BEET_PHMAXLEN; } __skb_pull(skb, skb_transport_offset(skb)); From e2d4a80f93fcfaf72e2e20daf6a28e39c3b90677 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Fri, 22 May 2020 19:04:13 +0200 Subject: [PATCH 14/77] mac80211: mesh: fix discovery timer re-arming issue / crash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On a non-forwarding 802.11s link between two fairly busy neighboring nodes (iperf with -P 16 at ~850MBit/s TCP; 1733.3 MBit/s VHT-MCS 9 80MHz short GI VHT-NSS 4), so with frequent PREQ retries, usually after around 30-40 seconds the following crash would occur: [ 1110.822428] Unable to handle kernel read from unreadable memory at virtual address 00000000 [ 1110.830786] Mem abort info: [ 1110.833573] Exception class = IABT (current EL), IL = 32 bits [ 1110.839494] SET = 0, FnV = 0 [ 1110.842546] EA = 0, S1PTW = 0 [ 1110.845678] user pgtable: 4k pages, 48-bit VAs, pgd = ffff800076386000 [ 1110.852204] [0000000000000000] *pgd=00000000f6322003, *pud=00000000f62de003, *pmd=0000000000000000 [ 1110.861167] Internal error: Oops: 86000004 [#1] PREEMPT SMP [ 1110.866730] Modules linked in: pppoe ppp_async batman_adv ath10k_pci ath10k_core ath pppox ppp_generic nf_conntrack_ipv6 mac80211 iptable_nat ipt_REJECT ipt_MASQUERADE cfg80211 xt_time xt_tcpudp xt_state xt_nat xt_multiport xt_mark xt_mac xt_limit xt_conntrack xt_comment xt_TCPMSS xt_REDIRECT xt_LOG xt_FLOWOFFLOAD slhc nf_reject_ipv4 nf_nat_redirect nf_nat_masquerade_ipv4 nf_conntrack_ipv4 nf_nat_ipv4 nf_nat nf_log_ipv4 nf_flow_table_hw nf_flow_table nf_defrag_ipv6 nf_defrag_ipv4 nf_conntrack_rtcache nf_conntrack iptable_mangle iptable_filter ip_tables crc_ccitt compat nf_log_ipv6 nf_log_common ip6table_mangle ip6table_filter ip6_tables ip6t_REJECT x_tables nf_reject_ipv6 usb_storage xhci_plat_hcd xhci_pci xhci_hcd dwc3 usbcore usb_common [ 1110.932190] Process swapper/3 (pid: 0, stack limit = 0xffff0000090c8000) [ 1110.938884] CPU: 3 PID: 0 Comm: swapper/3 Not tainted 4.14.162 #0 [ 1110.944965] Hardware name: LS1043A RGW Board (DT) [ 1110.949658] task: ffff8000787a81c0 task.stack: ffff0000090c8000 [ 1110.955568] PC is at 0x0 [ 1110.958097] LR is at call_timer_fn.isra.27+0x24/0x78 [ 1110.963055] pc : [<0000000000000000>] lr : [] pstate: 00400145 [ 1110.970440] sp : ffff00000801be10 [ 1110.973744] x29: ffff00000801be10 x28: ffff000008bf7018 [ 1110.979047] x27: ffff000008bf87c8 x26: ffff000008c160c0 [ 1110.984352] x25: 0000000000000000 x24: 0000000000000000 [ 1110.989657] x23: dead000000000200 x22: 0000000000000000 [ 1110.994959] x21: 0000000000000000 x20: 0000000000000101 [ 1111.000262] x19: ffff8000787a81c0 x18: 0000000000000000 [ 1111.005565] x17: ffff0000089167b0 x16: 0000000000000058 [ 1111.010868] x15: ffff0000089167b0 x14: 0000000000000000 [ 1111.016172] x13: ffff000008916788 x12: 0000000000000040 [ 1111.021475] x11: ffff80007fda9af0 x10: 0000000000000001 [ 1111.026777] x9 : ffff00000801bea0 x8 : 0000000000000004 [ 1111.032080] x7 : 0000000000000000 x6 : ffff80007fda9aa8 [ 1111.037383] x5 : ffff00000801bea0 x4 : 0000000000000010 [ 1111.042685] x3 : ffff00000801be98 x2 : 0000000000000614 [ 1111.047988] x1 : 0000000000000000 x0 : 0000000000000000 [ 1111.053290] Call trace: [ 1111.055728] Exception stack(0xffff00000801bcd0 to 0xffff00000801be10) [ 1111.062158] bcc0: 0000000000000000 0000000000000000 [ 1111.069978] bce0: 0000000000000614 ffff00000801be98 0000000000000010 ffff00000801bea0 [ 1111.077798] bd00: ffff80007fda9aa8 0000000000000000 0000000000000004 ffff00000801bea0 [ 1111.085618] bd20: 0000000000000001 ffff80007fda9af0 0000000000000040 ffff000008916788 [ 1111.093437] bd40: 0000000000000000 ffff0000089167b0 0000000000000058 ffff0000089167b0 [ 1111.101256] bd60: 0000000000000000 ffff8000787a81c0 0000000000000101 0000000000000000 [ 1111.109075] bd80: 0000000000000000 dead000000000200 0000000000000000 0000000000000000 [ 1111.116895] bda0: ffff000008c160c0 ffff000008bf87c8 ffff000008bf7018 ffff00000801be10 [ 1111.124715] bdc0: ffff0000080ff29c ffff00000801be10 0000000000000000 0000000000400145 [ 1111.132534] bde0: ffff8000787a81c0 ffff00000801bde8 0000ffffffffffff 000001029eb19be8 [ 1111.140353] be00: ffff00000801be10 0000000000000000 [ 1111.145220] [< (null)>] (null) [ 1111.149917] [] run_timer_softirq+0x184/0x398 [ 1111.155741] [] __do_softirq+0x100/0x1fc [ 1111.161130] [] irq_exit+0x80/0xd8 [ 1111.166002] [] __handle_domain_irq+0x88/0xb0 [ 1111.171825] [] gic_handle_irq+0x68/0xb0 [ 1111.177213] Exception stack(0xffff0000090cbe30 to 0xffff0000090cbf70) [ 1111.183642] be20: 0000000000000020 0000000000000000 [ 1111.191461] be40: 0000000000000001 0000000000000000 00008000771af000 0000000000000000 [ 1111.199281] be60: ffff000008c95180 0000000000000000 ffff000008c19360 ffff0000090cbef0 [ 1111.207101] be80: 0000000000000810 0000000000000400 0000000000000098 ffff000000000000 [ 1111.214920] bea0: 0000000000000001 ffff0000089167b0 0000000000000000 ffff0000089167b0 [ 1111.222740] bec0: 0000000000000000 ffff000008c198e8 ffff000008bf7018 ffff000008c19000 [ 1111.230559] bee0: 0000000000000000 0000000000000000 ffff8000787a81c0 ffff000008018000 [ 1111.238380] bf00: ffff00000801c000 ffff00000913ba34 ffff8000787a81c0 ffff0000090cbf70 [ 1111.246199] bf20: ffff0000080857cc ffff0000090cbf70 ffff0000080857d0 0000000000400145 [ 1111.254020] bf40: ffff000008018000 ffff00000801c000 ffffffffffffffff ffff0000080fa574 [ 1111.261838] bf60: ffff0000090cbf70 ffff0000080857d0 [ 1111.266706] [] el1_irq+0xe8/0x18c [ 1111.271576] [] arch_cpu_idle+0x10/0x18 [ 1111.276880] [] do_idle+0xec/0x1b8 [ 1111.281748] [] cpu_startup_entry+0x20/0x28 [ 1111.287399] [] secondary_start_kernel+0x104/0x110 [ 1111.293662] Code: bad PC value [ 1111.296710] ---[ end trace 555b6ca4363c3edd ]--- [ 1111.301318] Kernel panic - not syncing: Fatal exception in interrupt [ 1111.307661] SMP: stopping secondary CPUs [ 1111.311574] Kernel Offset: disabled [ 1111.315053] CPU features: 0x0002000 [ 1111.318530] Memory Limit: none [ 1111.321575] Rebooting in 3 seconds.. With some added debug output / delays we were able to push the crash from the timer callback runner into the callback function and by that shedding some light on which object holding the timer gets corrupted: [ 401.720899] Unable to handle kernel read from unreadable memory at virtual address 00000868 [...] [ 402.335836] [] _raw_spin_lock_bh+0x14/0x48 [ 402.341548] [] mesh_path_timer+0x10c/0x248 [mac80211] [ 402.348154] [] call_timer_fn.isra.27+0x24/0x78 [ 402.354150] [] run_timer_softirq+0x184/0x398 [ 402.359974] [] __do_softirq+0x100/0x1fc [ 402.365362] [] irq_exit+0x80/0xd8 [ 402.370231] [] __handle_domain_irq+0x88/0xb0 [ 402.376053] [] gic_handle_irq+0x68/0xb0 The issue happens due to the following sequence of events: 1) mesh_path_start_discovery(): -> spin_unlock_bh(&mpath->state_lock) before mesh_path_sel_frame_tx() 2) mesh_path_free_rcu() -> del_timer_sync(&mpath->timer) [...] -> kfree_rcu(mpath) 3) mesh_path_start_discovery(): -> mod_timer(&mpath->timer, ...) [...] -> rcu_read_unlock() 4) mesh_path_free_rcu()'s kfree_rcu(): -> kfree(mpath) 5) mesh_path_timer() starts after timeout, using freed mpath object So a use-after-free issue due to a timer re-arming bug caused by an early spin-unlocking. This patch fixes this issue by re-checking if mpath is about to be free'd and if so bails out of re-arming the timer. Cc: stable@vger.kernel.org Fixes: 050ac52cbe1f ("mac80211: code for on-demand Hybrid Wireless Mesh Protocol") Cc: Simon Wunderlich Signed-off-by: Linus Lüssing Link: https://lore.kernel.org/r/20200522170413.14973-1-linus.luessing@c0d3.blue Signed-off-by: Johannes Berg --- net/mac80211/mesh_hwmp.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 38a0383dfbcfa..aa5150929996d 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1103,7 +1103,14 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata) mesh_path_sel_frame_tx(MPATH_PREQ, 0, sdata->vif.addr, ifmsh->sn, target_flags, mpath->dst, mpath->sn, da, 0, ttl, lifetime, 0, ifmsh->preq_id++, sdata); + + spin_lock_bh(&mpath->state_lock); + if (mpath->flags & MESH_PATH_DELETED) { + spin_unlock_bh(&mpath->state_lock); + goto enddiscovery; + } mod_timer(&mpath->timer, jiffies + mpath->discovery_timeout); + spin_unlock_bh(&mpath->state_lock); enddiscovery: rcu_read_unlock(); From d031781bdabe1027858a3220f868866586bf6e7c Mon Sep 17 00:00:00 2001 From: Pradeep Kumar Chitrapu Date: Wed, 6 May 2020 03:24:30 -0700 Subject: [PATCH 15/77] ieee80211: Fix incorrect mask for default PE duration Fixes bitmask for HE opration's default PE duration. Fixes: daa5b83513a7 ("mac80211: update HE operation fields to D3.0") Signed-off-by: Pradeep Kumar Chitrapu Link: https://lore.kernel.org/r/20200506102430.5153-1-pradeepc@codeaurora.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 16268ef1cbccc..5d3e48d020339 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2047,7 +2047,7 @@ ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info) } /* HE Operation defines */ -#define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK 0x00000003 +#define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK 0x00000007 #define IEEE80211_HE_OPERATION_TWT_REQUIRED 0x00000008 #define IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK 0x00003ff0 #define IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET 4 From 0bbab5f0301587cad4e923ccc49bb910db86162c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 25 May 2020 11:38:17 +0200 Subject: [PATCH 16/77] cfg80211: fix debugfs rename crash Removing the "if (IS_ERR(dir)) dir = NULL;" check only works if we adjust the remaining code to not rely on it being NULL. Check IS_ERR_OR_NULL() before attempting to dereference it. I'm not actually entirely sure this fixes the syzbot crash as the kernel config indicates that they do have DEBUG_FS in the kernel, but this is what I found when looking there. Cc: stable@vger.kernel.org Fixes: d82574a8e5a4 ("cfg80211: no need to check return value of debugfs_create functions") Reported-by: syzbot+fd5332e429401bf42d18@syzkaller.appspotmail.com Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20200525113816.fc4da3ec3d4b.Ica63a110679819eaa9fb3bc1b7437d96b1fd187d@changeid Signed-off-by: Johannes Berg --- net/wireless/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/wireless/core.c b/net/wireless/core.c index 341402b4f1788..ce024440fa51f 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -142,7 +142,7 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, if (result) return result; - if (rdev->wiphy.debugfsdir) + if (!IS_ERR_OR_NULL(rdev->wiphy.debugfsdir)) debugfs_rename(rdev->wiphy.debugfsdir->d_parent, rdev->wiphy.debugfsdir, rdev->wiphy.debugfsdir->d_parent, newname); From ed17b8d377eaf6b4a01d46942b4c647378a79bdd Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 25 May 2020 13:53:37 +0800 Subject: [PATCH 17/77] xfrm: fix a warning in xfrm_policy_insert_list This waring can be triggered simply by: # ip xfrm policy update src 192.168.1.1/24 dst 192.168.1.2/24 dir in \ priority 1 mark 0 mask 0x10 #[1] # ip xfrm policy update src 192.168.1.1/24 dst 192.168.1.2/24 dir in \ priority 2 mark 0 mask 0x1 #[2] # ip xfrm policy update src 192.168.1.1/24 dst 192.168.1.2/24 dir in \ priority 2 mark 0 mask 0x10 #[3] Then dmesg shows: [ ] WARNING: CPU: 1 PID: 7265 at net/xfrm/xfrm_policy.c:1548 [ ] RIP: 0010:xfrm_policy_insert_list+0x2f2/0x1030 [ ] Call Trace: [ ] xfrm_policy_inexact_insert+0x85/0xe50 [ ] xfrm_policy_insert+0x4ba/0x680 [ ] xfrm_add_policy+0x246/0x4d0 [ ] xfrm_user_rcv_msg+0x331/0x5c0 [ ] netlink_rcv_skb+0x121/0x350 [ ] xfrm_netlink_rcv+0x66/0x80 [ ] netlink_unicast+0x439/0x630 [ ] netlink_sendmsg+0x714/0xbf0 [ ] sock_sendmsg+0xe2/0x110 The issue was introduced by Commit 7cb8a93968e3 ("xfrm: Allow inserting policies with matching mark and different priorities"). After that, the policies [1] and [2] would be able to be added with different priorities. However, policy [3] will actually match both [1] and [2]. Policy [1] was matched due to the 1st 'return true' in xfrm_policy_mark_match(), and policy [2] was matched due to the 2nd 'return true' in there. It caused WARN_ON() in xfrm_policy_insert_list(). This patch is to fix it by only (the same value and priority) as the same policy in xfrm_policy_mark_match(). Thanks to Yuehaibing, we could make this fix better. v1->v2: - check policy->mark.v == pol->mark.v only without mask. Fixes: 7cb8a93968e3 ("xfrm: Allow inserting policies with matching mark and different priorities") Reported-by: Xiumei Mu Signed-off-by: Xin Long Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 297b2fdb3c297..564aa6492e7c3 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1436,12 +1436,7 @@ static void xfrm_policy_requeue(struct xfrm_policy *old, static bool xfrm_policy_mark_match(struct xfrm_policy *policy, struct xfrm_policy *pol) { - u32 mark = policy->mark.v & policy->mark.m; - - if (policy->mark.v == pol->mark.v && policy->mark.m == pol->mark.m) - return true; - - if ((mark & pol->mark.m) == pol->mark.v && + if (policy->mark.v == pol->mark.v && policy->priority == pol->priority) return true; From e9c284ec4b41c827f4369973d2792992849e4fa5 Mon Sep 17 00:00:00 2001 From: Michael Braun Date: Wed, 6 May 2020 11:46:25 +0200 Subject: [PATCH 18/77] netfilter: nft_reject_bridge: enable reject with bridge vlan Currently, using the bridge reject target with tagged packets results in untagged packets being sent back. Fix this by mirroring the vlan id as well. Fixes: 85f5b3086a04 ("netfilter: bridge: add reject support") Signed-off-by: Michael Braun Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/nft_reject_bridge.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index b325b569e7615..f48cf4cfb80f9 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -31,6 +31,12 @@ static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb, ether_addr_copy(eth->h_dest, eth_hdr(oldskb)->h_source); eth->h_proto = eth_hdr(oldskb)->h_proto; skb_pull(nskb, ETH_HLEN); + + if (skb_vlan_tag_present(oldskb)) { + u16 vid = skb_vlan_tag_get(oldskb); + + __vlan_hwaccel_put_tag(nskb, oldskb->vlan_proto, vid); + } } static int nft_bridge_iphdr_validate(struct sk_buff *skb) From a164b95ad6055c50612795882f35e0efda1f1390 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 14 May 2020 13:31:21 +0200 Subject: [PATCH 19/77] netfilter: ipset: Fix subcounter update skip If IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE is set, user requested to not update counters in sub sets. Therefore IPSET_FLAG_SKIP_COUNTER_UPDATE must be set, not unset. Fixes: 6e01781d1c80e ("netfilter: ipset: set match: add support to match the counters") Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_list_set.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index cd747c0962fd6..5a67f79665742 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -59,7 +59,7 @@ list_set_ktest(struct ip_set *set, const struct sk_buff *skb, /* Don't lookup sub-counters at all */ opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS; if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE) - opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE; + opt->cmdflags |= IPSET_FLAG_SKIP_COUNTER_UPDATE; list_for_each_entry_rcu(e, &map->members, list) { ret = ip_set_test(e->id, skb, par, opt); if (ret <= 0) From 4c559f15efcc43b996f4da528cd7f9483aaca36d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 14 May 2020 14:14:23 +0200 Subject: [PATCH 20/77] netfilter: nf_conntrack_pptp: prevent buffer overflows in debug code Dan Carpenter says: "Smatch complains that the value for "cmd" comes from the network and can't be trusted." Add pptp_msg_name() helper function that checks for the array boundary. Fixes: f09943fefe6b ("[NETFILTER]: nf_conntrack/nf_nat: add PPTP helper port") Reported-by: Dan Carpenter Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_pptp.h | 2 +- net/ipv4/netfilter/nf_nat_pptp.c | 7 +-- net/netfilter/nf_conntrack_pptp.c | 62 ++++++++++++--------- 3 files changed, 38 insertions(+), 33 deletions(-) diff --git a/include/linux/netfilter/nf_conntrack_pptp.h b/include/linux/netfilter/nf_conntrack_pptp.h index fcc409de31a40..6a4ff6d5ebc29 100644 --- a/include/linux/netfilter/nf_conntrack_pptp.h +++ b/include/linux/netfilter/nf_conntrack_pptp.h @@ -10,7 +10,7 @@ #include #include -extern const char *const pptp_msg_name[]; +extern const char *const pptp_msg_name(u_int16_t msg); /* state of the control session */ enum pptp_ctrlsess_state { diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 3c25a467b3efc..7afde8828b4c9 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -166,8 +166,7 @@ pptp_outbound_pkt(struct sk_buff *skb, break; default: pr_debug("unknown outbound packet 0x%04x:%s\n", msg, - msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : - pptp_msg_name[0]); + pptp_msg_name(msg)); fallthrough; case PPTP_SET_LINK_INFO: /* only need to NAT in case PAC is behind NAT box */ @@ -268,9 +267,7 @@ pptp_inbound_pkt(struct sk_buff *skb, pcid_off = offsetof(union pptp_ctrl_union, setlink.peersCallID); break; default: - pr_debug("unknown inbound packet %s\n", - msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : - pptp_msg_name[0]); + pr_debug("unknown inbound packet %s\n", pptp_msg_name(msg)); fallthrough; case PPTP_START_SESSION_REQUEST: case PPTP_START_SESSION_REPLY: diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index a971183f11af7..7ad247784cfaa 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -72,24 +72,32 @@ EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn); #if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) /* PptpControlMessageType names */ -const char *const pptp_msg_name[] = { - "UNKNOWN_MESSAGE", - "START_SESSION_REQUEST", - "START_SESSION_REPLY", - "STOP_SESSION_REQUEST", - "STOP_SESSION_REPLY", - "ECHO_REQUEST", - "ECHO_REPLY", - "OUT_CALL_REQUEST", - "OUT_CALL_REPLY", - "IN_CALL_REQUEST", - "IN_CALL_REPLY", - "IN_CALL_CONNECT", - "CALL_CLEAR_REQUEST", - "CALL_DISCONNECT_NOTIFY", - "WAN_ERROR_NOTIFY", - "SET_LINK_INFO" +static const char *const pptp_msg_name_array[PPTP_MSG_MAX + 1] = { + [0] = "UNKNOWN_MESSAGE", + [PPTP_START_SESSION_REQUEST] = "START_SESSION_REQUEST", + [PPTP_START_SESSION_REPLY] = "START_SESSION_REPLY", + [PPTP_STOP_SESSION_REQUEST] = "STOP_SESSION_REQUEST", + [PPTP_STOP_SESSION_REPLY] = "STOP_SESSION_REPLY", + [PPTP_ECHO_REQUEST] = "ECHO_REQUEST", + [PPTP_ECHO_REPLY] = "ECHO_REPLY", + [PPTP_OUT_CALL_REQUEST] = "OUT_CALL_REQUEST", + [PPTP_OUT_CALL_REPLY] = "OUT_CALL_REPLY", + [PPTP_IN_CALL_REQUEST] = "IN_CALL_REQUEST", + [PPTP_IN_CALL_REPLY] = "IN_CALL_REPLY", + [PPTP_IN_CALL_CONNECT] = "IN_CALL_CONNECT", + [PPTP_CALL_CLEAR_REQUEST] = "CALL_CLEAR_REQUEST", + [PPTP_CALL_DISCONNECT_NOTIFY] = "CALL_DISCONNECT_NOTIFY", + [PPTP_WAN_ERROR_NOTIFY] = "WAN_ERROR_NOTIFY", + [PPTP_SET_LINK_INFO] = "SET_LINK_INFO" }; + +const char *const pptp_msg_name(u_int16_t msg) +{ + if (msg > PPTP_MSG_MAX) + return pptp_msg_name_array[0]; + + return pptp_msg_name_array[msg]; +} EXPORT_SYMBOL(pptp_msg_name); #endif @@ -276,7 +284,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound; msg = ntohs(ctlh->messageType); - pr_debug("inbound control message %s\n", pptp_msg_name[msg]); + pr_debug("inbound control message %s\n", pptp_msg_name(msg)); switch (msg) { case PPTP_START_SESSION_REPLY: @@ -311,7 +319,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, pcid = pptpReq->ocack.peersCallID; if (info->pns_call_id != pcid) goto invalid; - pr_debug("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg], + pr_debug("%s, CID=%X, PCID=%X\n", pptp_msg_name(msg), ntohs(cid), ntohs(pcid)); if (pptpReq->ocack.resultCode == PPTP_OUTCALL_CONNECT) { @@ -328,7 +336,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, goto invalid; cid = pptpReq->icreq.callID; - pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); + pr_debug("%s, CID=%X\n", pptp_msg_name(msg), ntohs(cid)); info->cstate = PPTP_CALL_IN_REQ; info->pac_call_id = cid; break; @@ -347,7 +355,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, if (info->pns_call_id != pcid) goto invalid; - pr_debug("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(pcid)); + pr_debug("%s, PCID=%X\n", pptp_msg_name(msg), ntohs(pcid)); info->cstate = PPTP_CALL_IN_CONF; /* we expect a GRE connection from PAC to PNS */ @@ -357,7 +365,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, case PPTP_CALL_DISCONNECT_NOTIFY: /* server confirms disconnect */ cid = pptpReq->disc.callID; - pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); + pr_debug("%s, CID=%X\n", pptp_msg_name(msg), ntohs(cid)); info->cstate = PPTP_CALL_NONE; /* untrack this call id, unexpect GRE packets */ @@ -384,7 +392,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, invalid: pr_debug("invalid %s: type=%d cid=%u pcid=%u " "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n", - msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0], + pptp_msg_name(msg), msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate, ntohs(info->pns_call_id), ntohs(info->pac_call_id)); return NF_ACCEPT; @@ -404,7 +412,7 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff, typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound; msg = ntohs(ctlh->messageType); - pr_debug("outbound control message %s\n", pptp_msg_name[msg]); + pr_debug("outbound control message %s\n", pptp_msg_name(msg)); switch (msg) { case PPTP_START_SESSION_REQUEST: @@ -426,7 +434,7 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff, info->cstate = PPTP_CALL_OUT_REQ; /* track PNS call id */ cid = pptpReq->ocreq.callID; - pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); + pr_debug("%s, CID=%X\n", pptp_msg_name(msg), ntohs(cid)); info->pns_call_id = cid; break; @@ -440,7 +448,7 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff, pcid = pptpReq->icack.peersCallID; if (info->pac_call_id != pcid) goto invalid; - pr_debug("%s, CID=%X PCID=%X\n", pptp_msg_name[msg], + pr_debug("%s, CID=%X PCID=%X\n", pptp_msg_name(msg), ntohs(cid), ntohs(pcid)); if (pptpReq->icack.resultCode == PPTP_INCALL_ACCEPT) { @@ -480,7 +488,7 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff, invalid: pr_debug("invalid %s: type=%d cid=%u pcid=%u " "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n", - msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0], + pptp_msg_name(msg), msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate, ntohs(info->pns_call_id), ntohs(info->pac_call_id)); return NF_ACCEPT; From ee04805ff54a63ffd90bc6749ebfe73473734ddb Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 24 May 2020 19:52:10 +0200 Subject: [PATCH 21/77] netfilter: conntrack: make conntrack userspace helpers work again Florian Westphal says: "Problem is that after the helper hook was merged back into the confirm one, the queueing itself occurs from the confirm hook, i.e. we queue from the last netfilter callback in the hook-list. Therefore, on return, the packet bypasses the confirm action and the connection is never committed to the main conntrack table. To fix this there are several ways: 1. revert the 'Fixes' commit and have a extra helper hook again. Works, but has the drawback of adding another indirect call for everyone. 2. Special case this: split the hooks only when userspace helper gets added, so queueing occurs at a lower priority again, and normal enqueue reinject would eventually call the last hook. 3. Extend the existing nf_queue ct update hook to allow a forced confirmation (plus run the seqadj code). This goes for 3)." Fixes: 827318feb69cb ("netfilter: conntrack: remove helper hook again") Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 78 ++++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 1d57b95d34819..08e0c19f6b396 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2016,22 +2016,18 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) nf_conntrack_get(skb_nfct(nskb)); } -static int nf_conntrack_update(struct net *net, struct sk_buff *skb) +static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, + struct nf_conn *ct) { struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; enum ip_conntrack_info ctinfo; struct nf_nat_hook *nat_hook; unsigned int status; - struct nf_conn *ct; int dataoff; u16 l3num; u8 l4num; - ct = nf_ct_get(skb, &ctinfo); - if (!ct || nf_ct_is_confirmed(ct)) - return 0; - l3num = nf_ct_l3num(ct); dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num); @@ -2088,6 +2084,76 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb) return 0; } +/* This packet is coming from userspace via nf_queue, complete the packet + * processing after the helper invocation in nf_confirm(). + */ +static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + const struct nf_conntrack_helper *helper; + const struct nf_conn_help *help; + unsigned int protoff; + + help = nfct_help(ct); + if (!help) + return 0; + + helper = rcu_dereference(help->helper); + if (!(helper->flags & NF_CT_HELPER_F_USERSPACE)) + return 0; + + switch (nf_ct_l3num(ct)) { + case NFPROTO_IPV4: + protoff = skb_network_offset(skb) + ip_hdrlen(skb); + break; +#if IS_ENABLED(CONFIG_IPV6) + case NFPROTO_IPV6: { + __be16 frag_off; + u8 pnum; + + pnum = ipv6_hdr(skb)->nexthdr; + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, + &frag_off); + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) + return 0; + break; + } +#endif + default: + return 0; + } + + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && + !nf_is_loopback_packet(skb)) { + if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { + NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); + return -1; + } + } + + /* We've seen it coming out the other side: confirm it */ + return nf_conntrack_confirm(skb) == NF_DROP ? - 1 : 0; +} + +static int nf_conntrack_update(struct net *net, struct sk_buff *skb) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + int err; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return 0; + + if (!nf_ct_is_confirmed(ct)) { + err = __nf_conntrack_update(net, skb, ct); + if (err < 0) + return err; + } + + return nf_confirm_cthelper(skb, ct, ctinfo); +} + static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb) { From 703acd70f2496537457186211c2f03e792409e68 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 24 May 2020 21:04:42 +0200 Subject: [PATCH 22/77] netfilter: nfnetlink_cthelper: unbreak userspace helper support Restore helper data size initialization and fix memcopy of the helper data size. Fixes: 157ffffeb5dc ("netfilter: nfnetlink_cthelper: reject too large userspace allocation requests") Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_cthelper.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index a5f294aa8e4cf..5b0d0a77379c6 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -103,7 +103,7 @@ nfnl_cthelper_from_nlattr(struct nlattr *attr, struct nf_conn *ct) if (help->helper->data_len == 0) return -EINVAL; - nla_memcpy(help->data, nla_data(attr), sizeof(help->data)); + nla_memcpy(help->data, attr, sizeof(help->data)); return 0; } @@ -240,6 +240,7 @@ nfnl_cthelper_create(const struct nlattr * const tb[], ret = -ENOMEM; goto err2; } + helper->data_len = size; helper->flags |= NF_CT_HELPER_F_USERSPACE; memcpy(&helper->tuple, tuple, sizeof(struct nf_conntrack_tuple)); From b16a87d0aef7a6be766f6618976dc5ff2c689291 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 25 May 2020 10:03:59 +0200 Subject: [PATCH 23/77] xsk: Add overflow check for u64 division, stored into u32 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The npgs member of struct xdp_umem is an u32 entity, and stores the number of pages the UMEM consumes. The calculation of npgs npgs = size / PAGE_SIZE can overflow. To avoid overflow scenarios, the division is now first stored in a u64, and the result is verified to fit into 32b. An alternative would be storing the npgs as a u64, however, this wastes memory and is an unrealisticly large packet area. Fixes: c0c77d8fb787 ("xsk: add user memory registration support sockopt") Reported-by: "Minh Bùi Quang" Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Acked-by: Jonathan Lemon Link: https://lore.kernel.org/bpf/CACtPs=GGvV-_Yj6rbpzTVnopgi5nhMoCcTkSkYrJHGQHJWFZMQ@mail.gmail.com/ Link: https://lore.kernel.org/bpf/20200525080400.13195-1-bjorn.topel@gmail.com --- net/xdp/xdp_umem.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index ed7a6060f73ca..3889bd9aec466 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -341,8 +341,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) { bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG; u32 chunk_size = mr->chunk_size, headroom = mr->headroom; + u64 npgs, addr = mr->addr, size = mr->len; unsigned int chunks, chunks_per_page; - u64 addr = mr->addr, size = mr->len; int err; if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) { @@ -372,6 +372,10 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if ((addr + size) < addr) return -EINVAL; + npgs = div_u64(size, PAGE_SIZE); + if (npgs > U32_MAX) + return -EINVAL; + chunks = (unsigned int)div_u64(size, chunk_size); if (chunks == 0) return -EINVAL; @@ -391,7 +395,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->size = size; umem->headroom = headroom; umem->chunk_size_nohr = chunk_size - headroom; - umem->npgs = size / PAGE_SIZE; + umem->npgs = (u32)npgs; umem->pgs = NULL; umem->user = NULL; umem->flags = mr->flags; From 0cada33241d9de205522e3858b18e506ca5cce2c Mon Sep 17 00:00:00 2001 From: Vinay Kumar Yadav Date: Sat, 23 May 2020 01:40:31 +0530 Subject: [PATCH 24/77] net/tls: fix race condition causing kernel panic tls_sw_recvmsg() and tls_decrypt_done() can be run concurrently. // tls_sw_recvmsg() if (atomic_read(&ctx->decrypt_pending)) crypto_wait_req(-EINPROGRESS, &ctx->async_wait); else reinit_completion(&ctx->async_wait.completion); //tls_decrypt_done() pending = atomic_dec_return(&ctx->decrypt_pending); if (!pending && READ_ONCE(ctx->async_notify)) complete(&ctx->async_wait.completion); Consider the scenario tls_decrypt_done() is about to run complete() if (!pending && READ_ONCE(ctx->async_notify)) and tls_sw_recvmsg() reads decrypt_pending == 0, does reinit_completion(), then tls_decrypt_done() runs complete(). This sequence of execution results in wrong completion. Consequently, for next decrypt request, it will not wait for completion, eventually on connection close, crypto resources freed, there is no way to handle pending decrypt response. This race condition can be avoided by having atomic_read() mutually exclusive with atomic_dec_return(),complete().Intoduced spin lock to ensure the mutual exclution. Addressed similar problem in tx direction. v1->v2: - More readable commit message. - Corrected the lock to fix new race scenario. - Removed barrier which is not needed now. Fixes: a42055e8d2c3 ("net/tls: Add support for async encryption of records for performance") Signed-off-by: Vinay Kumar Yadav Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/tls.h | 4 ++++ net/tls/tls_sw.c | 33 +++++++++++++++++++++++++++------ 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index bf9eb48239332..18cd4f418464d 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -135,6 +135,8 @@ struct tls_sw_context_tx { struct tls_rec *open_rec; struct list_head tx_list; atomic_t encrypt_pending; + /* protect crypto_wait with encrypt_pending */ + spinlock_t encrypt_compl_lock; int async_notify; u8 async_capable:1; @@ -155,6 +157,8 @@ struct tls_sw_context_rx { u8 async_capable:1; u8 decrypted:1; atomic_t decrypt_pending; + /* protect crypto_wait with decrypt_pending*/ + spinlock_t decrypt_compl_lock; bool async_notify; }; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 2d399b6c40756..8c2763eb6aae2 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -206,10 +206,12 @@ static void tls_decrypt_done(struct crypto_async_request *req, int err) kfree(aead_req); + spin_lock_bh(&ctx->decrypt_compl_lock); pending = atomic_dec_return(&ctx->decrypt_pending); - if (!pending && READ_ONCE(ctx->async_notify)) + if (!pending && ctx->async_notify) complete(&ctx->async_wait.completion); + spin_unlock_bh(&ctx->decrypt_compl_lock); } static int tls_do_decryption(struct sock *sk, @@ -467,10 +469,12 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err) ready = true; } + spin_lock_bh(&ctx->encrypt_compl_lock); pending = atomic_dec_return(&ctx->encrypt_pending); - if (!pending && READ_ONCE(ctx->async_notify)) + if (!pending && ctx->async_notify) complete(&ctx->async_wait.completion); + spin_unlock_bh(&ctx->encrypt_compl_lock); if (!ready) return; @@ -929,6 +933,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) int num_zc = 0; int orig_size; int ret = 0; + int pending; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) return -EOPNOTSUPP; @@ -1095,13 +1100,19 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) goto send_end; } else if (num_zc) { /* Wait for pending encryptions to get completed */ - smp_store_mb(ctx->async_notify, true); + spin_lock_bh(&ctx->encrypt_compl_lock); + ctx->async_notify = true; - if (atomic_read(&ctx->encrypt_pending)) + pending = atomic_read(&ctx->encrypt_pending); + spin_unlock_bh(&ctx->encrypt_compl_lock); + if (pending) crypto_wait_req(-EINPROGRESS, &ctx->async_wait); else reinit_completion(&ctx->async_wait.completion); + /* There can be no concurrent accesses, since we have no + * pending encrypt operations + */ WRITE_ONCE(ctx->async_notify, false); if (ctx->async_wait.err) { @@ -1732,6 +1743,7 @@ int tls_sw_recvmsg(struct sock *sk, bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); bool is_peek = flags & MSG_PEEK; int num_async = 0; + int pending; flags |= nonblock; @@ -1894,8 +1906,11 @@ int tls_sw_recvmsg(struct sock *sk, recv_end: if (num_async) { /* Wait for all previously submitted records to be decrypted */ - smp_store_mb(ctx->async_notify, true); - if (atomic_read(&ctx->decrypt_pending)) { + spin_lock_bh(&ctx->decrypt_compl_lock); + ctx->async_notify = true; + pending = atomic_read(&ctx->decrypt_pending); + spin_unlock_bh(&ctx->decrypt_compl_lock); + if (pending) { err = crypto_wait_req(-EINPROGRESS, &ctx->async_wait); if (err) { /* one of async decrypt failed */ @@ -1907,6 +1922,10 @@ int tls_sw_recvmsg(struct sock *sk, } else { reinit_completion(&ctx->async_wait.completion); } + + /* There can be no concurrent accesses, since we have no + * pending decrypt operations + */ WRITE_ONCE(ctx->async_notify, false); /* Drain records from the rx_list & copy if required */ @@ -2293,6 +2312,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) if (tx) { crypto_init_wait(&sw_ctx_tx->async_wait); + spin_lock_init(&sw_ctx_tx->encrypt_compl_lock); crypto_info = &ctx->crypto_send.info; cctx = &ctx->tx; aead = &sw_ctx_tx->aead_send; @@ -2301,6 +2321,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) sw_ctx_tx->tx_work.sk = sk; } else { crypto_init_wait(&sw_ctx_rx->async_wait); + spin_lock_init(&sw_ctx_rx->decrypt_compl_lock); crypto_info = &ctx->crypto_recv.info; cctx = &ctx->rx; skb_queue_head_init(&sw_ctx_rx->rx_list); From 5d14c304bfc14b4fd052dc83d5224376b48f52f0 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 25 May 2020 00:22:51 +0300 Subject: [PATCH 25/77] dpaa_eth: fix usage as DSA master, try 3 The dpaa-eth driver probes on compatible string for the MAC node, and the fman/mac.c driver allocates a dpaa-ethernet platform device that triggers the probing of the dpaa-eth net device driver. All of this is fine, but the problem is that the struct device of the dpaa_eth net_device is 2 parents away from the MAC which can be referenced via of_node. So of_find_net_device_by_node can't find it, and DSA switches won't be able to probe on top of FMan ports. It would be a bit silly to modify a core function (of_find_net_device_by_node) to look for dev->parent->parent->of_node just for one driver. We're just 1 step away from implementing full recursion. Actually there have already been at least 2 previous attempts to make this work: - Commit a1a50c8e4c24 ("fsl/man: Inherit parent device and of_node") - One or more of the patches in "[v3,0/6] adapt DPAA drivers for DSA": https://patchwork.ozlabs.org/project/netdev/cover/1508178970-28945-1-git-send-email-madalin.bucur@nxp.com/ (I couldn't really figure out which one was supposed to solve the problem and how). Point being, it looks like this is still pretty much a problem today. On T1040, the /sys/class/net/eth0 symlink currently points to ../../devices/platform/ffe000000.soc/ffe400000.fman/ffe4e6000.ethernet/dpaa-ethernet.0/net/eth0 which pretty much illustrates the problem. The closest of_node we've got is the "fsl,fman-memac" at /soc@ffe000000/fman@400000/ethernet@e6000, which is what we'd like to be able to reference from DSA as host port. For of_find_net_device_by_node to find the eth0 port, we would need the parent of the eth0 net_device to not be the "dpaa-ethernet" platform device, but to point 1 level higher, aka the "fsl,fman-memac" node directly. The new sysfs path would look like this: ../../devices/platform/ffe000000.soc/ffe400000.fman/ffe4e6000.ethernet/net/eth0 And this is exactly what SET_NETDEV_DEV does. It sets the parent of the net_device. The new parent has an of_node associated with it, and of_dev_node_match already checks for the of_node of the device or of its parent. Fixes: a1a50c8e4c24 ("fsl/man: Inherit parent device and of_node") Fixes: c6e26ea8c893 ("dpaa_eth: change device used") Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 2cd1f8efdfa3a..6bfa7575af942 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2914,7 +2914,7 @@ static int dpaa_eth_probe(struct platform_device *pdev) } /* Do this here, so we can be verbose early */ - SET_NETDEV_DEV(net_dev, dev); + SET_NETDEV_DEV(net_dev, dev->parent); dev_set_drvdata(dev, net_dev); priv = netdev_priv(net_dev); From 15c973858903009e995b2037683de29dfe968621 Mon Sep 17 00:00:00 2001 From: Qiushi Wu Date: Mon, 25 May 2020 03:24:39 -0500 Subject: [PATCH 26/77] qlcnic: fix missing release in qlcnic_83xx_interrupt_test. In function qlcnic_83xx_interrupt_test(), function qlcnic_83xx_diag_alloc_res() is not handled by function qlcnic_83xx_diag_free_res() after a call of the function qlcnic_alloc_mbx_args() failed. Fix this issue by adding a jump target "fail_mbx_args", and jump to this new target when qlcnic_alloc_mbx_args() failed. Fixes: b6b4316c8b2f ("qlcnic: Handle qlcnic_alloc_mbx_args() failure") Signed-off-by: Qiushi Wu Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c index 2a533280b1241..29b9c728a65e2 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c @@ -3651,7 +3651,7 @@ int qlcnic_83xx_interrupt_test(struct net_device *netdev) ahw->diag_cnt = 0; ret = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_INTRPT_TEST); if (ret) - goto fail_diag_irq; + goto fail_mbx_args; if (adapter->flags & QLCNIC_MSIX_ENABLED) intrpt_id = ahw->intr_tbl[0].id; @@ -3681,6 +3681,8 @@ int qlcnic_83xx_interrupt_test(struct net_device *netdev) done: qlcnic_free_mbx_args(&cmd); + +fail_mbx_args: qlcnic_83xx_diag_free_res(netdev, drv_sds_rings); fail_diag_irq: From ac21753a5c2c9a6a2019997481a2ac12bbde48c8 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 26 May 2020 12:56:14 -0600 Subject: [PATCH 27/77] nexthops: Move code from remove_nexthop_from_groups to remove_nh_grp_entry Move nh_grp dereference and check for removing nexthop group due to all members gone into remove_nh_grp_entry. Fixes: 430a049190de ("nexthop: Add support for nexthop groups") Signed-off-by: David Ahern Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/nexthop.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 715e14475220f..b4b772f120a75 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -694,17 +694,21 @@ static void nh_group_rebalance(struct nh_group *nhg) } } -static void remove_nh_grp_entry(struct nh_grp_entry *nhge, - struct nh_group *nhg, +static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, struct nl_info *nlinfo) { + struct nexthop *nhp = nhge->nh_parent; struct nexthop *nh = nhge->nh; struct nh_grp_entry *nhges; + struct nh_group *nhg; bool found = false; int i; WARN_ON(!nh); + list_del(&nhge->nh_list); + + nhg = rtnl_dereference(nhp->nh_grp); nhges = nhg->nh_entries; for (i = 0; i < nhg->num_nh; ++i) { if (found) { @@ -728,7 +732,11 @@ static void remove_nh_grp_entry(struct nh_grp_entry *nhge, nexthop_put(nh); if (nlinfo) - nexthop_notify(RTM_NEWNEXTHOP, nhge->nh_parent, nlinfo); + nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo); + + /* if this group has no more entries then remove it */ + if (!nhg->num_nh) + remove_nexthop(net, nhp, nlinfo); } static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, @@ -736,17 +744,8 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, { struct nh_grp_entry *nhge, *tmp; - list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) { - struct nh_group *nhg; - - list_del(&nhge->nh_list); - nhg = rtnl_dereference(nhge->nh_parent->nh_grp); - remove_nh_grp_entry(nhge, nhg, nlinfo); - - /* if this group has no more entries then remove it */ - if (!nhg->num_nh) - remove_nexthop(net, nhge->nh_parent, nlinfo); - } + list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) + remove_nh_grp_entry(net, nhge, nlinfo); } static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) From 90f33bffa382598a32cc82abfeb20adc92d041b6 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 26 May 2020 12:56:15 -0600 Subject: [PATCH 28/77] nexthops: don't modify published nexthop groups We must avoid modifying published nexthop groups while they might be in use, otherwise we might see NULL ptr dereferences. In order to do that we allocate 2 nexthoup group structures upon nexthop creation and swap between them when we have to delete an entry. The reason is that we can't fail nexthop group removal, so we can't handle allocation failure thus we move the extra allocation on creation where we can safely fail and return ENOMEM. Fixes: 430a049190de ("nexthop: Add support for nexthop groups") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/nexthop.h | 1 + net/ipv4/nexthop.c | 91 +++++++++++++++++++++++++++---------------- 2 files changed, 59 insertions(+), 33 deletions(-) diff --git a/include/net/nexthop.h b/include/net/nexthop.h index c440ccc861fc7..8a343519ed7a1 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -70,6 +70,7 @@ struct nh_grp_entry { }; struct nh_group { + struct nh_group *spare; /* spare group for removals */ u16 num_nh; bool mpath; bool has_v4; diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index b4b772f120a75..563f71bcb2d74 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -63,9 +63,16 @@ static void nexthop_free_mpath(struct nexthop *nh) int i; nhg = rcu_dereference_raw(nh->nh_grp); - for (i = 0; i < nhg->num_nh; ++i) - WARN_ON(nhg->nh_entries[i].nh); + for (i = 0; i < nhg->num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + WARN_ON(!list_empty(&nhge->nh_list)); + nexthop_put(nhge->nh); + } + + WARN_ON(nhg->spare == nhg); + + kfree(nhg->spare); kfree(nhg); } @@ -697,46 +704,53 @@ static void nh_group_rebalance(struct nh_group *nhg) static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, struct nl_info *nlinfo) { + struct nh_grp_entry *nhges, *new_nhges; struct nexthop *nhp = nhge->nh_parent; struct nexthop *nh = nhge->nh; - struct nh_grp_entry *nhges; - struct nh_group *nhg; - bool found = false; - int i; + struct nh_group *nhg, *newg; + int i, j; WARN_ON(!nh); - list_del(&nhge->nh_list); - nhg = rtnl_dereference(nhp->nh_grp); - nhges = nhg->nh_entries; - for (i = 0; i < nhg->num_nh; ++i) { - if (found) { - nhges[i-1].nh = nhges[i].nh; - nhges[i-1].weight = nhges[i].weight; - list_del(&nhges[i].nh_list); - list_add(&nhges[i-1].nh_list, &nhges[i-1].nh->grp_list); - } else if (nhg->nh_entries[i].nh == nh) { - found = true; - } - } + newg = nhg->spare; - if (WARN_ON(!found)) + /* last entry, keep it visible and remove the parent */ + if (nhg->num_nh == 1) { + remove_nexthop(net, nhp, nlinfo); return; + } - nhg->num_nh--; - nhg->nh_entries[nhg->num_nh].nh = NULL; + newg->has_v4 = nhg->has_v4; + newg->mpath = nhg->mpath; + newg->num_nh = nhg->num_nh; - nh_group_rebalance(nhg); + /* copy old entries to new except the one getting removed */ + nhges = nhg->nh_entries; + new_nhges = newg->nh_entries; + for (i = 0, j = 0; i < nhg->num_nh; ++i) { + /* current nexthop getting removed */ + if (nhg->nh_entries[i].nh == nh) { + newg->num_nh--; + continue; + } - nexthop_put(nh); + list_del(&nhges[i].nh_list); + new_nhges[j].nh_parent = nhges[i].nh_parent; + new_nhges[j].nh = nhges[i].nh; + new_nhges[j].weight = nhges[i].weight; + list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list); + j++; + } + + nh_group_rebalance(newg); + rcu_assign_pointer(nhp->nh_grp, newg); + + list_del(&nhge->nh_list); + nexthop_put(nhge->nh); if (nlinfo) nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo); - - /* if this group has no more entries then remove it */ - if (!nhg->num_nh) - remove_nexthop(net, nhp, nlinfo); } static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, @@ -746,6 +760,9 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) remove_nh_grp_entry(net, nhge, nlinfo); + + /* make sure all see the newly published array before releasing rtnl */ + synchronize_rcu(); } static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) @@ -759,10 +776,7 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) if (WARN_ON(!nhge->nh)) continue; - list_del(&nhge->nh_list); - nexthop_put(nhge->nh); - nhge->nh = NULL; - nhg->num_nh--; + list_del_init(&nhge->nh_list); } } @@ -1085,6 +1099,7 @@ static struct nexthop *nexthop_create_group(struct net *net, { struct nlattr *grps_attr = cfg->nh_grp; struct nexthop_grp *entry = nla_data(grps_attr); + u16 num_nh = nla_len(grps_attr) / sizeof(*entry); struct nh_group *nhg; struct nexthop *nh; int i; @@ -1095,12 +1110,21 @@ static struct nexthop *nexthop_create_group(struct net *net, nh->is_group = 1; - nhg = nexthop_grp_alloc(nla_len(grps_attr) / sizeof(*entry)); + nhg = nexthop_grp_alloc(num_nh); if (!nhg) { kfree(nh); return ERR_PTR(-ENOMEM); } + /* spare group used for removals */ + nhg->spare = nexthop_grp_alloc(num_nh); + if (!nhg) { + kfree(nhg); + kfree(nh); + return NULL; + } + nhg->spare->spare = nhg; + for (i = 0; i < nhg->num_nh; ++i) { struct nexthop *nhe; struct nh_info *nhi; @@ -1132,6 +1156,7 @@ static struct nexthop *nexthop_create_group(struct net *net, for (; i >= 0; --i) nexthop_put(nhg->nh_entries[i].nh); + kfree(nhg->spare); kfree(nhg); kfree(nh); From 0b5e2e39739e861fa5fc84ab27a35dbe62a15330 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 26 May 2020 12:56:16 -0600 Subject: [PATCH 29/77] nexthop: Expand nexthop_is_multipath in a few places I got too fancy consolidating checks on multipath type. The result is that path lookups can access 2 different nh_grp structs as exposed by Nik's torture tests. Expand nexthop_is_multipath within nexthop.h to avoid multiple, nh_grp dereferences and make decisions based on the consistent struct. Only 2 places left using nexthop_is_multipath are within IPv6, both only check that the nexthop is a multipath for a branching decision which are acceptable. Fixes: 430a049190de ("nexthop: Add support for nexthop groups") Signed-off-by: David Ahern Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/nexthop.h | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/include/net/nexthop.h b/include/net/nexthop.h index 8a343519ed7a1..f09e8d7d98869 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -137,21 +137,20 @@ static inline unsigned int nexthop_num_path(const struct nexthop *nh) { unsigned int rc = 1; - if (nexthop_is_multipath(nh)) { + if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); - rc = nh_grp->num_nh; + if (nh_grp->mpath) + rc = nh_grp->num_nh; } return rc; } static inline -struct nexthop *nexthop_mpath_select(const struct nexthop *nh, int nhsel) +struct nexthop *nexthop_mpath_select(const struct nh_group *nhg, int nhsel) { - const struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); - /* for_nexthops macros in fib_semantics.c grabs a pointer to * the nexthop before checking nhsel */ @@ -186,12 +185,14 @@ static inline bool nexthop_is_blackhole(const struct nexthop *nh) { const struct nh_info *nhi; - if (nexthop_is_multipath(nh)) { - if (nexthop_num_path(nh) > 1) - return false; - nh = nexthop_mpath_select(nh, 0); - if (!nh) + if (nh->is_group) { + struct nh_group *nh_grp; + + nh_grp = rcu_dereference_rtnl(nh->nh_grp); + if (nh_grp->num_nh > 1) return false; + + nh = nh_grp->nh_entries[0].nh; } nhi = rcu_dereference_rtnl(nh->nh_info); @@ -217,10 +218,15 @@ struct fib_nh_common *nexthop_fib_nhc(struct nexthop *nh, int nhsel) BUILD_BUG_ON(offsetof(struct fib_nh, nh_common) != 0); BUILD_BUG_ON(offsetof(struct fib6_nh, nh_common) != 0); - if (nexthop_is_multipath(nh)) { - nh = nexthop_mpath_select(nh, nhsel); - if (!nh) - return NULL; + if (nh->is_group) { + struct nh_group *nh_grp; + + nh_grp = rcu_dereference_rtnl(nh->nh_grp); + if (nh_grp->mpath) { + nh = nexthop_mpath_select(nh_grp, nhsel); + if (!nh) + return NULL; + } } nhi = rcu_dereference_rtnl(nh->nh_info); @@ -264,8 +270,11 @@ static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh) { struct nh_info *nhi; - if (nexthop_is_multipath(nh)) { - nh = nexthop_mpath_select(nh, 0); + if (nh->is_group) { + struct nh_group *nh_grp; + + nh_grp = rcu_dereference_rtnl(nh->nh_grp); + nh = nexthop_mpath_select(nh_grp, 0); if (!nh) return NULL; } From af7888ad9edbd8ba7f6449d1c27ce281ad4b26fd Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 26 May 2020 12:56:17 -0600 Subject: [PATCH 30/77] ipv4: Refactor nhc evaluation in fib_table_lookup FIB lookups can return an entry that references an external nexthop. While walking the nexthop struct we do not want to make multiple calls into the nexthop code which can result in 2 different structs getting accessed - one returning the number of paths the rest of the loop seeing a different nh_grp struct. If the nexthop group shrunk, the result is an attempt to access a fib_nh_common that does not exist for the new nh_grp struct but did for the old one. To fix that move the device evaluation code to a helper that can be used for inline fib_nh path as well as external nexthops. Update the existing check for fi->nh in fib_table_lookup to call a new helper, nexthop_get_nhc_lookup, which walks the external nexthop with a single rcu dereference. Fixes: 430a049190de ("nexthop: Add support for nexthop groups") Signed-off-by: David Ahern Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/ip_fib.h | 2 ++ include/net/nexthop.h | 33 ++++++++++++++++++++++++++++ net/ipv4/fib_trie.c | 51 ++++++++++++++++++++++++++++++------------- 3 files changed, 71 insertions(+), 15 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index b219a8fe0950b..771ce068bc966 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -479,6 +479,8 @@ void fib_nh_common_release(struct fib_nh_common *nhc); void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri); void fib_trie_init(void); struct fib_table *fib_trie_table(u32 id, struct fib_table *alias); +bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags, + const struct flowi4 *flp); static inline void fib_combine_itag(u32 *itag, const struct fib_result *res) { diff --git a/include/net/nexthop.h b/include/net/nexthop.h index f09e8d7d98869..9414ae46fc1c0 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -233,6 +233,39 @@ struct fib_nh_common *nexthop_fib_nhc(struct nexthop *nh, int nhsel) return &nhi->fib_nhc; } +/* called from fib_table_lookup with rcu_lock */ +static inline +struct fib_nh_common *nexthop_get_nhc_lookup(const struct nexthop *nh, + int fib_flags, + const struct flowi4 *flp, + int *nhsel) +{ + struct nh_info *nhi; + + if (nh->is_group) { + struct nh_group *nhg = rcu_dereference(nh->nh_grp); + int i; + + for (i = 0; i < nhg->num_nh; i++) { + struct nexthop *nhe = nhg->nh_entries[i].nh; + + nhi = rcu_dereference(nhe->nh_info); + if (fib_lookup_good_nhc(&nhi->fib_nhc, fib_flags, flp)) { + *nhsel = i; + return &nhi->fib_nhc; + } + } + } else { + nhi = rcu_dereference(nh->nh_info); + if (fib_lookup_good_nhc(&nhi->fib_nhc, fib_flags, flp)) { + *nhsel = 0; + return &nhi->fib_nhc; + } + } + + return NULL; +} + static inline unsigned int fib_info_num_path(const struct fib_info *fi) { if (unlikely(fi->nh)) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 4f334b4255385..248f1c1959a63 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1371,6 +1371,26 @@ static inline t_key prefix_mismatch(t_key key, struct key_vector *n) return (key ^ prefix) & (prefix | -prefix); } +bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags, + const struct flowi4 *flp) +{ + if (nhc->nhc_flags & RTNH_F_DEAD) + return false; + + if (ip_ignore_linkdown(nhc->nhc_dev) && + nhc->nhc_flags & RTNH_F_LINKDOWN && + !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) + return false; + + if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) { + if (flp->flowi4_oif && + flp->flowi4_oif != nhc->nhc_oif) + return false; + } + + return true; +} + /* should be called with rcu_read_lock */ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags) @@ -1503,6 +1523,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, /* Step 3: Process the leaf, if that fails fall back to backtracing */ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; + struct fib_nh_common *nhc; int nhsel, err; if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) { @@ -1528,26 +1549,25 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, if (fi->fib_flags & RTNH_F_DEAD) continue; - if (unlikely(fi->nh && nexthop_is_blackhole(fi->nh))) { - err = fib_props[RTN_BLACKHOLE].error; - goto out_reject; + if (unlikely(fi->nh)) { + if (nexthop_is_blackhole(fi->nh)) { + err = fib_props[RTN_BLACKHOLE].error; + goto out_reject; + } + + nhc = nexthop_get_nhc_lookup(fi->nh, fib_flags, flp, + &nhsel); + if (nhc) + goto set_result; + goto miss; } for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { - struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel); + nhc = fib_info_nhc(fi, nhsel); - if (nhc->nhc_flags & RTNH_F_DEAD) + if (!fib_lookup_good_nhc(nhc, fib_flags, flp)) continue; - if (ip_ignore_linkdown(nhc->nhc_dev) && - nhc->nhc_flags & RTNH_F_LINKDOWN && - !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) - continue; - if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) { - if (flp->flowi4_oif && - flp->flowi4_oif != nhc->nhc_oif) - continue; - } - +set_result: if (!(fib_flags & FIB_LOOKUP_NOREF)) refcount_inc(&fi->fib_clntref); @@ -1568,6 +1588,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, return err; } } +miss: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_miss); #endif From 1fd1c768f3624a5e66766e7b4ddb9b607cd834a5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 26 May 2020 12:56:18 -0600 Subject: [PATCH 31/77] ipv4: nexthop version of fib_info_nh_uses_dev Similar to the last path, need to fix fib_info_nh_uses_dev for external nexthops to avoid referencing multiple nh_grp structs. Move the device check in fib_info_nh_uses_dev to a helper and create a nexthop version that is called if the fib_info uses an external nexthop. Fixes: 430a049190de ("nexthop: Add support for nexthop groups") Signed-off-by: David Ahern Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/ip_fib.h | 10 ++++++++++ include/net/nexthop.h | 25 +++++++++++++++++++++++++ net/ipv4/fib_frontend.c | 19 ++++++++++--------- 3 files changed, 45 insertions(+), 9 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 771ce068bc966..2ec062aaa9782 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -447,6 +447,16 @@ static inline int fib_num_tclassid_users(struct net *net) #endif int fib_unmerge(struct net *net); +static inline bool nhc_l3mdev_matches_dev(const struct fib_nh_common *nhc, +const struct net_device *dev) +{ + if (nhc->nhc_dev == dev || + l3mdev_master_ifindex_rcu(nhc->nhc_dev) == dev->ifindex) + return true; + + return false; +} + /* Exported by fib_semantics.c */ int ip_fib_check_default(__be32 gw, struct net_device *dev); int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force); diff --git a/include/net/nexthop.h b/include/net/nexthop.h index 9414ae46fc1c0..8c9f1a7188591 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -266,6 +266,31 @@ struct fib_nh_common *nexthop_get_nhc_lookup(const struct nexthop *nh, return NULL; } +static inline bool nexthop_uses_dev(const struct nexthop *nh, + const struct net_device *dev) +{ + struct nh_info *nhi; + + if (nh->is_group) { + struct nh_group *nhg = rcu_dereference(nh->nh_grp); + int i; + + for (i = 0; i < nhg->num_nh; i++) { + struct nexthop *nhe = nhg->nh_entries[i].nh; + + nhi = rcu_dereference(nhe->nh_info); + if (nhc_l3mdev_matches_dev(&nhi->fib_nhc, dev)) + return true; + } + } else { + nhi = rcu_dereference(nh->nh_info); + if (nhc_l3mdev_matches_dev(&nhi->fib_nhc, dev)) + return true; + } + + return false; +} + static inline unsigned int fib_info_num_path(const struct fib_info *fi) { if (unlikely(fi->nh)) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 1bf9da3a75f92..41079490a1181 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -309,17 +309,18 @@ bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev) { bool dev_match = false; #ifdef CONFIG_IP_ROUTE_MULTIPATH - int ret; + if (unlikely(fi->nh)) { + dev_match = nexthop_uses_dev(fi->nh, dev); + } else { + int ret; - for (ret = 0; ret < fib_info_num_path(fi); ret++) { - const struct fib_nh_common *nhc = fib_info_nhc(fi, ret); + for (ret = 0; ret < fib_info_num_path(fi); ret++) { + const struct fib_nh_common *nhc = fib_info_nhc(fi, ret); - if (nhc->nhc_dev == dev) { - dev_match = true; - break; - } else if (l3mdev_master_ifindex_rcu(nhc->nhc_dev) == dev->ifindex) { - dev_match = true; - break; + if (nhc_l3mdev_matches_dev(nhc, dev)) { + dev_match = true; + break; + } } } #else From f2fb6b6275eba9d312957ca44c487bd780da6169 Mon Sep 17 00:00:00 2001 From: Fugang Duan Date: Mon, 25 May 2020 16:18:14 +0800 Subject: [PATCH 32/77] net: stmmac: enable timestamp snapshot for required PTP packets in dwmac v5.10a MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For rx filter 'HWTSTAMP_FILTER_PTP_V2_EVENT', it should be PTP v2/802.AS1, any layer, any kind of event packet, but HW only take timestamp snapshot for below PTP message: sync, Pdelay_req, Pdelay_resp. Then it causes below issue when test E2E case: ptp4l[2479.534]: port 1: received DELAY_REQ without timestamp ptp4l[2481.423]: port 1: received DELAY_REQ without timestamp ptp4l[2481.758]: port 1: received DELAY_REQ without timestamp ptp4l[2483.524]: port 1: received DELAY_REQ without timestamp ptp4l[2484.233]: port 1: received DELAY_REQ without timestamp ptp4l[2485.750]: port 1: received DELAY_REQ without timestamp ptp4l[2486.888]: port 1: received DELAY_REQ without timestamp ptp4l[2487.265]: port 1: received DELAY_REQ without timestamp ptp4l[2487.316]: port 1: received DELAY_REQ without timestamp Timestamp snapshot dependency on register bits in received path: SNAPTYPSEL TSMSTRENA TSEVNTENA PTP_Messages 01 x 0 SYNC, Follow_Up, Delay_Req, Delay_Resp, Pdelay_Req, Pdelay_Resp, Pdelay_Resp_Follow_Up 01 0 1 SYNC, Pdelay_Req, Pdelay_Resp For dwmac v5.10a, enabling all events by setting register DWC_EQOS_TIME_STAMPING[SNAPTYPSEL] to 2’b01, clearing bit [TSEVNTENA] to 0’b0, which can support all required events. Signed-off-by: Fugang Duan Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 1f319c9cee468..7e9cbfd235308 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -630,7 +630,8 @@ static int stmmac_hwtstamp_set(struct net_device *dev, struct ifreq *ifr) config.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT; ptp_v2 = PTP_TCR_TSVER2ENA; snap_type_sel = PTP_TCR_SNAPTYPSEL_1; - ts_event_en = PTP_TCR_TSEVNTENA; + if (priv->synopsys_id != DWMAC_CORE_5_10) + ts_event_en = PTP_TCR_TSEVNTENA; ptp_over_ipv4_udp = PTP_TCR_TSIPV4ENA; ptp_over_ipv6_udp = PTP_TCR_TSIPV6ENA; ptp_over_ethernet = PTP_TCR_TSIPENA; From 0a82e230c68860b7286dad8644d9d9f7cfd755d2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 25 May 2020 16:38:47 +0200 Subject: [PATCH 33/77] mptcp: avoid NULL-ptr derefence on fallback In the MPTCP receive path we must cope with TCP fallback on blocking recvmsg(). Currently in such code path we detect the fallback condition, but we don't fetch the struct socket required for fallback. The above allowed syzkaller to trigger a NULL pointer dereference: general protection fault, probably for non-canonical address 0xdffffc0000000004: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000020-0x0000000000000027] CPU: 1 PID: 7226 Comm: syz-executor523 Not tainted 5.7.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:sock_recvmsg_nosec net/socket.c:886 [inline] RIP: 0010:sock_recvmsg+0x92/0x110 net/socket.c:904 Code: 5b 41 5c 41 5d 41 5e 41 5f 5d c3 44 89 6c 24 04 e8 53 18 1d fb 4d 8d 6f 20 4c 89 e8 48 c1 e8 03 48 b9 00 00 00 00 00 fc ff df <80> 3c 08 00 74 08 4c 89 ef e8 20 12 5b fb bd a0 00 00 00 49 03 6d RSP: 0018:ffffc90001077b98 EFLAGS: 00010202 RAX: 0000000000000004 RBX: ffffc90001077dc0 RCX: dffffc0000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: ffffffff86565e59 R09: ffffed10115afeaa R10: ffffed10115afeaa R11: 0000000000000000 R12: 1ffff9200020efbc R13: 0000000000000020 R14: ffffc90001077de0 R15: 0000000000000000 FS: 00007fc6a3abe700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004d0050 CR3: 00000000969f0000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: mptcp_recvmsg+0x18d5/0x19b0 net/mptcp/protocol.c:891 inet_recvmsg+0xf6/0x1d0 net/ipv4/af_inet.c:838 sock_recvmsg_nosec net/socket.c:886 [inline] sock_recvmsg net/socket.c:904 [inline] __sys_recvfrom+0x2f3/0x470 net/socket.c:2057 __do_sys_recvfrom net/socket.c:2075 [inline] __se_sys_recvfrom net/socket.c:2071 [inline] __x64_sys_recvfrom+0xda/0xf0 net/socket.c:2071 do_syscall_64+0xf3/0x1b0 arch/x86/entry/common.c:295 entry_SYSCALL_64_after_hwframe+0x49/0xb3 Address the issue initializing the struct socket reference before entering the fallback code. Reported-and-tested-by: syzbot+c6bfc3db991edc918432@syzkaller.appspotmail.com Suggested-by: Ondrej Mosnacek Fixes: 8ab183deb26a ("mptcp: cope with later TCP fallback") Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 32ea8d35489a5..c8675d2eb5b97 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -954,7 +954,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, pr_debug("block timeout %ld", timeo); mptcp_wait_data(sk, &timeo); - if (unlikely(__mptcp_tcp_fallback(msk))) + ssock = __mptcp_tcp_fallback(msk); + if (unlikely(ssock)) goto fallback; } From 6dd912f82680761d8fb6b1bb274a69d4c7010988 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 25 May 2020 15:07:40 -0400 Subject: [PATCH 34/77] net: check untrusted gso_size at kernel entry Syzkaller again found a path to a kernel crash through bad gso input: a packet with gso size exceeding len. These packets are dropped in tcp_gso_segment and udp[46]_ufo_fragment. But they may affect gso size calculations earlier in the path. Now that we have thlen as of commit 9274124f023b ("net: stricter validation of untrusted gso packets"), check gso_size at entry too. Fixes: bfd5f4a3d605 ("packet: Add GSO/csum offload support.") Reported-by: syzbot Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/virtio_net.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 6f6ade63b04cd..88997022a4b5b 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -31,6 +31,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, { unsigned int gso_type = 0; unsigned int thlen = 0; + unsigned int p_off = 0; unsigned int ip_proto; if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { @@ -68,7 +69,8 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, if (!skb_partial_csum_set(skb, start, off)) return -EINVAL; - if (skb_transport_offset(skb) + thlen > skb_headlen(skb)) + p_off = skb_transport_offset(skb) + thlen; + if (p_off > skb_headlen(skb)) return -EINVAL; } else { /* gso packets without NEEDS_CSUM do not set transport_offset. @@ -92,17 +94,25 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, return -EINVAL; } - if (keys.control.thoff + thlen > skb_headlen(skb) || + p_off = keys.control.thoff + thlen; + if (p_off > skb_headlen(skb) || keys.basic.ip_proto != ip_proto) return -EINVAL; skb_set_transport_header(skb, keys.control.thoff); + } else if (gso_type) { + p_off = thlen; + if (p_off > skb_headlen(skb)) + return -EINVAL; } } if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { u16 gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size); + if (skb->len - p_off <= gso_size) + return -EINVAL; + skb_shinfo(skb)->gso_size = gso_size; skb_shinfo(skb)->gso_type = gso_type; From 591612aa578cd7148b7b9d74869ef40118978389 Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Mon, 25 May 2020 23:25:37 +0200 Subject: [PATCH 35/77] net: usb: qmi_wwan: add Telit LE910C1-EUX composition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for Telit LE910C1-EUX composition 0x1031: tty, tty, tty, rmnet Signed-off-by: Daniele Palmas Acked-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 4bb8552a00d3d..4a2c7355be63d 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1324,6 +1324,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x1bbb, 0x0203, 2)}, /* Alcatel L800MA */ {QMI_FIXED_INTF(0x2357, 0x0201, 4)}, /* TP-LINK HSUPA Modem MA180 */ {QMI_FIXED_INTF(0x2357, 0x9000, 4)}, /* TP-LINK MA260 */ + {QMI_QUIRK_SET_DTR(0x1bc7, 0x1031, 3)}, /* Telit LE910C1-EUX */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1040, 2)}, /* Telit LE922A */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1050, 2)}, /* Telit FN980 */ {QMI_FIXED_INTF(0x1bc7, 0x1100, 3)}, /* Telit ME910 */ From b8056e8434b037fdab08158fea99ed7bc8ef3a74 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 25 May 2020 17:41:17 -0400 Subject: [PATCH 36/77] bnxt_en: Fix accumulation of bp->net_stats_prev. We have logic to maintain network counters across resets by storing the counters in bp->net_stats_prev before reset. But not all resets will clear the counters. Certain resets that don't need to change the number of rings do not clear the counters. The current logic accumulates the counters before all resets, causing big jumps in the counters after some resets, such as ethtool -G. Fix it by only accumulating the counters during reset if the irq_re_init parameter is set. The parameter signifies that all rings and interrupts will be reset and that means that the counters will also be reset. Reported-by: Vijayendra Suman Fixes: b8875ca356f1 ("bnxt_en: Save ring statistics before reset.") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index d1a83716d9345..abb203c019ce0 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -9310,7 +9310,7 @@ static void __bnxt_close_nic(struct bnxt *bp, bool irq_re_init, bnxt_free_skbs(bp); /* Save ring stats before shutdown */ - if (bp->bnapi) + if (bp->bnapi && irq_re_init) bnxt_get_ring_stats(bp, &bp->net_stats_prev); if (irq_re_init) { bnxt_free_irq(bp); From 95ec1f470b976858264d7635a6ef76bc33c3875b Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Mon, 25 May 2020 17:41:18 -0400 Subject: [PATCH 37/77] bnxt_en: Fix return code to "flash_device". When NVRAM directory is not found, return the error code properly as per firmware command failure instead of the hardcode -ENOBUFS. Fixes: 3a707bed13b7 ("bnxt_en: Return -EAGAIN if fw command returns BUSY") Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 34046a6286e8d..360f9a95c1d50 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -2012,11 +2012,12 @@ int bnxt_flash_package_from_file(struct net_device *dev, const char *filename, bnxt_hwrm_fw_set_time(bp); - if (bnxt_find_nvram_item(dev, BNX_DIR_TYPE_UPDATE, - BNX_DIR_ORDINAL_FIRST, BNX_DIR_EXT_NONE, - &index, &item_len, NULL) != 0) { + rc = bnxt_find_nvram_item(dev, BNX_DIR_TYPE_UPDATE, + BNX_DIR_ORDINAL_FIRST, BNX_DIR_EXT_NONE, + &index, &item_len, NULL); + if (rc) { netdev_err(dev, "PKG update area not created in nvram\n"); - return -ENOBUFS; + return rc; } rc = request_firmware(&fw, filename, &dev->dev); From 2a5a8800fa915bd9bc272c91ca64728e6aa84c0a Mon Sep 17 00:00:00 2001 From: Edwin Peer Date: Mon, 25 May 2020 17:41:19 -0400 Subject: [PATCH 38/77] bnxt_en: fix firmware message length endianness The explicit mask and shift is not the appropriate way to parse fields out of a little endian struct. The length field is internally __le16 and the strategy employed only happens to work on little endian machines because the offset used is actually incorrect (length is at offset 6). Also remove the related and no longer used definitions from bnxt.h. Fixes: 845adfe40c2a ("bnxt_en: Improve valid bit checking in firmware response message.") Signed-off-by: Edwin Peer Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 14 ++++---------- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 5 ----- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index abb203c019ce0..58e0d9a781e9a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -4176,14 +4176,12 @@ static int bnxt_hwrm_do_send_msg(struct bnxt *bp, void *msg, u32 msg_len, int i, intr_process, rc, tmo_count; struct input *req = msg; u32 *data = msg; - __le32 *resp_len; u8 *valid; u16 cp_ring_id, len = 0; struct hwrm_err_output *resp = bp->hwrm_cmd_resp_addr; u16 max_req_len = BNXT_HWRM_MAX_REQ_LEN; struct hwrm_short_input short_input = {0}; u32 doorbell_offset = BNXT_GRCPF_REG_CHIMP_COMM_TRIGGER; - u8 *resp_addr = (u8 *)bp->hwrm_cmd_resp_addr; u32 bar_offset = BNXT_GRCPF_REG_CHIMP_COMM; u16 dst = BNXT_HWRM_CHNL_CHIMP; @@ -4201,7 +4199,6 @@ static int bnxt_hwrm_do_send_msg(struct bnxt *bp, void *msg, u32 msg_len, bar_offset = BNXT_GRCPF_REG_KONG_COMM; doorbell_offset = BNXT_GRCPF_REG_KONG_COMM_TRIGGER; resp = bp->hwrm_cmd_kong_resp_addr; - resp_addr = (u8 *)bp->hwrm_cmd_kong_resp_addr; } memset(resp, 0, PAGE_SIZE); @@ -4270,7 +4267,6 @@ static int bnxt_hwrm_do_send_msg(struct bnxt *bp, void *msg, u32 msg_len, tmo_count = HWRM_SHORT_TIMEOUT_COUNTER; timeout = timeout - HWRM_SHORT_MIN_TIMEOUT * HWRM_SHORT_TIMEOUT_COUNTER; tmo_count += DIV_ROUND_UP(timeout, HWRM_MIN_TIMEOUT); - resp_len = (__le32 *)(resp_addr + HWRM_RESP_LEN_OFFSET); if (intr_process) { u16 seq_id = bp->hwrm_intr_seq_id; @@ -4298,9 +4294,8 @@ static int bnxt_hwrm_do_send_msg(struct bnxt *bp, void *msg, u32 msg_len, le16_to_cpu(req->req_type)); return -EBUSY; } - len = (le32_to_cpu(*resp_len) & HWRM_RESP_LEN_MASK) >> - HWRM_RESP_LEN_SFT; - valid = resp_addr + len - 1; + len = le16_to_cpu(resp->resp_len); + valid = ((u8 *)resp) + len - 1; } else { int j; @@ -4311,8 +4306,7 @@ static int bnxt_hwrm_do_send_msg(struct bnxt *bp, void *msg, u32 msg_len, */ if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state)) return -EBUSY; - len = (le32_to_cpu(*resp_len) & HWRM_RESP_LEN_MASK) >> - HWRM_RESP_LEN_SFT; + len = le16_to_cpu(resp->resp_len); if (len) break; /* on first few passes, just barely sleep */ @@ -4334,7 +4328,7 @@ static int bnxt_hwrm_do_send_msg(struct bnxt *bp, void *msg, u32 msg_len, } /* Last byte of resp contains valid bit */ - valid = resp_addr + len - 1; + valid = ((u8 *)resp) + len - 1; for (j = 0; j < HWRM_VALID_BIT_DELAY_USEC; j++) { /* make sure we read from updated DMA memory */ dma_rmb(); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index f6a3250ef1c55..3d39638521d6c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -656,11 +656,6 @@ struct nqe_cn { #define HWRM_CMD_TIMEOUT (bp->hwrm_cmd_timeout) #define HWRM_RESET_TIMEOUT ((HWRM_CMD_TIMEOUT) * 4) #define HWRM_COREDUMP_TIMEOUT ((HWRM_CMD_TIMEOUT) * 12) -#define HWRM_RESP_ERR_CODE_MASK 0xffff -#define HWRM_RESP_LEN_OFFSET 4 -#define HWRM_RESP_LEN_MASK 0xffff0000 -#define HWRM_RESP_LEN_SFT 16 -#define HWRM_RESP_VALID_MASK 0xff000000 #define BNXT_HWRM_REQ_MAX_SIZE 128 #define BNXT_HWRM_REQS_PER_PAGE (BNXT_PAGE_SIZE / \ BNXT_HWRM_REQ_MAX_SIZE) From 9a233d329ec5cd2dd64737d24018c1463ad88f26 Mon Sep 17 00:00:00 2001 From: Chris Packham Date: Tue, 26 May 2020 10:55:59 +1200 Subject: [PATCH 39/77] net: sctp: Fix spelling in Kconfig help Change 'handeled' to 'handled' in the Kconfig help for SCTP. Signed-off-by: Chris Packham Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig index 6e2eb1dd64ed0..68934438ee19c 100644 --- a/net/sctp/Kconfig +++ b/net/sctp/Kconfig @@ -31,7 +31,7 @@ menuconfig IP_SCTP homing at either or both ends of an association." To compile this protocol support as a module, choose M here: the - module will be called sctp. Debug messages are handeled by the + module will be called sctp. Debug messages are handled by the kernel's dynamic debugging framework. If in doubt, say N. From a4976a3ef844c510ae9120290b23e9f3f47d6bce Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 May 2020 17:28:56 -0700 Subject: [PATCH 40/77] crypto: chelsio/chtls: properly set tp->lsndtime TCP tp->lsndtime unit/base is tcp_jiffies32, not tcp_time_stamp() Fixes: 36bedb3f2e5b ("crypto: chtls - Inline TLS record Tx") Signed-off-by: Eric Dumazet Cc: Ayush Sawal Cc: Vinay Kumar Yadav Signed-off-by: David S. Miller --- drivers/crypto/chelsio/chtls/chtls_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/chelsio/chtls/chtls_io.c b/drivers/crypto/chelsio/chtls/chtls_io.c index dccef3a2908b3..e1401d9cc756c 100644 --- a/drivers/crypto/chelsio/chtls/chtls_io.c +++ b/drivers/crypto/chelsio/chtls/chtls_io.c @@ -682,7 +682,7 @@ int chtls_push_frames(struct chtls_sock *csk, int comp) make_tx_data_wr(sk, skb, immdlen, len, credits_needed, completion); tp->snd_nxt += len; - tp->lsndtime = tcp_time_stamp(tp); + tp->lsndtime = tcp_jiffies32; if (completion) ULP_SKB_CB(skb)->flags &= ~ULPCB_FLAG_NEED_HDR; } else { From 46c1e0621a72e0469ec4edfdb6ed4d387ec34f8a Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 27 May 2020 01:10:39 -0700 Subject: [PATCH 41/77] netfilter: conntrack: Pass value of ctinfo to __nf_conntrack_update Clang warns: net/netfilter/nf_conntrack_core.c:2068:21: warning: variable 'ctinfo' is uninitialized when used here [-Wuninitialized] nf_ct_set(skb, ct, ctinfo); ^~~~~~ net/netfilter/nf_conntrack_core.c:2024:2: note: variable 'ctinfo' is declared here enum ip_conntrack_info ctinfo; ^ 1 warning generated. nf_conntrack_update was split up into nf_conntrack_update and __nf_conntrack_update, where the assignment of ctinfo is in nf_conntrack_update but it is used in __nf_conntrack_update. Pass the value of ctinfo from nf_conntrack_update to __nf_conntrack_update so that uninitialized memory is not used and everything works properly. Fixes: ee04805ff54a ("netfilter: conntrack: make conntrack userspace helpers work again") Link: https://github.com/ClangBuiltLinux/linux/issues/1039 Signed-off-by: Nathan Chancellor Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 08e0c19f6b396..e3b054a2f796f 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2017,11 +2017,11 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) } static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, - struct nf_conn *ct) + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) { struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; - enum ip_conntrack_info ctinfo; struct nf_nat_hook *nat_hook; unsigned int status; int dataoff; @@ -2146,7 +2146,7 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb) return 0; if (!nf_ct_is_confirmed(ct)) { - err = __nf_conntrack_update(net, skb, ct); + err = __nf_conntrack_update(net, skb, ct, ctinfo); if (err < 0) return err; } From 94945ad2b330207cded0fd8d4abebde43a776dfb Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 27 May 2020 12:17:34 +0200 Subject: [PATCH 42/77] netfilter: conntrack: comparison of unsigned in cthelper confirmation net/netfilter/nf_conntrack_core.c: In function nf_confirm_cthelper: net/netfilter/nf_conntrack_core.c:2117:15: warning: comparison of unsigned expression in < 0 is always false [-Wtype-limits] 2117 | if (protoff < 0 || (frag_off & htons(~0x7)) != 0) | ^ ipv6_skip_exthdr() returns a signed integer. Reported-by: Colin Ian King Fixes: 703acd70f249 ("netfilter: nfnetlink_cthelper: unbreak userspace helper support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index e3b054a2f796f..bb72ca5f3999a 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2092,7 +2092,7 @@ static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, { const struct nf_conntrack_helper *helper; const struct nf_conn_help *help; - unsigned int protoff; + int protoff; help = nfct_help(ct); if (!help) From 4946ea5c1237036155c3b3a24f049fd5f849f8f6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 27 May 2020 12:24:10 +0200 Subject: [PATCH 43/77] netfilter: nf_conntrack_pptp: fix compilation warning with W=1 build >> include/linux/netfilter/nf_conntrack_pptp.h:13:20: warning: 'const' type qualifier on return type has no effect [-Wignored-qualifiers] extern const char *const pptp_msg_name(u_int16_t msg); ^~~~~~ Reported-by: kbuild test robot Fixes: 4c559f15efcc ("netfilter: nf_conntrack_pptp: prevent buffer overflows in debug code") Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_pptp.h | 2 +- net/netfilter/nf_conntrack_pptp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/netfilter/nf_conntrack_pptp.h b/include/linux/netfilter/nf_conntrack_pptp.h index 6a4ff6d5ebc29..a28aa289afdca 100644 --- a/include/linux/netfilter/nf_conntrack_pptp.h +++ b/include/linux/netfilter/nf_conntrack_pptp.h @@ -10,7 +10,7 @@ #include #include -extern const char *const pptp_msg_name(u_int16_t msg); +const char *pptp_msg_name(u_int16_t msg); /* state of the control session */ enum pptp_ctrlsess_state { diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 7ad247784cfaa..1f44d523b5121 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -91,7 +91,7 @@ static const char *const pptp_msg_name_array[PPTP_MSG_MAX + 1] = { [PPTP_SET_LINK_INFO] = "SET_LINK_INFO" }; -const char *const pptp_msg_name(u_int16_t msg) +const char *pptp_msg_name(u_int16_t msg) { if (msg > PPTP_MSG_MAX) return pptp_msg_name_array[0]; From bb2f930d6dd708469a587dc9ed1efe1ef969c0bf Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 27 May 2020 02:04:26 +0200 Subject: [PATCH 44/77] net/sched: fix infinite loop in sch_fq_pie this command hangs forever: # tc qdisc add dev eth0 root fq_pie flows 65536 watchdog: BUG: soft lockup - CPU#1 stuck for 23s! [tc:1028] [...] CPU: 1 PID: 1028 Comm: tc Not tainted 5.7.0-rc6+ #167 RIP: 0010:fq_pie_init+0x60e/0x8b7 [sch_fq_pie] Code: 4c 89 65 50 48 89 f8 48 c1 e8 03 42 80 3c 30 00 0f 85 2a 02 00 00 48 8d 7d 10 4c 89 65 58 48 89 f8 48 c1 e8 03 42 80 3c 30 00 <0f> 85 a7 01 00 00 48 8d 7d 18 48 c7 45 10 46 c3 23 00 48 89 f8 48 RSP: 0018:ffff888138d67468 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13 RAX: 1ffff9200018d2b2 RBX: ffff888139c1c400 RCX: ffffffffffffffff RDX: 000000000000c5e8 RSI: ffffc900000e5000 RDI: ffffc90000c69590 RBP: ffffc90000c69580 R08: fffffbfff79a9699 R09: fffffbfff79a9699 R10: 0000000000000700 R11: fffffbfff79a9698 R12: ffffc90000c695d0 R13: 0000000000000000 R14: dffffc0000000000 R15: 000000002347c5e8 FS: 00007f01e1850e40(0000) GS:ffff88814c880000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000067c340 CR3: 000000013864c000 CR4: 0000000000340ee0 Call Trace: qdisc_create+0x3fd/0xeb0 tc_modify_qdisc+0x3be/0x14a0 rtnetlink_rcv_msg+0x5f3/0x920 netlink_rcv_skb+0x121/0x350 netlink_unicast+0x439/0x630 netlink_sendmsg+0x714/0xbf0 sock_sendmsg+0xe2/0x110 ____sys_sendmsg+0x5b4/0x890 ___sys_sendmsg+0xe9/0x160 __sys_sendmsg+0xd3/0x170 do_syscall_64+0x9a/0x370 entry_SYSCALL_64_after_hwframe+0x44/0xa9 we can't accept 65536 as a valid number for 'nflows', because the loop on 'idx' in fq_pie_init() will never end. The extack message is correct, but it doesn't say that 0 is not a valid number for 'flows': while at it, fix this also. Add a tdc selftest to check correct validation of 'flows'. CC: Ivan Vecera Fixes: ec97ecf1ebe4 ("net: sched: add Flow Queue PIE packet scheduler") Signed-off-by: Davide Caratti Reviewed-by: Ivan Vecera Signed-off-by: David S. Miller --- net/sched/sch_fq_pie.c | 4 ++-- .../tc-testing/tc-tests/qdiscs/fq_pie.json | 21 +++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index a9da8776bf5b5..fb760cee824e4 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -297,9 +297,9 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, goto flow_error; } q->flows_cnt = nla_get_u32(tb[TCA_FQ_PIE_FLOWS]); - if (!q->flows_cnt || q->flows_cnt > 65536) { + if (!q->flows_cnt || q->flows_cnt >= 65536) { NL_SET_ERR_MSG_MOD(extack, - "Number of flows must be < 65536"); + "Number of flows must range in [1..65535]"); goto flow_error; } } diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json new file mode 100644 index 0000000000000..1cda2e11b3ad9 --- /dev/null +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json @@ -0,0 +1,21 @@ +[ + { + "id": "83be", + "name": "Create FQ-PIE with invalid number of flows", + "category": [ + "qdisc", + "fq_pie" + ], + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY root fq_pie flows 65536", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "qdisc", + "matchCount": "0", + "teardown": [ + "$IP link del dev $DUMMY" + ] + } +] From 5b186cd60f033110960a3db424ffbd6de4cee528 Mon Sep 17 00:00:00 2001 From: Heinrich Kuhn Date: Wed, 27 May 2020 09:44:20 +0200 Subject: [PATCH 45/77] nfp: flower: fix used time of merge flow statistics Prior to this change the correct value for the used counter is calculated but not stored nor, therefore, propagated to user-space. In use-cases such as OVS use-case at least this results in active flows being removed from the hardware datapath. Which results in both unnecessary flow tear-down and setup, and packet processing on the host. This patch addresses the problem by saving the calculated used value which allows the value to propagate to user-space. Found by inspection. Fixes: aa6ce2ea0c93 ("nfp: flower: support stats update for merge flows") Signed-off-by: Heinrich Kuhn Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/flower/offload.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index c694dbc239d01..6b60771ccb195 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -1440,7 +1440,8 @@ __nfp_flower_update_merge_stats(struct nfp_app *app, ctx_id = be32_to_cpu(sub_flow->meta.host_ctx_id); priv->stats[ctx_id].pkts += pkts; priv->stats[ctx_id].bytes += bytes; - max_t(u64, priv->stats[ctx_id].used, used); + priv->stats[ctx_id].used = max_t(u64, used, + priv->stats[ctx_id].used); } } From 7e0afbdfd13d1e708fe96e31c46c4897101a6a43 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Wed, 27 May 2020 09:56:55 +0200 Subject: [PATCH 46/77] vsock: fix timeout in vsock_accept() The accept(2) is an "input" socket interface, so we should use SO_RCVTIMEO instead of SO_SNDTIMEO to set the timeout. So this patch replace sock_sndtimeo() with sock_rcvtimeo() to use the right timeout in the vsock_accept(). Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Signed-off-by: Stefano Garzarella Reviewed-by: Jorgen Hansen Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index a5f28708e0e75..626bf9044418c 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1408,7 +1408,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags, /* Wait for children sockets to appear; these are the new sockets * created upon connection establishment. */ - timeout = sock_sndtimeo(listener, flags & O_NONBLOCK); + timeout = sock_rcvtimeo(listener, flags & O_NONBLOCK); prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); while ((connected = vsock_dequeue_accept(listener)) == NULL && From b3b6a84c6a920c60fd1393c43818b3955441424b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 27 May 2020 15:51:13 +0200 Subject: [PATCH 47/77] bridge: multicast: work around clang bug Clang-10 and clang-11 run into a corner case of the register allocator on 32-bit ARM, leading to excessive stack usage from register spilling: net/bridge/br_multicast.c:2422:6: error: stack frame size of 1472 bytes in function 'br_multicast_get_stats' [-Werror,-Wframe-larger-than=] Work around this by marking one of the internal functions as noinline_for_stack. Link: https://bugs.llvm.org/show_bug.cgi?id=45802#c9 Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index ad12fe3fca8cf..83490bf73a13b 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -2413,7 +2413,8 @@ void br_multicast_uninit_stats(struct net_bridge *br) free_percpu(br->mcast_stats); } -static void mcast_stats_add_dir(u64 *dst, u64 *src) +/* noinline for https://bugs.llvm.org/show_bug.cgi?id=45802#c9 */ +static noinline_for_stack void mcast_stats_add_dir(u64 *dst, u64 *src) { dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX]; dst[BR_MCAST_DIR_TX] += src[BR_MCAST_DIR_TX]; From 183be6f967fe37c3154bfac39e913c3bafe89d1b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 27 May 2020 19:48:03 +0300 Subject: [PATCH 48/77] net: dsa: felix: send VLANs on CPU port as egress-tagged As explained in other commits before (b9cd75e66895 and 87b0f983f66f), ocelot switches have a single egress-untagged VLAN per port, and the driver would deny adding a second one while an egress-untagged VLAN already exists. But on the CPU port (where the VLAN configuration is implicit, because there is no net device for the bridge to control), the DSA core attempts to add a VLAN using the same flags as were used for the front-panel port. This would make adding any untagged VLAN fail due to the CPU port rejecting the configuration: bridge vlan add dev swp0 vid 100 pvid untagged [ 1865.854253] mscc_felix 0000:00:00.5: Port already has a native VLAN: 1 [ 1865.860824] mscc_felix 0000:00:00.5: Failed to add VLAN 100 to port 5: -16 (note that port 5 is the CPU port and not the front-panel swp0). So this hardware will send all VLANs as tagged towards the CPU. Fixes: 56051948773e ("net: dsa: ocelot: add driver for Felix switch family") Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index e8aae64db1caa..e113269c220a0 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -102,13 +102,17 @@ static void felix_vlan_add(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { struct ocelot *ocelot = ds->priv; + u16 flags = vlan->flags; u16 vid; int err; + if (dsa_is_cpu_port(ds, port)) + flags &= ~BRIDGE_VLAN_INFO_UNTAGGED; + for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { err = ocelot_vlan_add(ocelot, port, vid, - vlan->flags & BRIDGE_VLAN_INFO_PVID, - vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED); + flags & BRIDGE_VLAN_INFO_PVID, + flags & BRIDGE_VLAN_INFO_UNTAGGED); if (err) { dev_err(ds->dev, "Failed to add VLAN %d to port %d: %d\n", vid, port, err); From 2b86cb8299765688c5119fd18d5f436716c81010 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 27 May 2020 21:08:05 +0300 Subject: [PATCH 49/77] net: dsa: declare lockless TX feature for slave ports Be there a platform with the following layout: Regular NIC | +----> DSA master for switch port | +----> DSA master for another switch port After changing DSA back to static lockdep class keys in commit 1a33e10e4a95 ("net: partially revert dynamic lockdep key changes"), this kernel splat can be seen: [ 13.361198] ============================================ [ 13.366524] WARNING: possible recursive locking detected [ 13.371851] 5.7.0-rc4-02121-gc32a05ecd7af-dirty #988 Not tainted [ 13.377874] -------------------------------------------- [ 13.383201] swapper/0/0 is trying to acquire lock: [ 13.388004] ffff0000668ff298 (&dsa_slave_netdev_xmit_lock_key){+.-.}-{2:2}, at: __dev_queue_xmit+0x84c/0xbe0 [ 13.397879] [ 13.397879] but task is already holding lock: [ 13.403727] ffff0000661a1698 (&dsa_slave_netdev_xmit_lock_key){+.-.}-{2:2}, at: __dev_queue_xmit+0x84c/0xbe0 [ 13.413593] [ 13.413593] other info that might help us debug this: [ 13.420140] Possible unsafe locking scenario: [ 13.420140] [ 13.426075] CPU0 [ 13.428523] ---- [ 13.430969] lock(&dsa_slave_netdev_xmit_lock_key); [ 13.435946] lock(&dsa_slave_netdev_xmit_lock_key); [ 13.440924] [ 13.440924] *** DEADLOCK *** [ 13.440924] [ 13.446860] May be due to missing lock nesting notation [ 13.446860] [ 13.453668] 6 locks held by swapper/0/0: [ 13.457598] #0: ffff800010003de0 ((&idev->mc_ifc_timer)){+.-.}-{0:0}, at: call_timer_fn+0x0/0x400 [ 13.466593] #1: ffffd4d3fb478700 (rcu_read_lock){....}-{1:2}, at: mld_sendpack+0x0/0x560 [ 13.474803] #2: ffffd4d3fb478728 (rcu_read_lock_bh){....}-{1:2}, at: ip6_finish_output2+0x64/0xb10 [ 13.483886] #3: ffffd4d3fb478728 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x6c/0xbe0 [ 13.492793] #4: ffff0000661a1698 (&dsa_slave_netdev_xmit_lock_key){+.-.}-{2:2}, at: __dev_queue_xmit+0x84c/0xbe0 [ 13.503094] #5: ffffd4d3fb478728 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x6c/0xbe0 [ 13.512000] [ 13.512000] stack backtrace: [ 13.516369] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.7.0-rc4-02121-gc32a05ecd7af-dirty #988 [ 13.530421] Call trace: [ 13.532871] dump_backtrace+0x0/0x1d8 [ 13.536539] show_stack+0x24/0x30 [ 13.539862] dump_stack+0xe8/0x150 [ 13.543271] __lock_acquire+0x1030/0x1678 [ 13.547290] lock_acquire+0xf8/0x458 [ 13.550873] _raw_spin_lock+0x44/0x58 [ 13.554543] __dev_queue_xmit+0x84c/0xbe0 [ 13.558562] dev_queue_xmit+0x24/0x30 [ 13.562232] dsa_slave_xmit+0xe0/0x128 [ 13.565988] dev_hard_start_xmit+0xf4/0x448 [ 13.570182] __dev_queue_xmit+0x808/0xbe0 [ 13.574200] dev_queue_xmit+0x24/0x30 [ 13.577869] neigh_resolve_output+0x15c/0x220 [ 13.582237] ip6_finish_output2+0x244/0xb10 [ 13.586430] __ip6_finish_output+0x1dc/0x298 [ 13.590709] ip6_output+0x84/0x358 [ 13.594116] mld_sendpack+0x2bc/0x560 [ 13.597786] mld_ifc_timer_expire+0x210/0x390 [ 13.602153] call_timer_fn+0xcc/0x400 [ 13.605822] run_timer_softirq+0x588/0x6e0 [ 13.609927] __do_softirq+0x118/0x590 [ 13.613597] irq_exit+0x13c/0x148 [ 13.616918] __handle_domain_irq+0x6c/0xc0 [ 13.621023] gic_handle_irq+0x6c/0x160 [ 13.624779] el1_irq+0xbc/0x180 [ 13.627927] cpuidle_enter_state+0xb4/0x4d0 [ 13.632120] cpuidle_enter+0x3c/0x50 [ 13.635703] call_cpuidle+0x44/0x78 [ 13.639199] do_idle+0x228/0x2c8 [ 13.642433] cpu_startup_entry+0x2c/0x48 [ 13.646363] rest_init+0x1ac/0x280 [ 13.649773] arch_call_rest_init+0x14/0x1c [ 13.653878] start_kernel+0x490/0x4bc Lockdep keys themselves were added in commit ab92d68fc22f ("net: core: add generic lockdep keys"), and it's very likely that this splat existed since then, but I have no real way to check, since this stacked platform wasn't supported by mainline back then. >From Taehee's own words: This patch was considered that all stackable devices have LLTX flag. But the dsa doesn't have LLTX, so this splat happened. After this patch, dsa shares the same lockdep class key. On the nested dsa interface architecture, which you illustrated, the same lockdep class key will be used in __dev_queue_xmit() because dsa doesn't have LLTX. So that lockdep detects deadlock because the same lockdep class key is used recursively although actually the different locks are used. There are some ways to fix this problem. 1. using NETIF_F_LLTX flag. If possible, using the LLTX flag is a very clear way for it. But I'm so sorry I don't know whether the dsa could have LLTX or not. 2. using dynamic lockdep again. It means that each interface uses a separate lockdep class key. So, lockdep will not detect recursive locking. But this way has a problem that it could consume lockdep class key too many. Currently, lockdep can have 8192 lockdep class keys. - you can see this number with the following command. cat /proc/lockdep_stats lock-classes: 1251 [max: 8192] ... The [max: 8192] means that the maximum number of lockdep class keys. If too many lockdep class keys are registered, lockdep stops to work. So, using a dynamic(separated) lockdep class key should be considered carefully. In addition, updating lockdep class key routine might have to be existing. (lockdep_register_key(), lockdep_set_class(), lockdep_unregister_key()) 3. Using lockdep subclass. A lockdep class key could have 8 subclasses. The different subclass is considered different locks by lockdep infrastructure. But "lock-classes" is not counted by subclasses. So, it could avoid stopping lockdep infrastructure by an overflow of lockdep class keys. This approach should also have an updating lockdep class key routine. (lockdep_set_subclass()) 4. Using nonvalidate lockdep class key. The lockdep infrastructure supports nonvalidate lockdep class key type. It means this lockdep is not validated by lockdep infrastructure. So, the splat will not happen but lockdep couldn't detect real deadlock case because lockdep really doesn't validate it. I think this should be used for really special cases. (lockdep_set_novalidate_class()) Further discussion here: https://patchwork.ozlabs.org/project/netdev/patch/20200503052220.4536-2-xiyou.wangcong@gmail.com/ There appears to be no negative side-effect to declaring lockless TX for the DSA virtual interfaces, which means they handle their own locking. So that's what we do to make the splat go away. Patch tested in a wide variety of cases: unicast, multicast, PTP, etc. Fixes: ab92d68fc22f ("net: core: add generic lockdep keys") Suggested-by: Taehee Yoo Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 62f4ee3da172a..d3bcb9afa795a 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1736,6 +1736,7 @@ int dsa_slave_create(struct dsa_port *port) if (ds->ops->port_vlan_add && ds->ops->port_vlan_del) slave_dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; slave_dev->hw_features |= NETIF_F_HW_TC; + slave_dev->features |= NETIF_F_LLTX; slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; if (!IS_ERR_OR_NULL(port->mac)) ether_addr_copy(slave_dev->dev_addr, port->mac); From d195b1d1d1196681ac4775e0361e9cca70f740c2 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 27 May 2020 14:28:44 +0200 Subject: [PATCH 50/77] powerpc/bpf: Enable bpf_probe_read{, str}() on powerpc again The commit 0ebeea8ca8a4d1d453a ("bpf: Restrict bpf_probe_read{, str}() only to archs where they work") caused that bpf_probe_read{, str}() functions were not longer available on architectures where the same logical address might have different content in kernel and user memory mapping. These architectures should use probe_read_{user,kernel}_str helpers. For backward compatibility, the problematic functions are still available on architectures where the user and kernel address spaces are not overlapping. This is defined CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE. At the moment, these backward compatible functions are enabled only on x86_64, arm, and arm64. Let's do it also on powerpc that has the non overlapping address space as well. Fixes: 0ebeea8ca8a4 ("bpf: Restrict bpf_probe_read{, str}() only to archs where they work") Signed-off-by: Petr Mladek Signed-off-by: Daniel Borkmann Acked-by: Michael Ellerman Link: https://lore.kernel.org/lkml/20200527122844.19524-1-pmladek@suse.com --- arch/powerpc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 924c541a92600..cb042a4ff19ae 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -126,6 +126,7 @@ config PPC select ARCH_HAS_MMIOWB if PPC64 select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_PMEM_API + select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PTE_DEVMAP if PPC_BOOK3S_64 select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_MEMBARRIER_CALLBACKS From a068aab42258e25094bc2c159948d263ed7d7a77 Mon Sep 17 00:00:00 2001 From: Qiushi Wu Date: Wed, 27 May 2020 22:10:29 -0500 Subject: [PATCH 51/77] bonding: Fix reference count leak in bond_sysfs_slave_add. kobject_init_and_add() takes reference even when it fails. If this function returns an error, kobject_put() must be called to properly clean up the memory associated with the object. Previous commit "b8eb718348b8" fixed a similar problem. Fixes: 07699f9a7c8d ("bonding: add sysfs /slave dir for bond slave devices.") Signed-off-by: Qiushi Wu Acked-by: Jay Vosburgh Signed-off-by: David S. Miller --- drivers/net/bonding/bond_sysfs_slave.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_sysfs_slave.c b/drivers/net/bonding/bond_sysfs_slave.c index 007481557191f..9b8346638f697 100644 --- a/drivers/net/bonding/bond_sysfs_slave.c +++ b/drivers/net/bonding/bond_sysfs_slave.c @@ -149,8 +149,10 @@ int bond_sysfs_slave_add(struct slave *slave) err = kobject_init_and_add(&slave->kobj, &slave_ktype, &(slave->dev->dev.kobj), "bonding_slave"); - if (err) + if (err) { + kobject_put(&slave->kobj); return err; + } for (a = slave_attrs; *a; ++a) { err = sysfs_create_file(&slave->kobj, &((*a)->attr)); From 45ebf73ebcec88a34a778f5feaa0b82b1c76069e Mon Sep 17 00:00:00 2001 From: Jonas Falkevik Date: Wed, 27 May 2020 11:56:40 +0200 Subject: [PATCH 52/77] sctp: check assoc before SCTP_ADDR_{MADE_PRIM, ADDED} event Make sure SCTP_ADDR_{MADE_PRIM,ADDED} are sent only for associations that have been established. These events are described in rfc6458#section-6.1 SCTP_PEER_ADDR_CHANGE: This tag indicates that an address that is part of an existing association has experienced a change of state (e.g., a failure or return to service of the reachability of an endpoint via a specific transport address). Signed-off-by: Jonas Falkevik Acked-by: Marcelo Ricardo Leitner Reviewed-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/ulpevent.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index c82dbdcf13f2f..77d5c36a8991c 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -343,6 +343,9 @@ void sctp_ulpevent_nofity_peer_addr_change(struct sctp_transport *transport, struct sockaddr_storage addr; struct sctp_ulpevent *event; + if (asoc->state < SCTP_STATE_ESTABLISHED) + return; + memset(&addr, 0, sizeof(struct sockaddr_storage)); memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len); From 7c6d2ecbda83150b2036a2b36b21381ad4667762 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 May 2020 14:57:47 -0700 Subject: [PATCH 53/77] net: be more gentle about silly gso requests coming from user Recent change in virtio_net_hdr_to_skb() broke some packetdrill tests. When --mss=XXX option is set, packetdrill always provide gso_type & gso_size for its inbound packets, regardless of packet size. if (packet->tcp && packet->mss) { if (packet->ipv4) gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6; gso.gso_size = packet->mss; } Since many other programs could do the same, relax virtio_net_hdr_to_skb() to no longer return an error, but instead ignore gso settings. This keeps Willem intent to make sure no malicious packet could reach gso stack. Note that TCP stack has a special logic in tcp_set_skb_tso_segs() to clear gso_size for small packets. Fixes: 6dd912f82680 ("net: check untrusted gso_size at kernel entry") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/virtio_net.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 88997022a4b5b..e8a924eeea3d0 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -109,16 +109,17 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { u16 gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size); + struct skb_shared_info *shinfo = skb_shinfo(skb); - if (skb->len - p_off <= gso_size) - return -EINVAL; - - skb_shinfo(skb)->gso_size = gso_size; - skb_shinfo(skb)->gso_type = gso_type; + /* Too small packets are not really GSO ones. */ + if (skb->len - p_off > gso_size) { + shinfo->gso_size = gso_size; + shinfo->gso_type = gso_type; - /* Header must be checked, and gso_segs computed. */ - skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; - skb_shinfo(skb)->gso_segs = 0; + /* Header must be checked, and gso_segs computed. */ + shinfo->gso_type |= SKB_GSO_DODGY; + shinfo->gso_segs = 0; + } } return 0; From f6a23d85d078c2ffde79c66ca81d0a1dde451649 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 26 May 2020 17:41:46 +0800 Subject: [PATCH 54/77] xfrm: fix a NULL-ptr deref in xfrm_local_error This patch is to fix a crash: [ ] kasan: GPF could be caused by NULL-ptr deref or user memory access [ ] general protection fault: 0000 [#1] SMP KASAN PTI [ ] RIP: 0010:ipv6_local_error+0xac/0x7a0 [ ] Call Trace: [ ] xfrm6_local_error+0x1eb/0x300 [ ] xfrm_local_error+0x95/0x130 [ ] __xfrm6_output+0x65f/0xb50 [ ] xfrm6_output+0x106/0x46f [ ] udp_tunnel6_xmit_skb+0x618/0xbf0 [ip6_udp_tunnel] [ ] vxlan_xmit_one+0xbc6/0x2c60 [vxlan] [ ] vxlan_xmit+0x6a0/0x4276 [vxlan] [ ] dev_hard_start_xmit+0x165/0x820 [ ] __dev_queue_xmit+0x1ff0/0x2b90 [ ] ip_finish_output2+0xd3e/0x1480 [ ] ip_do_fragment+0x182d/0x2210 [ ] ip_output+0x1d0/0x510 [ ] ip_send_skb+0x37/0xa0 [ ] raw_sendmsg+0x1b4c/0x2b80 [ ] sock_sendmsg+0xc0/0x110 This occurred when sending a v4 skb over vxlan6 over ipsec, in which case skb->protocol == htons(ETH_P_IPV6) while skb->sk->sk_family == AF_INET in xfrm_local_error(). Then it will go to xfrm6_local_error() where it tries to get ipv6 info from a ipv4 sk. This issue was actually fixed by Commit 628e341f319f ("xfrm: make local error reporting more robust"), but brought back by Commit 844d48746e4b ("xfrm: choose protocol family by skb protocol"). So to fix it, we should call xfrm6_local_error() only when skb->protocol is htons(ETH_P_IPV6) and skb->sk->sk_family is AF_INET6. Fixes: 844d48746e4b ("xfrm: choose protocol family by skb protocol") Reported-by: Xiumei Mu Signed-off-by: Xin Long Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_output.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 69c33cab8f49a..69c4900db8172 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -642,7 +642,8 @@ void xfrm_local_error(struct sk_buff *skb, int mtu) if (skb->protocol == htons(ETH_P_IP)) proto = AF_INET; - else if (skb->protocol == htons(ETH_P_IPV6)) + else if (skb->protocol == htons(ETH_P_IPV6) && + skb->sk->sk_family == AF_INET6) proto = AF_INET6; else return; From 8fc3e29be9248048f449793502c15af329f35c6e Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Wed, 20 May 2020 17:32:08 +0000 Subject: [PATCH 55/77] net/mlx5: Fix crash upon suspend/resume Currently a Linux system with the mlx5 NIC always crashes upon hibernation - suspend/resume. Add basic callbacks so the NIC could be suspended and resumed. Fixes: 9603b61de1ee ("mlx5: Move pci device handling from mlx5_ib to mlx5_core") Tested-by: Dexuan Cui Signed-off-by: Mark Bloch Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index c1618b818f3ab..17f818a540903 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1549,6 +1549,22 @@ static void shutdown(struct pci_dev *pdev) mlx5_pci_disable_device(dev); } +static int mlx5_suspend(struct pci_dev *pdev, pm_message_t state) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + + mlx5_unload_one(dev, false); + + return 0; +} + +static int mlx5_resume(struct pci_dev *pdev) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + + return mlx5_load_one(dev, false); +} + static const struct pci_device_id mlx5_core_pci_table[] = { { PCI_VDEVICE(MELLANOX, PCI_DEVICE_ID_MELLANOX_CONNECTIB) }, { PCI_VDEVICE(MELLANOX, 0x1012), MLX5_PCI_DEV_IS_VF}, /* Connect-IB VF */ @@ -1592,6 +1608,8 @@ static struct pci_driver mlx5_core_driver = { .id_table = mlx5_core_pci_table, .probe = init_one, .remove = remove_one, + .suspend = mlx5_suspend, + .resume = mlx5_resume, .shutdown = shutdown, .err_handler = &mlx5_err_handler, .sriov_configure = mlx5_core_sriov_configure, From 0a2a6f498fa060cc0d592d56148da856e9d77bd8 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Wed, 27 May 2020 21:46:09 +0300 Subject: [PATCH 56/77] net/mlx5e: Fix stats update for matchall classifier It's bytes, packets, lastused. Fixes: fcb64c0f5640 ("net/mlx5: E-Switch, add ingress rate support") Signed-off-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 5bcf95fcdd59f..cac36c27c7fa4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -4614,7 +4614,7 @@ void mlx5e_tc_stats_matchall(struct mlx5e_priv *priv, dpkts = cur_stats.rx_packets - rpriv->prev_vf_vport_stats.rx_packets; dbytes = cur_stats.rx_bytes - rpriv->prev_vf_vport_stats.rx_bytes; rpriv->prev_vf_vport_stats = cur_stats; - flow_stats_update(&ma->stats, dpkts, dbytes, jiffies, + flow_stats_update(&ma->stats, dbytes, dpkts, jiffies, FLOW_ACTION_HW_STATS_DELAYED); } From 20300aafa7a2719f71d50f97a8846459d9869b75 Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Sun, 24 May 2020 09:45:44 +0300 Subject: [PATCH 57/77] net/mlx5e: Remove warning "devices are not on same switch HW" On tunnel decap rule insertion, the indirect mechanism will attempt to offload the rule on all uplink representors which will trigger the "devices are not on same switch HW, can't offload forwarding" message for the uplink which isn't on the same switch HW as the VF representor. The above flow is valid and shouldn't cause warning message, fix by removing the warning and only report this flow using extack. Fixes: 321348475d54 ("net/mlx5e: Fix allowed tc redirect merged eswitch offload cases") Signed-off-by: Maor Dickman Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index cac36c27c7fa4..6e7b2ce29d411 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -3849,10 +3849,6 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, if (!mlx5e_is_valid_eswitch_fwd_dev(priv, out_dev)) { NL_SET_ERR_MSG_MOD(extack, "devices are not on same switch HW, can't offload forwarding"); - netdev_warn(priv->netdev, - "devices %s %s not on same switch HW, can't offload forwarding\n", - priv->netdev->name, - out_dev->name); return -EOPNOTSUPP; } From b623603bbb473e7e19af358b98335e63bebb2eb5 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Mon, 13 Apr 2020 11:31:00 +0300 Subject: [PATCH 58/77] net/mlx5e: Fix arch depending casting issue in FEC Change type of active_fec to u32 to match the type expected by mlx5e_get_fec_mode. Copy active_fec and configured_fec values to unsigned long before preforming bitwise manipulations. Take the same approach when configuring FEC over 50G link modes: copy the policy into an unsigned long and only than preform bitwise operations. Fixes: 2132b71f78d2 ("net/mlx5e: Advertise globaly supported FEC modes") Signed-off-by: Aya Levin Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en/port.c | 24 ++++++++++--------- .../ethernet/mellanox/mlx5/core/en_ethtool.c | 20 +++++++++------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c index 2c4a670c8ffd4..2a8950b3056f9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c @@ -369,17 +369,19 @@ enum mlx5e_fec_supported_link_mode { *_policy = MLX5_GET(pplm_reg, _buf, fec_override_admin_##link); \ } while (0) -#define MLX5E_FEC_OVERRIDE_ADMIN_50G_POLICY(buf, policy, write, link) \ - do { \ - u16 *__policy = &(policy); \ - bool _write = (write); \ - \ - if (_write && *__policy) \ - *__policy = find_first_bit((u_long *)__policy, \ - sizeof(u16) * BITS_PER_BYTE);\ - MLX5E_FEC_OVERRIDE_ADMIN_POLICY(buf, *__policy, _write, link); \ - if (!_write && *__policy) \ - *__policy = 1 << *__policy; \ +#define MLX5E_FEC_OVERRIDE_ADMIN_50G_POLICY(buf, policy, write, link) \ + do { \ + unsigned long policy_long; \ + u16 *__policy = &(policy); \ + bool _write = (write); \ + \ + policy_long = *__policy; \ + if (_write && *__policy) \ + *__policy = find_first_bit(&policy_long, \ + sizeof(policy_long) * BITS_PER_BYTE);\ + MLX5E_FEC_OVERRIDE_ADMIN_POLICY(buf, *__policy, _write, link); \ + if (!_write && *__policy) \ + *__policy = 1 << *__policy; \ } while (0) /* get/set FEC admin field for a given speed */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 6d703ddee4e27..6f582eb83e54f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -665,11 +665,12 @@ static const u32 pplm_fec_2_ethtool_linkmodes[] = { static int get_fec_supported_advertised(struct mlx5_core_dev *dev, struct ethtool_link_ksettings *link_ksettings) { - u_long active_fec = 0; + unsigned long active_fec_long; + u32 active_fec; u32 bitn; int err; - err = mlx5e_get_fec_mode(dev, (u32 *)&active_fec, NULL); + err = mlx5e_get_fec_mode(dev, &active_fec, NULL); if (err) return (err == -EOPNOTSUPP) ? 0 : err; @@ -682,10 +683,11 @@ static int get_fec_supported_advertised(struct mlx5_core_dev *dev, MLX5E_ADVERTISE_SUPPORTED_FEC(MLX5E_FEC_LLRS_272_257_1, ETHTOOL_LINK_MODE_FEC_LLRS_BIT); + active_fec_long = active_fec; /* active fec is a bit set, find out which bit is set and * advertise the corresponding ethtool bit */ - bitn = find_first_bit(&active_fec, sizeof(u32) * BITS_PER_BYTE); + bitn = find_first_bit(&active_fec_long, sizeof(active_fec_long) * BITS_PER_BYTE); if (bitn < ARRAY_SIZE(pplm_fec_2_ethtool_linkmodes)) __set_bit(pplm_fec_2_ethtool_linkmodes[bitn], link_ksettings->link_modes.advertising); @@ -1517,8 +1519,8 @@ static int mlx5e_get_fecparam(struct net_device *netdev, { struct mlx5e_priv *priv = netdev_priv(netdev); struct mlx5_core_dev *mdev = priv->mdev; - u16 fec_configured = 0; - u32 fec_active = 0; + u16 fec_configured; + u32 fec_active; int err; err = mlx5e_get_fec_mode(mdev, &fec_active, &fec_configured); @@ -1526,14 +1528,14 @@ static int mlx5e_get_fecparam(struct net_device *netdev, if (err) return err; - fecparam->active_fec = pplm2ethtool_fec((u_long)fec_active, - sizeof(u32) * BITS_PER_BYTE); + fecparam->active_fec = pplm2ethtool_fec((unsigned long)fec_active, + sizeof(unsigned long) * BITS_PER_BYTE); if (!fecparam->active_fec) return -EOPNOTSUPP; - fecparam->fec = pplm2ethtool_fec((u_long)fec_configured, - sizeof(u16) * BITS_PER_BYTE); + fecparam->fec = pplm2ethtool_fec((unsigned long)fec_configured, + sizeof(unsigned long) * BITS_PER_BYTE); return 0; } From ebeaf084ad5c0eeaf8ea3314f62cc28cb79d529f Mon Sep 17 00:00:00 2001 From: Tal Gilboa Date: Thu, 23 Apr 2020 13:23:06 +0300 Subject: [PATCH 59/77] net/mlx5e: Properly set default values when disabling adaptive moderation Add a call to mlx5e_reset_rx/tx_moderation() when enabling/disabling adaptive moderation, in order to select the proper default values. In order to do so, we separate the logic of selecting the moderation values and setting moderion mode (CQE/EQE based). Fixes: 0088cbbc4b66 ("net/mlx5e: Enable CQE based moderation on TX CQ") Fixes: 9908aa292971 ("net/mlx5e: CQE based moderation") Signed-off-by: Tal Gilboa Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 10 +++++---- .../ethernet/mellanox/mlx5/core/en_ethtool.c | 21 +++++++++++++++---- .../net/ethernet/mellanox/mlx5/core/en_main.c | 20 ++++++++++++------ 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 59745402747be..0a5aada0f50f9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -1068,10 +1068,12 @@ void mlx5e_deactivate_priv_channels(struct mlx5e_priv *priv); void mlx5e_build_default_indir_rqt(u32 *indirection_rqt, int len, int num_channels); -void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params, - u8 cq_period_mode); -void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, - u8 cq_period_mode); + +void mlx5e_reset_tx_moderation(struct mlx5e_params *params, u8 cq_period_mode); +void mlx5e_reset_rx_moderation(struct mlx5e_params *params, u8 cq_period_mode); +void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode); +void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode); + void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params); void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev, struct mlx5e_params *params); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 6f582eb83e54f..bc290ae80a531 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -527,8 +527,8 @@ int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv, struct dim_cq_moder *rx_moder, *tx_moder; struct mlx5_core_dev *mdev = priv->mdev; struct mlx5e_channels new_channels = {}; + bool reset_rx, reset_tx; int err = 0; - bool reset; if (!MLX5_CAP_GEN(mdev, cq_moderation)) return -EOPNOTSUPP; @@ -566,15 +566,28 @@ int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv, } /* we are opened */ - reset = (!!coal->use_adaptive_rx_coalesce != priv->channels.params.rx_dim_enabled) || - (!!coal->use_adaptive_tx_coalesce != priv->channels.params.tx_dim_enabled); + reset_rx = !!coal->use_adaptive_rx_coalesce != priv->channels.params.rx_dim_enabled; + reset_tx = !!coal->use_adaptive_tx_coalesce != priv->channels.params.tx_dim_enabled; - if (!reset) { + if (!reset_rx && !reset_tx) { mlx5e_set_priv_channels_coalesce(priv, coal); priv->channels.params = new_channels.params; goto out; } + if (reset_rx) { + u8 mode = MLX5E_GET_PFLAG(&new_channels.params, + MLX5E_PFLAG_RX_CQE_BASED_MODER); + + mlx5e_reset_rx_moderation(&new_channels.params, mode); + } + if (reset_tx) { + u8 mode = MLX5E_GET_PFLAG(&new_channels.params, + MLX5E_PFLAG_TX_CQE_BASED_MODER); + + mlx5e_reset_tx_moderation(&new_channels.params, mode); + } + err = mlx5e_safe_switch_channels(priv, &new_channels, NULL, NULL); out: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index c6b83042d4318..bd8d0e0960857 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4716,7 +4716,7 @@ static u8 mlx5_to_net_dim_cq_period_mode(u8 cq_period_mode) DIM_CQ_PERIOD_MODE_START_FROM_EQE; } -void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode) +void mlx5e_reset_tx_moderation(struct mlx5e_params *params, u8 cq_period_mode) { if (params->tx_dim_enabled) { u8 dim_period_mode = mlx5_to_net_dim_cq_period_mode(cq_period_mode); @@ -4725,13 +4725,9 @@ void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode) } else { params->tx_cq_moderation = mlx5e_get_def_tx_moderation(cq_period_mode); } - - MLX5E_SET_PFLAG(params, MLX5E_PFLAG_TX_CQE_BASED_MODER, - params->tx_cq_moderation.cq_period_mode == - MLX5_CQ_PERIOD_MODE_START_FROM_CQE); } -void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode) +void mlx5e_reset_rx_moderation(struct mlx5e_params *params, u8 cq_period_mode) { if (params->rx_dim_enabled) { u8 dim_period_mode = mlx5_to_net_dim_cq_period_mode(cq_period_mode); @@ -4740,7 +4736,19 @@ void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode) } else { params->rx_cq_moderation = mlx5e_get_def_rx_moderation(cq_period_mode); } +} + +void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode) +{ + mlx5e_reset_tx_moderation(params, cq_period_mode); + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_TX_CQE_BASED_MODER, + params->tx_cq_moderation.cq_period_mode == + MLX5_CQ_PERIOD_MODE_START_FROM_CQE); +} +void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode) +{ + mlx5e_reset_rx_moderation(params, cq_period_mode); MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_CQE_BASED_MODER, params->rx_cq_moderation.cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE); From cb9a0641b531ac11cd7e3076de23ceada19b892e Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 25 May 2020 16:57:51 +0300 Subject: [PATCH 60/77] net/mlx5e: Fix MLX5_TC_CT dependencies Change MLX5_TC_CT config dependencies to include MLX5_ESWITCH instead of MLX5_CORE_EN && NET_SWITCHDEV, which are already required by MLX5_ESWITCH. Without this change mlx5 fails to compile if user disables MLX5_ESWITCH without also manually disabling MLX5_TC_CT. Fixes: 4c3844d9e97e ("net/mlx5e: CT: Introduce connection tracking") Signed-off-by: Vlad Buslov Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index 7d69a3061f178..fd375cbe586e1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -80,7 +80,7 @@ config MLX5_ESWITCH config MLX5_TC_CT bool "MLX5 TC connection tracking offload support" - depends on MLX5_CORE_EN && NET_SWITCHDEV && NF_FLOW_TABLE && NET_ACT_CT && NET_TC_SKB_EXT + depends on MLX5_ESWITCH && NF_FLOW_TABLE && NET_ACT_CT && NET_TC_SKB_EXT default y help Say Y here if you want to support offloading connection tracking rules From a683012a8e77675a1947cc8f11f97cdc1d5bb769 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 19 Apr 2020 14:12:35 +0200 Subject: [PATCH 61/77] net/mlx5e: replace EINVAL in mlx5e_flower_parse_meta() The drivers reports EINVAL to userspace through netlink on invalid meta match. This is confusing since EINVAL is usually reserved for malformed netlink messages. Replace it by more meaningful codes. Fixes: 6d65bc64e232 ("net/mlx5e: Add mlx5e_flower_parse_meta support") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 6e7b2ce29d411..10f705761666b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -2068,7 +2068,7 @@ static int mlx5e_flower_parse_meta(struct net_device *filter_dev, flow_rule_match_meta(rule, &match); if (match.mask->ingress_ifindex != 0xFFFFFFFF) { NL_SET_ERR_MSG_MOD(extack, "Unsupported ingress ifindex mask"); - return -EINVAL; + return -EOPNOTSUPP; } ingress_dev = __dev_get_by_index(dev_net(filter_dev), @@ -2076,13 +2076,13 @@ static int mlx5e_flower_parse_meta(struct net_device *filter_dev, if (!ingress_dev) { NL_SET_ERR_MSG_MOD(extack, "Can't find the ingress port to match on"); - return -EINVAL; + return -ENOENT; } if (ingress_dev != filter_dev) { NL_SET_ERR_MSG_MOD(extack, "Can't match on the ingress filter port"); - return -EINVAL; + return -EOPNOTSUPP; } return 0; From 18644cec714aabbab173729192c7594ee57a406b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 28 May 2020 21:38:36 -0700 Subject: [PATCH 62/77] bpf: Fix use-after-free in fmod_ret check Fix the following issue: [ 436.749342] BUG: KASAN: use-after-free in bpf_trampoline_put+0x39/0x2a0 [ 436.749995] Write of size 4 at addr ffff8881ef38b8a0 by task kworker/3:5/2243 [ 436.750712] [ 436.752677] Workqueue: events bpf_prog_free_deferred [ 436.753183] Call Trace: [ 436.756483] bpf_trampoline_put+0x39/0x2a0 [ 436.756904] bpf_prog_free_deferred+0x16d/0x3d0 [ 436.757377] process_one_work+0x94a/0x15b0 [ 436.761969] [ 436.762130] Allocated by task 2529: [ 436.763323] bpf_trampoline_lookup+0x136/0x540 [ 436.763776] bpf_check+0x2872/0xa0a8 [ 436.764144] bpf_prog_load+0xb6f/0x1350 [ 436.764539] __do_sys_bpf+0x16d7/0x3720 [ 436.765825] [ 436.765988] Freed by task 2529: [ 436.767084] kfree+0xc6/0x280 [ 436.767397] bpf_trampoline_put+0x1fd/0x2a0 [ 436.767826] bpf_check+0x6832/0xa0a8 [ 436.768197] bpf_prog_load+0xb6f/0x1350 [ 436.768594] __do_sys_bpf+0x16d7/0x3720 prog->aux->trampoline = tr should be set only when prog is valid. Otherwise prog freeing will try to put trampoline via prog->aux->trampoline, but it may not point to a valid trampoline. Fixes: 6ba43b761c41 ("bpf: Attachment verification for BPF_MODIFY_RETURN") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20200529043839.15824-2-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8d7ee40e27484..bfea3f9a972fd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10428,22 +10428,13 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) } #define SECURITY_PREFIX "security_" -static int check_attach_modify_return(struct bpf_verifier_env *env) +static int check_attach_modify_return(struct bpf_prog *prog, unsigned long addr) { - struct bpf_prog *prog = env->prog; - unsigned long addr = (unsigned long) prog->aux->trampoline->func.addr; - - /* This is expected to be cleaned up in the future with the KRSI effort - * introducing the LSM_HOOK macro for cleaning up lsm_hooks.h. - */ if (within_error_injection_list(addr) || !strncmp(SECURITY_PREFIX, prog->aux->attach_func_name, sizeof(SECURITY_PREFIX) - 1)) return 0; - verbose(env, "fmod_ret attach_btf_id %u (%s) is not modifiable\n", - prog->aux->attach_btf_id, prog->aux->attach_func_name); - return -EINVAL; } @@ -10654,11 +10645,18 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) goto out; } } + + if (prog->expected_attach_type == BPF_MODIFY_RETURN) { + ret = check_attach_modify_return(prog, addr); + if (ret) + verbose(env, "%s() is not modifiable\n", + prog->aux->attach_func_name); + } + + if (ret) + goto out; tr->func.addr = (void *)addr; prog->aux->trampoline = tr; - - if (prog->expected_attach_type == BPF_MODIFY_RETURN) - ret = check_attach_modify_return(env); out: mutex_unlock(&tr->mutex); if (ret) From 3a71dc366d4aa51a22f385445f2862231d3fda3b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 29 May 2020 10:28:40 -0700 Subject: [PATCH 63/77] bpf: Fix a verifier issue when assigning 32bit reg states to 64bit ones With the latest trunk llvm (llvm 11), I hit a verifier issue for test_prog subtest test_verif_scale1. The following simplified example illustrate the issue: w9 = 0 /* R9_w=inv0 */ r8 = *(u32 *)(r1 + 80) /* __sk_buff->data_end */ r7 = *(u32 *)(r1 + 76) /* __sk_buff->data */ ...... w2 = w9 /* R2_w=inv0 */ r6 = r7 /* R6_w=pkt(id=0,off=0,r=0,imm=0) */ r6 += r2 /* R6_w=inv(id=0) */ r3 = r6 /* R3_w=inv(id=0) */ r3 += 14 /* R3_w=inv(id=0) */ if r3 > r8 goto end r5 = *(u32 *)(r6 + 0) /* R6_w=inv(id=0) */ <== error here: R6 invalid mem access 'inv' ... end: In real test_verif_scale1 code, "w9 = 0" and "w2 = w9" are in different basic blocks. In the above, after "r6 += r2", r6 becomes a scalar, which eventually caused the memory access error. The correct register state should be a pkt pointer. The inprecise register state starts at "w2 = w9". The 32bit register w9 is 0, in __reg_assign_32_into_64(), the 64bit reg->smax_value is assigned to be U32_MAX. The 64bit reg->smin_value is 0 and the 64bit register itself remains constant based on reg->var_off. In adjust_ptr_min_max_vals(), the verifier checks for a known constant, smin_val must be equal to smax_val. Since they are not equal, the verifier decides r6 is a unknown scalar, which caused later failure. The llvm10 does not have this issue as it generates different code: w9 = 0 /* R9_w=inv0 */ r8 = *(u32 *)(r1 + 80) /* __sk_buff->data_end */ r7 = *(u32 *)(r1 + 76) /* __sk_buff->data */ ...... r6 = r7 /* R6_w=pkt(id=0,off=0,r=0,imm=0) */ r6 += r9 /* R6_w=pkt(id=0,off=0,r=0,imm=0) */ r3 = r6 /* R3_w=pkt(id=0,off=0,r=0,imm=0) */ r3 += 14 /* R3_w=pkt(id=0,off=14,r=0,imm=0) */ if r3 > r8 goto end ... To fix the above issue, we can include zero in the test condition for assigning the s32_max_value and s32_min_value to their 64-bit equivalents smax_value and smin_value. Further, fix the condition to avoid doing zero extension bounds checks when s32_min_value <= 0. This could allow for the case where bounds 32-bit bounds (-1,1) get incorrectly translated to (0,1) 64-bit bounds. When in-fact the -1 min value needs to force U32_MAX bound. Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/159077331983.6014.5758956193749002737.stgit@john-Precision-5820-Tower --- kernel/bpf/verifier.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bfea3f9a972fd..efe14cf24bc65 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1168,14 +1168,14 @@ static void __reg_assign_32_into_64(struct bpf_reg_state *reg) * but must be positive otherwise set to worse case bounds * and refine later from tnum. */ - if (reg->s32_min_value > 0) - reg->smin_value = reg->s32_min_value; - else - reg->smin_value = 0; - if (reg->s32_max_value > 0) + if (reg->s32_min_value >= 0 && reg->s32_max_value >= 0) reg->smax_value = reg->s32_max_value; else reg->smax_value = U32_MAX; + if (reg->s32_min_value >= 0) + reg->smin_value = reg->s32_min_value; + else + reg->smin_value = 0; } static void __reg_combine_32_into_64(struct bpf_reg_state *reg) From e3effcdfe02eb20f23d4f1833c75658ddc49e65a Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 29 May 2020 10:28:59 -0700 Subject: [PATCH 64/77] bpf, selftests: Verifier bounds tests need to be updated After previous fix for zero extension test_verifier tests #65 and #66 now fail. Before the fix we can see the alu32 mov op at insn 10 10: R0_w=map_value(id=0,off=0,ks=8,vs=8,imm=0) R1_w=invP(id=0, smin_value=4294967168,smax_value=4294967423, umin_value=4294967168,umax_value=4294967423, var_off=(0x0; 0x1ffffffff), s32_min_value=-2147483648,s32_max_value=2147483647, u32_min_value=0,u32_max_value=-1) R10=fp0 fp-8_w=mmmmmmmm 10: (bc) w1 = w1 11: R0_w=map_value(id=0,off=0,ks=8,vs=8,imm=0) R1_w=invP(id=0, smin_value=0,smax_value=2147483647, umin_value=0,umax_value=4294967295, var_off=(0x0; 0xffffffff), s32_min_value=-2147483648,s32_max_value=2147483647, u32_min_value=0,u32_max_value=-1) R10=fp0 fp-8_w=mmmmmmmm After the fix at insn 10 because we have 's32_min_value < 0' the following step 11 now has 'smax_value=U32_MAX' where before we pulled the s32_max_value bound into the smax_value as seen above in 11 with smax_value=2147483647. 10: R0_w=map_value(id=0,off=0,ks=8,vs=8,imm=0) R1_w=inv(id=0, smin_value=4294967168,smax_value=4294967423, umin_value=4294967168,umax_value=4294967423, var_off=(0x0; 0x1ffffffff), s32_min_value=-2147483648, s32_max_value=2147483647, u32_min_value=0,u32_max_value=-1) R10=fp0 fp-8_w=mmmmmmmm 10: (bc) w1 = w1 11: R0_w=map_value(id=0,off=0,ks=8,vs=8,imm=0) R1_w=inv(id=0, smin_value=0,smax_value=4294967295, umin_value=0,umax_value=4294967295, var_off=(0x0; 0xffffffff), s32_min_value=-2147483648, s32_max_value=2147483647, u32_min_value=0, u32_max_value=-1) R10=fp0 fp-8_w=mmmmmmmm The fall out of this is by the time we get to the failing instruction at step 14 where previously we had the following: 14: R0_w=map_value(id=0,off=0,ks=8,vs=8,imm=0) R1_w=inv(id=0, smin_value=72057594021150720,smax_value=72057594029539328, umin_value=72057594021150720,umax_value=72057594029539328, var_off=(0xffffffff000000; 0xffffff), s32_min_value=-16777216,s32_max_value=-1, u32_min_value=-16777216,u32_max_value=-1) R10=fp0 fp-8_w=mmmmmmmm 14: (0f) r0 += r1 We now have, 14: R0_w=map_value(id=0,off=0,ks=8,vs=8,imm=0) R1_w=inv(id=0, smin_value=0,smax_value=72057594037927935, umin_value=0,umax_value=72057594037927935, var_off=(0x0; 0xffffffffffffff), s32_min_value=-2147483648,s32_max_value=2147483647, u32_min_value=0,u32_max_value=-1) R10=fp0 fp-8_w=mmmmmmmm 14: (0f) r0 += r1 In the original step 14 'smin_value=72057594021150720' this trips the logic in the verifier function check_reg_sane_offset(), if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { verbose(env, "value %lld makes %s pointer be out of bounds\n", smin, reg_type_str[type]); return false; } Specifically, the 'smin <= -BPF_MAX_VAR_OFF' check. But with the fix at step 14 we have bounds 'smin_value=0' so the above check is not tripped because BPF_MAX_VAR_OFF=1<<29. We have a smin_value=0 here because at step 10 the smaller smin_value=0 means the subtractions at steps 11 and 12 bring the smin_value negative. 11: (17) r1 -= 2147483584 12: (17) r1 -= 2147483584 13: (77) r1 >>= 8 Then the shift clears the top bit and smin_value is set to 0. Note we still have the smax_value in the fixed code so any reads will fail. An alternative would be to have reg_sane_check() do both smin and smax value tests. To fix the test we can omit the 'r1 >>=8' at line 13. This will change the err string, but keeps the intention of the test as suggseted by the title, "check after truncation of boundary-crossing range". If the verifier logic changes a different value is likely to be thrown in the error or the error will no longer be thrown forcing this test to be examined. With this change we see the new state at step 13. 13: R0_w=map_value(id=0,off=0,ks=8,vs=8,imm=0) R1_w=invP(id=0, smin_value=-4294967168,smax_value=127, umin_value=0,umax_value=18446744073709551615, s32_min_value=-2147483648,s32_max_value=2147483647, u32_min_value=0,u32_max_value=-1) R10=fp0 fp-8_w=mmmmmmmm Giving the expected out of bounds error, "value -4294967168 makes map_value pointer be out of bounds" However, for unpriv case we see a different error now because of the mixed signed bounds pointer arithmatic. This seems OK so I've only added the unpriv_errstr for this. Another optino may have been to do addition on r1 instead of subtraction but I favor the approach above slightly. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/159077333942.6014.14004320043595756079.stgit@john-Precision-5820-Tower --- tools/testing/selftests/bpf/verifier/bounds.c | 24 ++++++++----------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c index a253a064e6e05..fafa5409bdac2 100644 --- a/tools/testing/selftests/bpf/verifier/bounds.c +++ b/tools/testing/selftests/bpf/verifier/bounds.c @@ -238,7 +238,7 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), /* r1 = [0x00, 0xff] */ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xffffff80 >> 1), @@ -253,10 +253,6 @@ * [0xffff'ffff'0000'0080, 0xffff'ffff'ffff'ffff] */ BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0xffffff80 >> 1), - /* r1 = 0 or - * [0x00ff'ffff'ff00'0000, 0x00ff'ffff'ffff'ffff] - */ - BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8), /* error on OOB pointer computation */ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* exit */ @@ -265,8 +261,10 @@ }, .fixup_map_hash_8b = { 3 }, /* not actually fully unbounded, but the bound is very high */ - .errstr = "value 72057594021150720 makes map_value pointer be out of bounds", - .result = REJECT + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root", + .result_unpriv = REJECT, + .errstr = "value -4294967168 makes map_value pointer be out of bounds", + .result = REJECT, }, { "bounds check after truncation of boundary-crossing range (2)", @@ -276,7 +274,7 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), /* r1 = [0x00, 0xff] */ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xffffff80 >> 1), @@ -293,10 +291,6 @@ * [0xffff'ffff'0000'0080, 0xffff'ffff'ffff'ffff] */ BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0xffffff80 >> 1), - /* r1 = 0 or - * [0x00ff'ffff'ff00'0000, 0x00ff'ffff'ffff'ffff] - */ - BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8), /* error on OOB pointer computation */ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* exit */ @@ -305,8 +299,10 @@ }, .fixup_map_hash_8b = { 3 }, /* not actually fully unbounded, but the bound is very high */ - .errstr = "value 72057594021150720 makes map_value pointer be out of bounds", - .result = REJECT + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root", + .result_unpriv = REJECT, + .errstr = "value -4294967168 makes map_value pointer be out of bounds", + .result = REJECT, }, { "bounds check after wrapping 32-bit addition", From cf66c29bd7534813d2e1971fab71e25fe87c7e0a Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 29 May 2020 10:29:18 -0700 Subject: [PATCH 65/77] bpf, selftests: Add a verifier test for assigning 32bit reg states to 64bit ones Added a verifier test for assigning 32bit reg states to 64bit where 32bit reg holds a constant value of 0. Without previous kernel verifier.c fix, the test in this patch will fail. Signed-off-by: Yonghong Song Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/159077335867.6014.2075350327073125374.stgit@john-Precision-5820-Tower --- tools/testing/selftests/bpf/verifier/bounds.c | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c index fafa5409bdac2..58f4aa593b1b5 100644 --- a/tools/testing/selftests/bpf/verifier/bounds.c +++ b/tools/testing/selftests/bpf/verifier/bounds.c @@ -535,3 +535,25 @@ }, .result = ACCEPT }, +{ + "assigning 32bit bounds to 64bit for wA = 0, wB = wA", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_8, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_MOV32_IMM(BPF_REG_9, 0), + BPF_MOV32_REG(BPF_REG_2, BPF_REG_9), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_7), + BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_8, 1), + BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_6, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, +}, From 96d10d5b192e7f064457fd957e6a3ce8c2dce4b4 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 28 May 2020 15:15:13 +0800 Subject: [PATCH 66/77] neigh: fix ARP retransmit timer guard In commit 19e16d220f0a ("neigh: support smaller retrans_time settting") we add more accurate control for ARP and NS. But for ARP I forgot to update the latest guard in neigh_timer_handler(), then the next retransmit would be reset to jiffies + HZ/2 if we set the retrans_time less than 500ms. Fix it by setting the time_before() check to HZ/100. IPv6 does not have this issue. Reported-by: Jianwen Ji Fixes: 19e16d220f0a ("neigh: support smaller retrans_time settting") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/core/neighbour.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 116139233d573..dbe0c6ead773f 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1082,8 +1082,8 @@ static void neigh_timer_handler(struct timer_list *t) } if (neigh->nud_state & NUD_IN_TIMER) { - if (time_before(next, jiffies + HZ/2)) - next = jiffies + HZ/2; + if (time_before(next, jiffies + HZ/100)) + next = jiffies + HZ/100; if (!mod_timer(&neigh->timer, next)) neigh_hold(neigh); } From 3decabdc714ca56c944f4669b4cdec5c2c1cea23 Mon Sep 17 00:00:00 2001 From: Chuhong Yuan Date: Thu, 28 May 2020 18:20:37 +0800 Subject: [PATCH 67/77] NFC: st21nfca: add missed kfree_skb() in an error path st21nfca_tm_send_atr_res() misses to call kfree_skb() in an error path. Add the missed function call to fix it. Fixes: 1892bf844ea0 ("NFC: st21nfca: Adding P2P support to st21nfca in Initiator & Target mode") Signed-off-by: Chuhong Yuan Signed-off-by: David S. Miller --- drivers/nfc/st21nfca/dep.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nfc/st21nfca/dep.c b/drivers/nfc/st21nfca/dep.c index a1d69f9b2d4a4..0b9ca6d20ffad 100644 --- a/drivers/nfc/st21nfca/dep.c +++ b/drivers/nfc/st21nfca/dep.c @@ -173,8 +173,10 @@ static int st21nfca_tm_send_atr_res(struct nfc_hci_dev *hdev, memcpy(atr_res->gbi, atr_req->gbi, gb_len); r = nfc_set_remote_general_bytes(hdev->ndev, atr_res->gbi, gb_len); - if (r < 0) + if (r < 0) { + kfree_skb(skb); return r; + } } info->dep_info.curr_nfc_dep_pni = 0; From 784688993ebac34dffe44a9f2fabbe126ebfd4db Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Thu, 28 May 2020 11:19:17 -0500 Subject: [PATCH 68/77] drivers/net/ibmvnic: Update VNIC protocol version reporting VNIC protocol version is reported in big-endian format, but it is not byteswapped before logging. Fix that, and remove version comparison as only one protocol version exists at this time. Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 3de549c6c6930..197dc5b2c0905 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -4678,12 +4678,10 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, dev_err(dev, "Error %ld in VERSION_EXCHG_RSP\n", rc); break; } - dev_info(dev, "Partner protocol version is %d\n", - crq->version_exchange_rsp.version); - if (be16_to_cpu(crq->version_exchange_rsp.version) < - ibmvnic_version) - ibmvnic_version = + ibmvnic_version = be16_to_cpu(crq->version_exchange_rsp.version); + dev_info(dev, "Partner protocol version is %d\n", + ibmvnic_version); send_cap_queries(adapter); break; case QUERY_CAPABILITY_RSP: From 8692cefc433f282228fd44938dd4d26ed38254a2 Mon Sep 17 00:00:00 2001 From: Jia He Date: Sat, 30 May 2020 09:38:28 +0800 Subject: [PATCH 69/77] virtio_vsock: Fix race condition in virtio_transport_recv_pkt When client on the host tries to connect(SOCK_STREAM, O_NONBLOCK) to the server on the guest, there will be a panic on a ThunderX2 (armv8a server): [ 463.718844] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 [ 463.718848] Mem abort info: [ 463.718849] ESR = 0x96000044 [ 463.718852] EC = 0x25: DABT (current EL), IL = 32 bits [ 463.718853] SET = 0, FnV = 0 [ 463.718854] EA = 0, S1PTW = 0 [ 463.718855] Data abort info: [ 463.718856] ISV = 0, ISS = 0x00000044 [ 463.718857] CM = 0, WnR = 1 [ 463.718859] user pgtable: 4k pages, 48-bit VAs, pgdp=0000008f6f6e9000 [ 463.718861] [0000000000000000] pgd=0000000000000000 [ 463.718866] Internal error: Oops: 96000044 [#1] SMP [...] [ 463.718977] CPU: 213 PID: 5040 Comm: vhost-5032 Tainted: G O 5.7.0-rc7+ #139 [ 463.718980] Hardware name: GIGABYTE R281-T91-00/MT91-FS1-00, BIOS F06 09/25/2018 [ 463.718982] pstate: 60400009 (nZCv daif +PAN -UAO) [ 463.718995] pc : virtio_transport_recv_pkt+0x4c8/0xd40 [vmw_vsock_virtio_transport_common] [ 463.718999] lr : virtio_transport_recv_pkt+0x1fc/0xd40 [vmw_vsock_virtio_transport_common] [ 463.719000] sp : ffff80002dbe3c40 [...] [ 463.719025] Call trace: [ 463.719030] virtio_transport_recv_pkt+0x4c8/0xd40 [vmw_vsock_virtio_transport_common] [ 463.719034] vhost_vsock_handle_tx_kick+0x360/0x408 [vhost_vsock] [ 463.719041] vhost_worker+0x100/0x1a0 [vhost] [ 463.719048] kthread+0x128/0x130 [ 463.719052] ret_from_fork+0x10/0x18 The race condition is as follows: Task1 Task2 ===== ===== __sock_release virtio_transport_recv_pkt __vsock_release vsock_find_bound_socket (found sk) lock_sock_nested vsock_remove_sock sock_orphan sk_set_socket(sk, NULL) sk->sk_shutdown = SHUTDOWN_MASK ... release_sock lock_sock virtio_transport_recv_connecting sk->sk_socket->state (panic!) The root cause is that vsock_find_bound_socket can't hold the lock_sock, so there is a small race window between vsock_find_bound_socket() and lock_sock(). If __vsock_release() is running in another task, sk->sk_socket will be set to NULL inadvertently. This fixes it by checking sk->sk_shutdown(suggested by Stefano) after lock_sock since sk->sk_shutdown is set to SHUTDOWN_MASK under the protection of lock_sock_nested. Signed-off-by: Jia He Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/virtio_transport_common.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 69efc891885f6..0edda1edf9882 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1132,6 +1132,14 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, lock_sock(sk); + /* Check if sk has been released before lock_sock */ + if (sk->sk_shutdown == SHUTDOWN_MASK) { + (void)virtio_transport_reset_no_sock(t, pkt); + release_sock(sk); + sock_put(sk); + goto free_pkt; + } + /* Update CID in case it has changed after a transport reset event */ vsk->local_addr.svm_cid = dst.svm_cid; From 1b49cd71b52403822731dc9f283185d1da355f97 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Sat, 30 May 2020 11:34:33 +0800 Subject: [PATCH 70/77] devinet: fix memleak in inetdev_init() When devinet_sysctl_register() failed, the memory allocated in neigh_parms_alloc() should be freed. Fixes: 20e61da7ffcf ("ipv4: fail early when creating netdev named all or default") Signed-off-by: Yang Yingliang Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index c0dd561aa1903..5267b6b191eba 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -276,6 +276,7 @@ static struct in_device *inetdev_init(struct net_device *dev) err = devinet_sysctl_register(in_dev); if (err) { in_dev->dead = 1; + neigh_parms_release(&arp_tbl, in_dev->arp_parms); in_dev_put(in_dev); in_dev = NULL; goto out; From 05aa69e5cba61d8900970a364825f854401694ec Mon Sep 17 00:00:00 2001 From: wenxu Date: Sat, 30 May 2020 13:54:51 +0800 Subject: [PATCH 71/77] net/sched: act_ct: add nat mangle action only for NAT-conntrack Currently add nat mangle action with comparing invert and orig tuple. It is better to check IPS_NAT_MASK flags first to avoid non necessary memcmp for non-NAT conntrack. Signed-off-by: wenxu Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sched/act_ct.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 1a766393be625..20577355235a6 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -199,6 +199,9 @@ static int tcf_ct_flow_table_add_action_nat(struct net *net, const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; struct nf_conntrack_tuple target; + if (!(ct->status & IPS_NAT_MASK)) + return 0; + nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); switch (tuple->src.l3num) { From 41be81a8d3d09acb9033799938306349328861f9 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 29 May 2020 17:43:29 +0200 Subject: [PATCH 72/77] mptcp: fix unblocking connect() Currently unblocking connect() on MPTCP sockets fails frequently. If mptcp_stream_connect() is invoked to complete a previously attempted unblocking connection, it will still try to create the first subflow via __mptcp_socket_create(). If the 3whs is completed and the 'can_ack' flag is already set, the latter will fail with -EINVAL. This change addresses the issue checking for pending connect and delegating the completion to the first subflow. Additionally do msk addresses and sk_state changes only when needed. Fixes: 2303f994b3e1 ("mptcp: Associate MPTCP context with TCP socket") Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index c8675d2eb5b97..8959a74f707d4 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1712,6 +1712,14 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, int err; lock_sock(sock->sk); + if (sock->state != SS_UNCONNECTED && msk->subflow) { + /* pending connection or invalid state, let existing subflow + * cope with that + */ + ssock = msk->subflow; + goto do_connect; + } + ssock = __mptcp_socket_create(msk, TCP_SYN_SENT); if (IS_ERR(ssock)) { err = PTR_ERR(ssock); @@ -1726,9 +1734,17 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, mptcp_subflow_ctx(ssock->sk)->request_mptcp = 0; #endif +do_connect: err = ssock->ops->connect(ssock, uaddr, addr_len, flags); - inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); - mptcp_copy_inaddrs(sock->sk, ssock->sk); + sock->state = ssock->state; + + /* on successful connect, the msk state will be moved to established by + * subflow_finish_connect() + */ + if (!err || err == EINPROGRESS) + mptcp_copy_inaddrs(sock->sk, ssock->sk); + else + inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); unlock: release_sock(sock->sk); From 10f6d46c943d1ccd1d730d2e915226cbdeaad2d5 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 29 May 2020 17:43:30 +0200 Subject: [PATCH 73/77] mptcp: fix race between MP_JOIN and close If a MP_JOIN subflow completes the 3whs while another CPU is closing the master msk, we can hit the following race: CPU1 CPU2 close() mptcp_close subflow_syn_recv_sock mptcp_token_get_sock mptcp_finish_join inet_sk_state_load mptcp_token_destroy inet_sk_state_store(TCP_CLOSE) __mptcp_flush_join_list() mptcp_sock_graft list_add_tail sk_common_release sock_orphan() The MP_JOIN socket will be leaked. Additionally we can hit UaF for the msk 'struct socket' referenced via the 'conn' field. This change try to address the issue introducing some synchronization between the MP_JOIN 3whs and mptcp_close via the join_list spinlock. If we detect the msk is closing the MP_JOIN socket is closed, too. Fixes: f296234c98a8 ("mptcp: Add handling of incoming MP_JOIN requests") Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 8959a74f707d4..35bdfb4f3eaed 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1266,8 +1266,12 @@ static void mptcp_close(struct sock *sk, long timeout) mptcp_token_destroy(msk->token); inet_sk_state_store(sk, TCP_CLOSE); - __mptcp_flush_join_list(msk); - + /* be sure to always acquire the join list lock, to sync vs + * mptcp_finish_join(). + */ + spin_lock_bh(&msk->join_list_lock); + list_splice_tail_init(&msk->join_list, &msk->conn_list); + spin_unlock_bh(&msk->join_list_lock); list_splice_init(&msk->conn_list, &conn_list); data_fin_tx_seq = msk->write_seq; @@ -1623,22 +1627,30 @@ bool mptcp_finish_join(struct sock *sk) if (!msk->pm.server_side) return true; - /* passive connection, attach to msk socket */ + if (!mptcp_pm_allow_new_subflow(msk)) + return false; + + /* active connections are already on conn_list, and we can't acquire + * msk lock here. + * use the join list lock as synchronization point and double-check + * msk status to avoid racing with mptcp_close() + */ + spin_lock_bh(&msk->join_list_lock); + ret = inet_sk_state_load(parent) == TCP_ESTABLISHED; + if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) + list_add_tail(&subflow->node, &msk->join_list); + spin_unlock_bh(&msk->join_list_lock); + if (!ret) + return false; + + /* attach to msk socket only after we are sure he will deal with us + * at close time + */ parent_sock = READ_ONCE(parent->sk_socket); if (parent_sock && !sk->sk_socket) mptcp_sock_graft(sk, parent_sock); - - ret = mptcp_pm_allow_new_subflow(msk); - if (ret) { - subflow->map_seq = msk->ack_seq; - - /* active connections are already on conn_list */ - spin_lock_bh(&msk->join_list_lock); - if (!WARN_ON_ONCE(!list_empty(&subflow->node))) - list_add_tail(&subflow->node, &msk->join_list); - spin_unlock_bh(&msk->join_list_lock); - } - return ret; + subflow->map_seq = msk->ack_seq; + return true; } bool mptcp_sk_is_subflow(const struct sock *sk) From c5c79763fac115fe827ed6a18ed94fff2432f678 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 29 May 2020 17:43:31 +0200 Subject: [PATCH 74/77] mptcp: remove msk from the token container at destruction time. Currently we remote the msk from the token container only via mptcp_close(). The MPTCP master socket can be destroyed also via other paths (e.g. if not yet accepted, when shutting down the listener socket). When we hit the latter scenario, dangling msk references are left into the token container, leading to memory corruption and/or UaF. This change addresses the issue by moving the token removal into the msk destructor. Fixes: 79c0949e9a09 ("mptcp: Add key generation and token tree") Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 35bdfb4f3eaed..34dd0e278a829 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1263,7 +1263,6 @@ static void mptcp_close(struct sock *sk, long timeout) lock_sock(sk); - mptcp_token_destroy(msk->token); inet_sk_state_store(sk, TCP_CLOSE); /* be sure to always acquire the join list lock, to sync vs @@ -1461,6 +1460,7 @@ static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); + mptcp_token_destroy(msk->token); if (msk->cached_ext) __skb_ext_put(msk->cached_ext); From c6e08d6251f342090a9b9abccbf26b19bcb54d17 Mon Sep 17 00:00:00 2001 From: Chris Lew Date: Thu, 28 May 2020 16:05:26 -0700 Subject: [PATCH 75/77] net: qrtr: Allocate workqueue before kernel_bind A null pointer dereference in qrtr_ns_data_ready() is seen if a client opens a qrtr socket before qrtr_ns_init() can bind to the control port. When the control port is bound, the ENETRESET error will be broadcasted and clients will close their sockets. This results in DEL_CLIENT packets being sent to the ns and qrtr_ns_data_ready() being called without the workqueue being allocated. Allocate the workqueue before setting sk_data_ready and binding to the control port. This ensures that the work and workqueue structs are allocated and initialized before qrtr_ns_data_ready can be called. Fixes: 0c2204a4ad71 ("net: qrtr: Migrate nameservice to kernel from userspace") Signed-off-by: Chris Lew Reviewed-by: Bjorn Andersson Reviewed-by: Manivannan Sadhasivam Signed-off-by: David S. Miller --- net/qrtr/ns.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index e7d0fe3f43304..c5b3202a14cae 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -712,6 +712,10 @@ void qrtr_ns_init(void) goto err_sock; } + qrtr_ns.workqueue = alloc_workqueue("qrtr_ns_handler", WQ_UNBOUND, 1); + if (!qrtr_ns.workqueue) + goto err_sock; + qrtr_ns.sock->sk->sk_data_ready = qrtr_ns_data_ready; sq.sq_port = QRTR_PORT_CTRL; @@ -720,17 +724,13 @@ void qrtr_ns_init(void) ret = kernel_bind(qrtr_ns.sock, (struct sockaddr *)&sq, sizeof(sq)); if (ret < 0) { pr_err("failed to bind to socket\n"); - goto err_sock; + goto err_wq; } qrtr_ns.bcast_sq.sq_family = AF_QIPCRTR; qrtr_ns.bcast_sq.sq_node = QRTR_NODE_BCAST; qrtr_ns.bcast_sq.sq_port = QRTR_PORT_CTRL; - qrtr_ns.workqueue = alloc_workqueue("qrtr_ns_handler", WQ_UNBOUND, 1); - if (!qrtr_ns.workqueue) - goto err_sock; - ret = say_hello(&qrtr_ns.bcast_sq); if (ret < 0) goto err_wq; From 02c71b144c811bcdd865e0a1226d0407d11357e8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 May 2020 11:20:53 -0700 Subject: [PATCH 76/77] l2tp: do not use inet_hash()/inet_unhash() syzbot recently found a way to crash the kernel [1] Issue here is that inet_hash() & inet_unhash() are currently only meant to be used by TCP & DCCP, since only these protocols provide the needed hashinfo pointer. L2TP uses a single list (instead of a hash table) This old bug became an issue after commit 610236587600 ("bpf: Add new cgroup attach type to enable sock modifications") since after this commit, sk_common_release() can be called while the L2TP socket is still considered 'hashed'. general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] CPU: 0 PID: 7063 Comm: syz-executor654 Not tainted 5.7.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:inet_unhash+0x11f/0x770 net/ipv4/inet_hashtables.c:600 Code: 03 0f b6 04 02 84 c0 74 08 3c 03 0f 8e dd 04 00 00 48 8d 7d 08 44 8b 73 08 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 55 05 00 00 48 8d 7d 14 4c 8b 6d 08 48 b8 00 00 RSP: 0018:ffffc90001777d30 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: ffff88809a6df940 RCX: ffffffff8697c242 RDX: 0000000000000001 RSI: ffffffff8697c251 RDI: 0000000000000008 RBP: 0000000000000000 R08: ffff88809f3ae1c0 R09: fffffbfff1514cc1 R10: ffffffff8a8a6607 R11: fffffbfff1514cc0 R12: ffff88809a6df9b0 R13: 0000000000000007 R14: 0000000000000000 R15: ffffffff873a4d00 FS: 0000000001d2b880(0000) GS:ffff8880ae600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000006cd090 CR3: 000000009403a000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: sk_common_release+0xba/0x370 net/core/sock.c:3210 inet_create net/ipv4/af_inet.c:390 [inline] inet_create+0x966/0xe00 net/ipv4/af_inet.c:248 __sock_create+0x3cb/0x730 net/socket.c:1428 sock_create net/socket.c:1479 [inline] __sys_socket+0xef/0x200 net/socket.c:1521 __do_sys_socket net/socket.c:1530 [inline] __se_sys_socket net/socket.c:1528 [inline] __x64_sys_socket+0x6f/0xb0 net/socket.c:1528 do_syscall_64+0xf6/0x7d0 arch/x86/entry/common.c:295 entry_SYSCALL_64_after_hwframe+0x49/0xb3 RIP: 0033:0x441e29 Code: e8 fc b3 02 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 eb 08 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007ffdce184148 EFLAGS: 00000246 ORIG_RAX: 0000000000000029 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000441e29 RDX: 0000000000000073 RSI: 0000000000000002 RDI: 0000000000000002 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000402c30 R14: 0000000000000000 R15: 0000000000000000 Modules linked in: ---[ end trace 23b6578228ce553e ]--- RIP: 0010:inet_unhash+0x11f/0x770 net/ipv4/inet_hashtables.c:600 Code: 03 0f b6 04 02 84 c0 74 08 3c 03 0f 8e dd 04 00 00 48 8d 7d 08 44 8b 73 08 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 55 05 00 00 48 8d 7d 14 4c 8b 6d 08 48 b8 00 00 RSP: 0018:ffffc90001777d30 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: ffff88809a6df940 RCX: ffffffff8697c242 RDX: 0000000000000001 RSI: ffffffff8697c251 RDI: 0000000000000008 RBP: 0000000000000000 R08: ffff88809f3ae1c0 R09: fffffbfff1514cc1 R10: ffffffff8a8a6607 R11: fffffbfff1514cc0 R12: ffff88809a6df9b0 R13: 0000000000000007 R14: 0000000000000000 R15: ffffffff873a4d00 FS: 0000000001d2b880(0000) GS:ffff8880ae600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000006cd090 CR3: 000000009403a000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Fixes: 0d76751fad77 ("l2tp: Add L2TPv3 IP encapsulation (no UDP) support") Signed-off-by: Eric Dumazet Cc: James Chapman Cc: Andrii Nakryiko Reported-by: syzbot+3610d489778b57cc8031@syzkaller.appspotmail.com --- net/l2tp/l2tp_ip.c | 29 ++++++++++++++++++++++------- net/l2tp/l2tp_ip6.c | 30 ++++++++++++++++++++++-------- 2 files changed, 44 insertions(+), 15 deletions(-) diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 0d7c887a2b75d..955662a6dee75 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -209,15 +208,31 @@ static int l2tp_ip_recv(struct sk_buff *skb) return 0; } -static int l2tp_ip_open(struct sock *sk) +static int l2tp_ip_hash(struct sock *sk) { - /* Prevent autobind. We don't have ports. */ - inet_sk(sk)->inet_num = IPPROTO_L2TP; + if (sk_unhashed(sk)) { + write_lock_bh(&l2tp_ip_lock); + sk_add_node(sk, &l2tp_ip_table); + write_unlock_bh(&l2tp_ip_lock); + } + return 0; +} +static void l2tp_ip_unhash(struct sock *sk) +{ + if (sk_unhashed(sk)) + return; write_lock_bh(&l2tp_ip_lock); - sk_add_node(sk, &l2tp_ip_table); + sk_del_node_init(sk); write_unlock_bh(&l2tp_ip_lock); +} + +static int l2tp_ip_open(struct sock *sk) +{ + /* Prevent autobind. We don't have ports. */ + inet_sk(sk)->inet_num = IPPROTO_L2TP; + l2tp_ip_hash(sk); return 0; } @@ -594,8 +609,8 @@ static struct proto l2tp_ip_prot = { .sendmsg = l2tp_ip_sendmsg, .recvmsg = l2tp_ip_recvmsg, .backlog_rcv = l2tp_ip_backlog_recv, - .hash = inet_hash, - .unhash = inet_unhash, + .hash = l2tp_ip_hash, + .unhash = l2tp_ip_unhash, .obj_size = sizeof(struct l2tp_ip_sock), #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ip_setsockopt, diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index d148766f40d11..0fa694bd3f6a9 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -20,8 +20,6 @@ #include #include #include -#include -#include #include #include #include @@ -222,15 +220,31 @@ static int l2tp_ip6_recv(struct sk_buff *skb) return 0; } -static int l2tp_ip6_open(struct sock *sk) +static int l2tp_ip6_hash(struct sock *sk) { - /* Prevent autobind. We don't have ports. */ - inet_sk(sk)->inet_num = IPPROTO_L2TP; + if (sk_unhashed(sk)) { + write_lock_bh(&l2tp_ip6_lock); + sk_add_node(sk, &l2tp_ip6_table); + write_unlock_bh(&l2tp_ip6_lock); + } + return 0; +} +static void l2tp_ip6_unhash(struct sock *sk) +{ + if (sk_unhashed(sk)) + return; write_lock_bh(&l2tp_ip6_lock); - sk_add_node(sk, &l2tp_ip6_table); + sk_del_node_init(sk); write_unlock_bh(&l2tp_ip6_lock); +} + +static int l2tp_ip6_open(struct sock *sk) +{ + /* Prevent autobind. We don't have ports. */ + inet_sk(sk)->inet_num = IPPROTO_L2TP; + l2tp_ip6_hash(sk); return 0; } @@ -728,8 +742,8 @@ static struct proto l2tp_ip6_prot = { .sendmsg = l2tp_ip6_sendmsg, .recvmsg = l2tp_ip6_recvmsg, .backlog_rcv = l2tp_ip6_backlog_recv, - .hash = inet6_hash, - .unhash = inet_unhash, + .hash = l2tp_ip6_hash, + .unhash = l2tp_ip6_unhash, .obj_size = sizeof(struct l2tp_ip6_sock), #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ipv6_setsockopt, From d9a81a225277686eb629938986d97629ea102633 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 May 2020 11:32:25 -0700 Subject: [PATCH 77/77] l2tp: add sk_family checks to l2tp_validate_socket syzbot was able to trigger a crash after using an ISDN socket and fool l2tp. Fix this by making sure the UDP socket is of the proper family. BUG: KASAN: slab-out-of-bounds in setup_udp_tunnel_sock+0x465/0x540 net/ipv4/udp_tunnel.c:78 Write of size 1 at addr ffff88808ed0c590 by task syz-executor.5/3018 CPU: 0 PID: 3018 Comm: syz-executor.5 Not tainted 5.7.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x188/0x20d lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xd3/0x413 mm/kasan/report.c:382 __kasan_report.cold+0x20/0x38 mm/kasan/report.c:511 kasan_report+0x33/0x50 mm/kasan/common.c:625 setup_udp_tunnel_sock+0x465/0x540 net/ipv4/udp_tunnel.c:78 l2tp_tunnel_register+0xb15/0xdd0 net/l2tp/l2tp_core.c:1523 l2tp_nl_cmd_tunnel_create+0x4b2/0xa60 net/l2tp/l2tp_netlink.c:249 genl_family_rcv_msg_doit net/netlink/genetlink.c:673 [inline] genl_family_rcv_msg net/netlink/genetlink.c:718 [inline] genl_rcv_msg+0x627/0xdf0 net/netlink/genetlink.c:735 netlink_rcv_skb+0x15a/0x410 net/netlink/af_netlink.c:2469 genl_rcv+0x24/0x40 net/netlink/genetlink.c:746 netlink_unicast_kernel net/netlink/af_netlink.c:1303 [inline] netlink_unicast+0x537/0x740 net/netlink/af_netlink.c:1329 netlink_sendmsg+0x882/0xe10 net/netlink/af_netlink.c:1918 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:672 ____sys_sendmsg+0x6e6/0x810 net/socket.c:2352 ___sys_sendmsg+0x100/0x170 net/socket.c:2406 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2439 do_syscall_64+0xf6/0x7d0 arch/x86/entry/common.c:295 entry_SYSCALL_64_after_hwframe+0x49/0xb3 RIP: 0033:0x45ca29 Code: 0d b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 db b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007effe76edc78 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00000000004fe1c0 RCX: 000000000045ca29 RDX: 0000000000000000 RSI: 0000000020000240 RDI: 0000000000000005 RBP: 000000000078bf00 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 000000000000094e R14: 00000000004d5d00 R15: 00007effe76ee6d4 Allocated by task 3018: save_stack+0x1b/0x40 mm/kasan/common.c:49 set_track mm/kasan/common.c:57 [inline] __kasan_kmalloc mm/kasan/common.c:495 [inline] __kasan_kmalloc.constprop.0+0xbf/0xd0 mm/kasan/common.c:468 __do_kmalloc mm/slab.c:3656 [inline] __kmalloc+0x161/0x7a0 mm/slab.c:3665 kmalloc include/linux/slab.h:560 [inline] sk_prot_alloc+0x223/0x2f0 net/core/sock.c:1612 sk_alloc+0x36/0x1100 net/core/sock.c:1666 data_sock_create drivers/isdn/mISDN/socket.c:600 [inline] mISDN_sock_create+0x272/0x400 drivers/isdn/mISDN/socket.c:796 __sock_create+0x3cb/0x730 net/socket.c:1428 sock_create net/socket.c:1479 [inline] __sys_socket+0xef/0x200 net/socket.c:1521 __do_sys_socket net/socket.c:1530 [inline] __se_sys_socket net/socket.c:1528 [inline] __x64_sys_socket+0x6f/0xb0 net/socket.c:1528 do_syscall_64+0xf6/0x7d0 arch/x86/entry/common.c:295 entry_SYSCALL_64_after_hwframe+0x49/0xb3 Freed by task 2484: save_stack+0x1b/0x40 mm/kasan/common.c:49 set_track mm/kasan/common.c:57 [inline] kasan_set_free_info mm/kasan/common.c:317 [inline] __kasan_slab_free+0xf7/0x140 mm/kasan/common.c:456 __cache_free mm/slab.c:3426 [inline] kfree+0x109/0x2b0 mm/slab.c:3757 kvfree+0x42/0x50 mm/util.c:603 __free_fdtable+0x2d/0x70 fs/file.c:31 put_files_struct fs/file.c:420 [inline] put_files_struct+0x248/0x2e0 fs/file.c:413 exit_files+0x7e/0xa0 fs/file.c:445 do_exit+0xb04/0x2dd0 kernel/exit.c:791 do_group_exit+0x125/0x340 kernel/exit.c:894 get_signal+0x47b/0x24e0 kernel/signal.c:2739 do_signal+0x81/0x2240 arch/x86/kernel/signal.c:784 exit_to_usermode_loop+0x26c/0x360 arch/x86/entry/common.c:161 prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline] syscall_return_slowpath arch/x86/entry/common.c:279 [inline] do_syscall_64+0x6b1/0x7d0 arch/x86/entry/common.c:305 entry_SYSCALL_64_after_hwframe+0x49/0xb3 The buggy address belongs to the object at ffff88808ed0c000 which belongs to the cache kmalloc-2k of size 2048 The buggy address is located 1424 bytes inside of 2048-byte region [ffff88808ed0c000, ffff88808ed0c800) The buggy address belongs to the page: page:ffffea00023b4300 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 flags: 0xfffe0000000200(slab) raw: 00fffe0000000200 ffffea0002838208 ffffea00015ba288 ffff8880aa000e00 raw: 0000000000000000 ffff88808ed0c000 0000000100000001 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88808ed0c480: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff88808ed0c500: 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff88808ed0c580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ^ ffff88808ed0c600: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88808ed0c680: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc Fixes: 6b9f34239b00 ("l2tp: fix races in tunnel creation") Fixes: fd558d186df2 ("l2tp: Split pppol2tp patch into separate l2tp and ppp parts") Signed-off-by: Eric Dumazet Cc: James Chapman Cc: Guillaume Nault Reported-by: syzbot Acked-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index fcb53ed1c4fb9..6d7ef78c88af0 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1458,6 +1458,9 @@ static int l2tp_validate_socket(const struct sock *sk, const struct net *net, if (sk->sk_type != SOCK_DGRAM) return -EPROTONOSUPPORT; + if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6) + return -EPROTONOSUPPORT; + if ((encap == L2TP_ENCAPTYPE_UDP && sk->sk_protocol != IPPROTO_UDP) || (encap == L2TP_ENCAPTYPE_IP && sk->sk_protocol != IPPROTO_L2TP)) return -EPROTONOSUPPORT;