From b64acb28da8394485f0762e657470c9fc33aca4d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 25 Jan 2021 12:36:42 +0100 Subject: [PATCH 01/45] ath9k: fix build error with LEDS_CLASS=m When CONFIG_ATH9K is built-in but LED support is in a loadable module, both ath9k drivers fails to link: x86_64-linux-ld: drivers/net/wireless/ath/ath9k/gpio.o: in function `ath_deinit_leds': gpio.c:(.text+0x36): undefined reference to `led_classdev_unregister' x86_64-linux-ld: drivers/net/wireless/ath/ath9k/gpio.o: in function `ath_init_leds': gpio.c:(.text+0x179): undefined reference to `led_classdev_register_ext' The problem is that the 'imply' keyword does not enforce any dependency but is only a weak hint to Kconfig to enable another symbol from a defconfig file. Change imply to a 'depends on LEDS_CLASS' that prevents the incorrect configuration but still allows building the driver without LED support. The 'select MAC80211_LEDS' is now ensures that the LED support is actually used if it is present, and the added Kconfig dependency on MAC80211_LEDS ensures that it cannot be enabled manually when it has no effect. Fixes: 197f466e93f5 ("ath9k_htc: Do not select MAC80211_LEDS by default") Signed-off-by: Arnd Bergmann Acked-by: Johannes Berg Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210125113654.2408057-1-arnd@kernel.org --- drivers/net/wireless/ath/ath9k/Kconfig | 8 ++------ net/mac80211/Kconfig | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/ath/ath9k/Kconfig b/drivers/net/wireless/ath/ath9k/Kconfig index a84bb9b6573f8..e150d82eddb6c 100644 --- a/drivers/net/wireless/ath/ath9k/Kconfig +++ b/drivers/net/wireless/ath/ath9k/Kconfig @@ -21,11 +21,9 @@ config ATH9K_BTCOEX_SUPPORT config ATH9K tristate "Atheros 802.11n wireless cards support" depends on MAC80211 && HAS_DMA + select MAC80211_LEDS if LEDS_CLASS=y || LEDS_CLASS=MAC80211 select ATH9K_HW select ATH9K_COMMON - imply NEW_LEDS - imply LEDS_CLASS - imply MAC80211_LEDS help This module adds support for wireless adapters based on Atheros IEEE 802.11n AR5008, AR9001 and AR9002 family @@ -176,11 +174,9 @@ config ATH9K_PCI_NO_EEPROM config ATH9K_HTC tristate "Atheros HTC based wireless cards support" depends on USB && MAC80211 + select MAC80211_LEDS if LEDS_CLASS=y || LEDS_CLASS=MAC80211 select ATH9K_HW select ATH9K_COMMON - imply NEW_LEDS - imply LEDS_CLASS - imply MAC80211_LEDS help Support for Atheros HTC based cards. Chipsets supported: AR9271 diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig index cd9a9bd242bab..51ec8256b7fa9 100644 --- a/net/mac80211/Kconfig +++ b/net/mac80211/Kconfig @@ -69,7 +69,7 @@ config MAC80211_MESH config MAC80211_LEDS bool "Enable LED triggers" depends on MAC80211 - depends on LEDS_CLASS + depends on LEDS_CLASS=y || LEDS_CLASS=MAC80211 select LEDS_TRIGGERS help This option enables a few LED triggers for different From 93a1d4791c10d443bc67044def7efee2991d48b7 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 26 Jan 2021 12:02:13 +0100 Subject: [PATCH 02/45] mt76: dma: fix a possible memory leak in mt76_add_fragment() Fix a memory leak in mt76_add_fragment routine returning the buffer to the page_frag_cache when we receive a new fragment and the skb_shared_info frag array is full. Fixes: b102f0c522cf6 ("mt76: fix array overflow on receiving too many fragments for a packet") Signed-off-by: Lorenzo Bianconi Acked-by: Felix Fietkau Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/4f9dd73407da88b2a552517ce8db242d86bf4d5c.1611616130.git.lorenzo@kernel.org --- drivers/net/wireless/mediatek/mt76/dma.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c index 73eeb00d5aa64..e81dfaf99bcbf 100644 --- a/drivers/net/wireless/mediatek/mt76/dma.c +++ b/drivers/net/wireless/mediatek/mt76/dma.c @@ -509,15 +509,17 @@ static void mt76_add_fragment(struct mt76_dev *dev, struct mt76_queue *q, void *data, int len, bool more) { - struct page *page = virt_to_head_page(data); - int offset = data - page_address(page); struct sk_buff *skb = q->rx_head; struct skb_shared_info *shinfo = skb_shinfo(skb); if (shinfo->nr_frags < ARRAY_SIZE(shinfo->frags)) { - offset += q->buf_offset; + struct page *page = virt_to_head_page(data); + int offset = data - page_address(page) + q->buf_offset; + skb_add_rx_frag(skb, shinfo->nr_frags, page, offset, len, q->buf_size); + } else { + skb_free_frag(data); } if (more) From 548f1191d86ccb9bde2a5305988877b7584c01eb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 2 Feb 2021 23:06:36 -0800 Subject: [PATCH 03/45] bpf: Unbreak BPF_PROG_TYPE_KPROBE when kprobe is called via do_int3 The commit 0d00449c7a28 ("x86: Replace ist_enter() with nmi_enter()") converted do_int3 handler to be "NMI-like". That made old if (in_nmi()) check abort execution of bpf programs attached to kprobe when kprobe is firing via int3 (For example when kprobe is placed in the middle of the function). Remove the check to restore user visible behavior. Fixes: 0d00449c7a28 ("x86: Replace ist_enter() with nmi_enter()") Reported-by: Nikolay Borisov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Tested-by: Nikolay Borisov Reviewed-by: Masami Hiramatsu Link: https://lore.kernel.org/bpf/20210203070636.70926-1-alexei.starovoitov@gmail.com --- kernel/trace/bpf_trace.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6c0018abe68a0..764400260eb60 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -96,9 +96,6 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { unsigned int ret; - if (in_nmi()) /* not supported yet */ - return 1; - cant_sleep(); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { From 6183f4d3a0a2ad230511987c6c362ca43ec0055f Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Wed, 27 Jan 2021 06:36:53 +0000 Subject: [PATCH 04/45] bpf: Check for integer overflow when using roundup_pow_of_two() On 32-bit architecture, roundup_pow_of_two() can return 0 when the argument has upper most bit set due to resulting 1UL << 32. Add a check for this case. Fixes: d5a3b1f69186 ("bpf: introduce BPF_MAP_TYPE_STACK_TRACE") Signed-off-by: Bui Quang Minh Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210127063653.3576-1-minhquangbui99@gmail.com --- kernel/bpf/stackmap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index aea96b6384734..bfafbf115bf30 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -115,6 +115,8 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) /* hash table size must be power of 2 */ n_buckets = roundup_pow_of_two(attr->max_entries); + if (!n_buckets) + return ERR_PTR(-E2BIG); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); From a4dc7eee9106a9d2a6e08b442db19677aa9699c7 Mon Sep 17 00:00:00 2001 From: Christoph Schemmel Date: Tue, 2 Feb 2021 09:45:23 +0100 Subject: [PATCH 05/45] NET: usb: qmi_wwan: Adding support for Cinterion MV31 Adding support for Cinterion MV31 with PID 0x00B7. T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 11 Spd=5000 MxCh= 0 D: Ver= 3.20 Cls=ef(misc ) Sub=02 Prot=01 MxPS= 9 #Cfgs= 1 P: Vendor=1e2d ProdID=00b7 Rev=04.14 S: Manufacturer=Cinterion S: Product=Cinterion USB Mobile Broadband S: SerialNumber=b3246eed C: #Ifs= 4 Cfg#= 1 Atr=a0 MxPwr=896mA I: If#=0x0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan I: If#=0x1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#=0x2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#=0x3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option Signed-off-by: Christoph Schemmel Link: https://lore.kernel.org/r/20210202084523.4371-1-christoph.schemmel@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index cc4819282820b..5a05add9b4e69 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1309,6 +1309,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x1e2d, 0x0082, 5)}, /* Cinterion PHxx,PXxx (2 RmNet) */ {QMI_FIXED_INTF(0x1e2d, 0x0083, 4)}, /* Cinterion PHxx,PXxx (1 RmNet + USB Audio)*/ {QMI_QUIRK_SET_DTR(0x1e2d, 0x00b0, 4)}, /* Cinterion CLS8 */ + {QMI_FIXED_INTF(0x1e2d, 0x00b7, 0)}, /* Cinterion MV31 RmNet */ {QMI_FIXED_INTF(0x413c, 0x81a2, 8)}, /* Dell Wireless 5806 Gobi(TM) 4G LTE Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81a3, 8)}, /* Dell Wireless 5570 HSPA+ (42Mbps) Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81a4, 8)}, /* Dell Wireless 5570e HSPA+ (42Mbps) Mobile Broadband Card */ From b1bdde33b72366da20d10770ab7a49fe87b5e190 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Fri, 29 Jan 2021 20:57:43 +0100 Subject: [PATCH 06/45] netfilter: xt_recent: Fix attempt to update deleted entry When both --reap and --update flag are specified, there's a code path at which the entry to be updated is reaped beforehand, which then leads to kernel crash. Reap only entries which won't be updated. Fixes kernel bugzilla #207773. Link: https://bugzilla.kernel.org/show_bug.cgi?id=207773 Reported-by: Reindl Harald Fixes: 0079c5aee348 ("netfilter: xt_recent: add an entry reaper") Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_recent.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 606411869698e..0446307516cdf 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -152,7 +152,8 @@ static void recent_entry_remove(struct recent_table *t, struct recent_entry *e) /* * Drop entries with timestamps older then 'time'. */ -static void recent_entry_reap(struct recent_table *t, unsigned long time) +static void recent_entry_reap(struct recent_table *t, unsigned long time, + struct recent_entry *working, bool update) { struct recent_entry *e; @@ -161,6 +162,12 @@ static void recent_entry_reap(struct recent_table *t, unsigned long time) */ e = list_entry(t->lru_list.next, struct recent_entry, lru_list); + /* + * Do not reap the entry which are going to be updated. + */ + if (e == working && update) + return; + /* * The last time stamp is the most recent. */ @@ -303,7 +310,8 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par) /* info->seconds must be non-zero */ if (info->check_set & XT_RECENT_REAP) - recent_entry_reap(t, time); + recent_entry_reap(t, time, e, + info->check_set & XT_RECENT_UPDATE && ret); } if (info->check_set & XT_RECENT_SET || From a3005b0f83f217c888393c6bf9cd36e3d1616bca Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Sat, 30 Jan 2021 11:14:25 +0100 Subject: [PATCH 07/45] selftests: netfilter: fix current year use date %Y instead of %G to read current year Problem appeared when running lkp-tests on 01/01/2021 Fixes: 48d072c4e8cd ("selftests: netfilter: add time counter check") Reported-by: kernel test robot Signed-off-by: Fabian Frederick Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/nft_meta.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/netfilter/nft_meta.sh b/tools/testing/selftests/netfilter/nft_meta.sh index 087f0e6e71ce7..f33154c04d344 100755 --- a/tools/testing/selftests/netfilter/nft_meta.sh +++ b/tools/testing/selftests/netfilter/nft_meta.sh @@ -23,7 +23,7 @@ ip -net "$ns0" addr add 127.0.0.1 dev lo trap cleanup EXIT -currentyear=$(date +%G) +currentyear=$(date +%Y) lastyear=$((currentyear-1)) ip netns exec "$ns0" nft -f /dev/stdin < Date: Tue, 2 Feb 2021 16:07:37 +0100 Subject: [PATCH 08/45] netfilter: nftables: fix possible UAF over chains from packet path in netns Although hooks are released via call_rcu(), chain and rule objects are immediately released while packets are still walking over these bits. This patch adds the .pre_exit callback which is invoked before synchronize_rcu() in the netns framework to stay safe. Remove a comment which is not valid anymore since the core does not use synchronize_net() anymore since 8c873e219970 ("netfilter: core: free hooks with call_rcu"). Suggested-by: Florian Westphal Fixes: df05ef874b28 ("netfilter: nf_tables: release objects on netns destruction") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8d3aa97b52e71..43fe80f10313c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8949,6 +8949,17 @@ int __nft_release_basechain(struct nft_ctx *ctx) } EXPORT_SYMBOL_GPL(__nft_release_basechain); +static void __nft_release_hooks(struct net *net) +{ + struct nft_table *table; + struct nft_chain *chain; + + list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry(chain, &table->chains, list) + nf_tables_unregister_hook(net, table, chain); + } +} + static void __nft_release_tables(struct net *net) { struct nft_flowtable *flowtable, *nf; @@ -8964,10 +8975,6 @@ static void __nft_release_tables(struct net *net) list_for_each_entry_safe(table, nt, &net->nft.tables, list) { ctx.family = table->family; - - list_for_each_entry(chain, &table->chains, list) - nf_tables_unregister_hook(net, table, chain); - /* No packets are walking on these chains anymore. */ ctx.table = table; list_for_each_entry(chain, &table->chains, list) { ctx.chain = chain; @@ -9016,6 +9023,11 @@ static int __net_init nf_tables_init_net(struct net *net) return 0; } +static void __net_exit nf_tables_pre_exit_net(struct net *net) +{ + __nft_release_hooks(net); +} + static void __net_exit nf_tables_exit_net(struct net *net) { mutex_lock(&net->nft.commit_mutex); @@ -9029,8 +9041,9 @@ static void __net_exit nf_tables_exit_net(struct net *net) } static struct pernet_operations nf_tables_net_ops = { - .init = nf_tables_init_net, - .exit = nf_tables_exit_net, + .init = nf_tables_init_net, + .pre_exit = nf_tables_pre_exit_net, + .exit = nf_tables_exit_net, }; static int __init nf_tables_module_init(void) From 8d6bca156e47d68551750a384b3ff49384c67be3 Mon Sep 17 00:00:00 2001 From: Sven Auhagen Date: Tue, 2 Feb 2021 18:01:16 +0100 Subject: [PATCH 09/45] netfilter: flowtable: fix tcp and udp header checksum update When updating the tcp or udp header checksum on port nat the function inet_proto_csum_replace2 with the last parameter pseudohdr as true. This leads to an error in the case that GRO is used and packets are split up in GSO. The tcp or udp checksum of all packets is incorrect. The error is probably masked due to the fact the most network driver implement tcp/udp checksum offloading. It also only happens when GRO is applied and not on single packets. The error is most visible when using a pppoe connection which is not triggering the tcp/udp checksum offload. Fixes: ac2a66665e23 ("netfilter: add generic flow table infrastructure") Signed-off-by: Sven Auhagen Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 513f78db3cb2f..4a4acbba78ff7 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -399,7 +399,7 @@ static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, return -1; tcph = (void *)(skb_network_header(skb) + thoff); - inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true); + inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false); return 0; } @@ -415,7 +415,7 @@ static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff, udph = (void *)(skb_network_header(skb) + thoff); if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace2(&udph->check, skb, port, - new_port, true); + new_port, false); if (!udph->check) udph->check = CSUM_MANGLED_0; } From 2a80c15812372e554474b1dba0b1d8e467af295d Mon Sep 17 00:00:00 2001 From: Sabyrzhan Tasbolatov Date: Tue, 2 Feb 2021 15:20:59 +0600 Subject: [PATCH 10/45] net/qrtr: restrict user-controlled length in qrtr_tun_write_iter() syzbot found WARNING in qrtr_tun_write_iter [1] when write_iter length exceeds KMALLOC_MAX_SIZE causing order >= MAX_ORDER condition. Additionally, there is no check for 0 length write. [1] WARNING: mm/page_alloc.c:5011 [..] Call Trace: alloc_pages_current+0x18c/0x2a0 mm/mempolicy.c:2267 alloc_pages include/linux/gfp.h:547 [inline] kmalloc_order+0x2e/0xb0 mm/slab_common.c:837 kmalloc_order_trace+0x14/0x120 mm/slab_common.c:853 kmalloc include/linux/slab.h:557 [inline] kzalloc include/linux/slab.h:682 [inline] qrtr_tun_write_iter+0x8a/0x180 net/qrtr/tun.c:83 call_write_iter include/linux/fs.h:1901 [inline] Reported-by: syzbot+c2a7e5c5211605a90865@syzkaller.appspotmail.com Signed-off-by: Sabyrzhan Tasbolatov Link: https://lore.kernel.org/r/20210202092059.1361381-1-snovitoll@gmail.com Signed-off-by: Jakub Kicinski --- net/qrtr/tun.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c index 15ce9b642b25f..b238c40a99842 100644 --- a/net/qrtr/tun.c +++ b/net/qrtr/tun.c @@ -80,6 +80,12 @@ static ssize_t qrtr_tun_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t ret; void *kbuf; + if (!len) + return -EINVAL; + + if (len > KMALLOC_MAX_SIZE) + return -ENOMEM; + kbuf = kzalloc(len, GFP_KERNEL); if (!kbuf) return -ENOMEM; From d795cc02a297df80910cf4ba23147680d15d8a7d Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 3 Feb 2021 23:37:14 +0300 Subject: [PATCH 11/45] selftests/tls: fix selftest with CHACHA20-POLY1305 TLS selftests were broken also because of use of structure that was not exported to UAPI. Fix by defining the union in tests. Fixes: 4f336e88a870 (selftests/tls: add CHACHA20-POLY1305 to tls selftests) Reported-by: Rong Chen Signed-off-by: Vadim Fedorenko Link: https://lore.kernel.org/r/1612384634-5377-1-git-send-email-vfedorenko@novek.ru Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tls.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index e0088c2d38a5d..426d07875a48e 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -133,7 +133,10 @@ FIXTURE_VARIANT_ADD(tls, 13_chacha) FIXTURE_SETUP(tls) { - union tls_crypto_context tls12; + union { + struct tls12_crypto_info_aes_gcm_128 aes128; + struct tls12_crypto_info_chacha20_poly1305 chacha20; + } tls12; struct sockaddr_in addr; socklen_t len; int sfd, ret; @@ -143,14 +146,16 @@ FIXTURE_SETUP(tls) len = sizeof(addr); memset(&tls12, 0, sizeof(tls12)); - tls12.info.version = variant->tls_version; - tls12.info.cipher_type = variant->cipher_type; switch (variant->cipher_type) { case TLS_CIPHER_CHACHA20_POLY1305: - tls12_sz = sizeof(tls12_crypto_info_chacha20_poly1305); + tls12_sz = sizeof(struct tls12_crypto_info_chacha20_poly1305); + tls12.chacha20.info.version = variant->tls_version; + tls12.chacha20.info.cipher_type = variant->cipher_type; break; case TLS_CIPHER_AES_GCM_128: - tls12_sz = sizeof(tls12_crypto_info_aes_gcm_128); + tls12_sz = sizeof(struct tls12_crypto_info_aes_gcm_128); + tls12.aes128.info.version = variant->tls_version; + tls12.aes128.info.cipher_type = variant->cipher_type; break; default: tls12_sz = 0; From ec7d8e7dd3a59528e305a18e93f1cb98f7faf83b Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 2 Feb 2021 08:09:38 +0100 Subject: [PATCH 12/45] xen/netback: avoid race in xenvif_rx_ring_slots_available() Since commit 23025393dbeb3b8b3 ("xen/netback: use lateeoi irq binding") xenvif_rx_ring_slots_available() is no longer called only from the rx queue kernel thread, so it needs to access the rx queue with the associated queue held. Reported-by: Igor Druzhinin Fixes: 23025393dbeb3b8b3 ("xen/netback: use lateeoi irq binding") Signed-off-by: Juergen Gross Acked-by: Wei Liu Link: https://lore.kernel.org/r/20210202070938.7863-1-jgross@suse.com Signed-off-by: Jakub Kicinski --- drivers/net/xen-netback/rx.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/xen-netback/rx.c b/drivers/net/xen-netback/rx.c index b8febe1d1bfd3..accc991d153f7 100644 --- a/drivers/net/xen-netback/rx.c +++ b/drivers/net/xen-netback/rx.c @@ -38,10 +38,15 @@ static bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue) RING_IDX prod, cons; struct sk_buff *skb; int needed; + unsigned long flags; + + spin_lock_irqsave(&queue->rx_queue.lock, flags); skb = skb_peek(&queue->rx_queue); - if (!skb) + if (!skb) { + spin_unlock_irqrestore(&queue->rx_queue.lock, flags); return false; + } needed = DIV_ROUND_UP(skb->len, XEN_PAGE_SIZE); if (skb_is_gso(skb)) @@ -49,6 +54,8 @@ static bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue) if (skb->sw_hash) needed++; + spin_unlock_irqrestore(&queue->rx_queue.lock, flags); + do { prod = queue->rx.sring->req_prod; cons = queue->rx.req_cons; From 3401e4aa43a540881cc97190afead650e709c418 Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Tue, 2 Feb 2021 23:55:11 +0530 Subject: [PATCH 13/45] cxgb4: Add new T6 PCI device id 0x6092 Signed-off-by: Raju Rangoju Link: https://lore.kernel.org/r/20210202182511.8109-1-rajur@chelsio.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h index 0c5373462cedb..0b1b5f9c67d47 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h @@ -219,6 +219,7 @@ CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN CH_PCI_ID_TABLE_FENTRY(0x6089), /* Custom T62100-KR */ CH_PCI_ID_TABLE_FENTRY(0x608a), /* Custom T62100-CR */ CH_PCI_ID_TABLE_FENTRY(0x608b), /* Custom T6225-CR */ + CH_PCI_ID_TABLE_FENTRY(0x6092), /* Custom T62100-CR-LOM */ CH_PCI_DEVICE_ID_TABLE_DEFINE_END; #endif /* __T4_PCI_ID_TBL_H__ */ From 7b5eab57cac45e270a0ad624ba157c5b30b3d44d Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 3 Feb 2021 08:47:56 +0000 Subject: [PATCH 14/45] rxrpc: Fix clearance of Tx/Rx ring when releasing a call At the end of rxrpc_release_call(), rxrpc_cleanup_ring() is called to clear the Rx/Tx skbuff ring, but this doesn't lock the ring whilst it's accessing it. Unfortunately, rxrpc_resend() might be trying to retransmit a packet concurrently with this - and whilst it does lock the ring, this isn't protection against rxrpc_cleanup_call(). Fix this by removing the call to rxrpc_cleanup_ring() from rxrpc_release_call(). rxrpc_cleanup_ring() will be called again anyway from rxrpc_cleanup_call(). The earlier call is just an optimisation to recycle skbuffs more quickly. Alternative solutions include rxrpc_release_call() could try to cancel the work item or wait for it to complete or rxrpc_cleanup_ring() could lock when accessing the ring (which would require a bh lock). This can produce a report like the following: BUG: KASAN: use-after-free in rxrpc_send_data_packet+0x19b4/0x1e70 net/rxrpc/output.c:372 Read of size 4 at addr ffff888011606e04 by task kworker/0:0/5 ... Workqueue: krxrpcd rxrpc_process_call Call Trace: ... kasan_report.cold+0x79/0xd5 mm/kasan/report.c:413 rxrpc_send_data_packet+0x19b4/0x1e70 net/rxrpc/output.c:372 rxrpc_resend net/rxrpc/call_event.c:266 [inline] rxrpc_process_call+0x1634/0x1f60 net/rxrpc/call_event.c:412 process_one_work+0x98d/0x15f0 kernel/workqueue.c:2275 ... Allocated by task 2318: ... sock_alloc_send_pskb+0x793/0x920 net/core/sock.c:2348 rxrpc_send_data+0xb51/0x2bf0 net/rxrpc/sendmsg.c:358 rxrpc_do_sendmsg+0xc03/0x1350 net/rxrpc/sendmsg.c:744 rxrpc_sendmsg+0x420/0x630 net/rxrpc/af_rxrpc.c:560 ... Freed by task 2318: ... kfree_skb+0x140/0x3f0 net/core/skbuff.c:704 rxrpc_free_skb+0x11d/0x150 net/rxrpc/skbuff.c:78 rxrpc_cleanup_ring net/rxrpc/call_object.c:485 [inline] rxrpc_release_call+0x5dd/0x860 net/rxrpc/call_object.c:552 rxrpc_release_calls_on_socket+0x21c/0x300 net/rxrpc/call_object.c:579 rxrpc_release_sock net/rxrpc/af_rxrpc.c:885 [inline] rxrpc_release+0x263/0x5a0 net/rxrpc/af_rxrpc.c:916 __sock_release+0xcd/0x280 net/socket.c:597 ... The buggy address belongs to the object at ffff888011606dc0 which belongs to the cache skbuff_head_cache of size 232 Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") Reported-by: syzbot+174de899852504e4a74a@syzkaller.appspotmail.com Reported-by: syzbot+3d1c772efafd3c38d007@syzkaller.appspotmail.com Signed-off-by: David Howells cc: Hillf Danton Link: https://lore.kernel.org/r/161234207610.653119.5287360098400436976.stgit@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- net/rxrpc/call_object.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index c845594b663fb..4eb91d958a48d 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -548,8 +548,6 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) rxrpc_disconnect_call(call); if (call->security) call->security->free_call_crypto(call); - - rxrpc_cleanup_ring(call); _leave(""); } From 81b8be68ef8e8915d0cc6cedd2ac425c74a24813 Mon Sep 17 00:00:00 2001 From: Xie He Date: Tue, 2 Feb 2021 23:15:41 -0800 Subject: [PATCH 15/45] net: hdlc_x25: Return meaningful error code in x25_open It's not meaningful to pass on LAPB error codes to HDLC code or other parts of the system, because they will not understand the error codes. Instead, use system-wide recognizable error codes. Fixes: f362e5fe0f1f ("wan/hdlc_x25: make lapb params configurable") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Xie He Acked-by: Martin Schiller Link: https://lore.kernel.org/r/20210203071541.86138-1-xie.he.0141@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/wan/hdlc_x25.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/wan/hdlc_x25.c b/drivers/net/wan/hdlc_x25.c index bb164805804e8..4aaa6388b9ee0 100644 --- a/drivers/net/wan/hdlc_x25.c +++ b/drivers/net/wan/hdlc_x25.c @@ -169,11 +169,11 @@ static int x25_open(struct net_device *dev) result = lapb_register(dev, &cb); if (result != LAPB_OK) - return result; + return -ENOMEM; result = lapb_getparms(dev, ¶ms); if (result != LAPB_OK) - return result; + return -EINVAL; if (state(hdlc)->settings.dce) params.mode = params.mode | LAPB_DCE; @@ -188,7 +188,7 @@ static int x25_open(struct net_device *dev) result = lapb_setparms(dev, ¶ms); if (result != LAPB_OK) - return result; + return -EINVAL; return 0; } From 1d23a56b0296d29e7047b41fe0a42a001036160d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 3 Feb 2021 19:06:55 -0600 Subject: [PATCH 16/45] net: ipa: set error code in gsi_channel_setup() In gsi_channel_setup(), we check to see if the configuration data contains any information about channels that are not supported by the hardware. If one is found, we abort the setup process, but the error code (ret) is not set in this case. Fix this bug. Fixes: 650d1603825d8 ("soc: qcom: ipa: the generic software interface") Reported-by: Dan Carpenter Signed-off-by: Alex Elder Link: https://lore.kernel.org/r/20210204010655.15619-1-elder@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/ipa/gsi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ipa/gsi.c b/drivers/net/ipa/gsi.c index 34e5f2155d620..b77f5fef7aeca 100644 --- a/drivers/net/ipa/gsi.c +++ b/drivers/net/ipa/gsi.c @@ -1710,6 +1710,7 @@ static int gsi_channel_setup(struct gsi *gsi) if (!channel->gsi) continue; /* Ignore uninitialized channels */ + ret = -EINVAL; dev_err(gsi->dev, "channel %u not supported by hardware\n", channel_id - 1); channel_id = gsi->channel_count; From 52cbd23a119c6ebf40a527e53f3402d2ea38eccb Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 3 Feb 2021 14:29:52 -0500 Subject: [PATCH 17/45] udp: fix skb_copy_and_csum_datagram with odd segment sizes When iteratively computing a checksum with csum_block_add, track the offset "pos" to correctly rotate in csum_block_add when offset is odd. The open coded implementation of skb_copy_and_csum_datagram did this. With the switch to __skb_datagram_iter calling csum_and_copy_to_iter, pos was reinitialized to 0 on each call. Bring back the pos by passing it along with the csum to the callback. Changes v1->v2 - pass csum value, instead of csump pointer (Alexander Duyck) Link: https://lore.kernel.org/netdev/20210128152353.GB27281@optiplex/ Fixes: 950fcaecd5cc ("datagram: consolidate datagram copy to iter helpers") Reported-by: Oliver Graute Signed-off-by: Willem de Bruijn Reviewed-by: Alexander Duyck Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20210203192952.1849843-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/uio.h | 8 +++++++- lib/iov_iter.c | 24 ++++++++++++++---------- net/core/datagram.c | 12 ++++++++++-- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/include/linux/uio.h b/include/linux/uio.h index 72d88566694ee..27ff8eb786dc3 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -260,7 +260,13 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) { i->count = count; } -size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump, struct iov_iter *i); + +struct csum_state { + __wsum csum; + size_t off; +}; + +size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csstate, struct iov_iter *i); size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp, diff --git a/lib/iov_iter.c b/lib/iov_iter.c index a21e6a5792c5a..f0b2ccb1bb018 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -592,14 +592,15 @@ static __wsum csum_and_memcpy(void *to, const void *from, size_t len, } static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes, - __wsum *csum, struct iov_iter *i) + struct csum_state *csstate, + struct iov_iter *i) { struct pipe_inode_info *pipe = i->pipe; unsigned int p_mask = pipe->ring_size - 1; + __wsum sum = csstate->csum; + size_t off = csstate->off; unsigned int i_head; size_t n, r; - size_t off = 0; - __wsum sum = *csum; if (!sanity(i)) return 0; @@ -621,7 +622,8 @@ static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes, i_head++; } while (n); i->count -= bytes; - *csum = sum; + csstate->csum = sum; + csstate->off = off; return bytes; } @@ -1522,18 +1524,19 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, } EXPORT_SYMBOL(csum_and_copy_from_iter_full); -size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump, +size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate, struct iov_iter *i) { + struct csum_state *csstate = _csstate; const char *from = addr; - __wsum *csum = csump; __wsum sum, next; - size_t off = 0; + size_t off; if (unlikely(iov_iter_is_pipe(i))) - return csum_and_copy_to_pipe_iter(addr, bytes, csum, i); + return csum_and_copy_to_pipe_iter(addr, bytes, _csstate, i); - sum = *csum; + sum = csstate->csum; + off = csstate->off; if (unlikely(iov_iter_is_discard(i))) { WARN_ON(1); /* for now */ return 0; @@ -1561,7 +1564,8 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump, off += v.iov_len; }) ) - *csum = sum; + csstate->csum = sum; + csstate->off = off; return bytes; } EXPORT_SYMBOL(csum_and_copy_to_iter); diff --git a/net/core/datagram.c b/net/core/datagram.c index 81809fa735a78..15ab9ffb27fe9 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -721,8 +721,16 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, __wsum *csump) { - return __skb_datagram_iter(skb, offset, to, len, true, - csum_and_copy_to_iter, csump); + struct csum_state csdata = { .csum = *csump }; + int ret; + + ret = __skb_datagram_iter(skb, offset, to, len, true, + csum_and_copy_to_iter, &csdata); + if (ret) + return ret; + + *csump = csdata.csum; + return 0; } /** From 12bc8dfb83b5292fe387b795210018b7632ee08b Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Wed, 3 Feb 2021 12:36:02 +0100 Subject: [PATCH 18/45] hv_netvsc: Reset the RSC count if NVSP_STAT_FAIL in netvsc_receive() Commit 44144185951a0f ("hv_netvsc: Add validation for untrusted Hyper-V values") added validation to rndis_filter_receive_data() (and rndis_filter_receive()) which introduced NVSP_STAT_FAIL-scenarios where the count is not updated/reset. Fix this omission, and prevent similar scenarios from occurring in the future. Reported-by: Juan Vazquez Signed-off-by: Andrea Parri (Microsoft) Fixes: 44144185951a0f ("hv_netvsc: Add validation for untrusted Hyper-V values") Reviewed-by: Jesse Brandeburg Link: https://lore.kernel.org/r/20210203113602.558916-1-parri.andrea@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/hyperv/netvsc.c | 5 ++++- drivers/net/hyperv/rndis_filter.c | 2 -- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 2350342b961ff..13bd48a75db76 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -1262,8 +1262,11 @@ static int netvsc_receive(struct net_device *ndev, ret = rndis_filter_receive(ndev, net_device, nvchan, data, buflen); - if (unlikely(ret != NVSP_STAT_SUCCESS)) + if (unlikely(ret != NVSP_STAT_SUCCESS)) { + /* Drop incomplete packet */ + nvchan->rsc.cnt = 0; status = NVSP_STAT_FAIL; + } } enq_receive_complete(ndev, net_device, q_idx, diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 598713c0d5a87..3aab2b867fc0d 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -509,8 +509,6 @@ static int rndis_filter_receive_data(struct net_device *ndev, return ret; drop: - /* Drop incomplete packet */ - nvchan->rsc.cnt = 0; return NVSP_STAT_FAIL; } From 07bf34a50e327975b21a9dee64d220c3dcb72ee9 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 4 Feb 2021 15:45:11 +0200 Subject: [PATCH 19/45] net: enetc: initialize the RFS and RSS memories Michael tried to enable Advanced Error Reporting through the ENETC's Root Complex Event Collector, and the system started spitting out single bit correctable ECC errors coming from the ENETC interfaces: pcieport 0000:00:1f.0: AER: Multiple Corrected error received: 0000:00:00.0 fsl_enetc 0000:00:00.0: PCIe Bus Error: severity=Corrected, type=Transaction Layer, (Receiver ID) fsl_enetc 0000:00:00.0: device [1957:e100] error status/mask=00004000/00000000 fsl_enetc 0000:00:00.0: [14] CorrIntErr fsl_enetc 0000:00:00.1: PCIe Bus Error: severity=Corrected, type=Transaction Layer, (Receiver ID) fsl_enetc 0000:00:00.1: device [1957:e100] error status/mask=00004000/00000000 fsl_enetc 0000:00:00.1: [14] CorrIntErr Further investigating the port correctable memory error detect register (PCMEDR) shows that these AER errors have an associated SOURCE_ID of 6 (RFS/RSS): $ devmem 0x1f8010e10 32 0xC0000006 $ devmem 0x1f8050e10 32 0xC0000006 Discussion with the hardware design engineers reveals that on LS1028A, the hardware does not do initialization of that RFS/RSS memory, and that software should clear/initialize the entire table before starting to operate. That comes as a bit of a surprise, since the driver does not do initialization of the RFS memory. Also, the initialization of the Receive Side Scaling is done only partially. Even though the entire ENETC IP has a single shared flow steering memory, the flow steering service should returns matches only for TCAM entries that are within the range of the Station Interface that is doing the search. Therefore, it should be sufficient for a Station Interface to initialize all of its own entries in order to avoid any ECC errors, and only the Station Interfaces in use should need initialization. There are Physical Station Interfaces associated with PCIe PFs and Virtual Station Interfaces associated with PCIe VFs. We let the PF driver initialize the entire port's memory, which includes the RFS entries which are going to be used by the VF. Reported-by: Michael Walle Fixes: d4fd0404c1c9 ("enetc: Introduce basic PF and VF ENETC ethernet drivers") Signed-off-by: Vladimir Oltean Tested-by: Michael Walle Reviewed-by: Jesse Brandeburg Link: https://lore.kernel.org/r/20210204134511.2640309-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/freescale/enetc/enetc_hw.h | 2 + .../net/ethernet/freescale/enetc/enetc_pf.c | 59 +++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_hw.h b/drivers/net/ethernet/freescale/enetc/enetc_hw.h index e1e950d48c92b..c71fe8d751d50 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_hw.h +++ b/drivers/net/ethernet/freescale/enetc/enetc_hw.h @@ -196,6 +196,8 @@ enum enetc_bdr_type {TX, RX}; #define ENETC_CBS_BW_MASK GENMASK(6, 0) #define ENETC_PTCCBSR1(n) (0x1114 + (n) * 8) /* n = 0 to 7*/ #define ENETC_RSSHASH_KEY_SIZE 40 +#define ENETC_PRSSCAPR 0x1404 +#define ENETC_PRSSCAPR_GET_NUM_RSS(val) (BIT((val) & 0xf) * 32) #define ENETC_PRSSK(n) (0x1410 + (n) * 4) /* n = [0..9] */ #define ENETC_PSIVLANFMR 0x1700 #define ENETC_PSIVLANFMR_VS BIT(0) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c index ed8fcb8b486eb..3eb5f1375bd4c 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -996,6 +996,51 @@ static void enetc_phylink_destroy(struct enetc_ndev_priv *priv) phylink_destroy(priv->phylink); } +/* Initialize the entire shared memory for the flow steering entries + * of this port (PF + VFs) + */ +static int enetc_init_port_rfs_memory(struct enetc_si *si) +{ + struct enetc_cmd_rfse rfse = {0}; + struct enetc_hw *hw = &si->hw; + int num_rfs, i, err = 0; + u32 val; + + val = enetc_port_rd(hw, ENETC_PRFSCAPR); + num_rfs = ENETC_PRFSCAPR_GET_NUM_RFS(val); + + for (i = 0; i < num_rfs; i++) { + err = enetc_set_fs_entry(si, &rfse, i); + if (err) + break; + } + + return err; +} + +static int enetc_init_port_rss_memory(struct enetc_si *si) +{ + struct enetc_hw *hw = &si->hw; + int num_rss, err; + int *rss_table; + u32 val; + + val = enetc_port_rd(hw, ENETC_PRSSCAPR); + num_rss = ENETC_PRSSCAPR_GET_NUM_RSS(val); + if (!num_rss) + return 0; + + rss_table = kcalloc(num_rss, sizeof(*rss_table), GFP_KERNEL); + if (!rss_table) + return -ENOMEM; + + err = enetc_set_rss_table(si, rss_table, num_rss); + + kfree(rss_table); + + return err; +} + static int enetc_pf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { @@ -1051,6 +1096,18 @@ static int enetc_pf_probe(struct pci_dev *pdev, goto err_alloc_si_res; } + err = enetc_init_port_rfs_memory(si); + if (err) { + dev_err(&pdev->dev, "Failed to initialize RFS memory\n"); + goto err_init_port_rfs; + } + + err = enetc_init_port_rss_memory(si); + if (err) { + dev_err(&pdev->dev, "Failed to initialize RSS memory\n"); + goto err_init_port_rss; + } + err = enetc_alloc_msix(priv); if (err) { dev_err(&pdev->dev, "MSIX alloc failed\n"); @@ -1079,6 +1136,8 @@ static int enetc_pf_probe(struct pci_dev *pdev, enetc_mdiobus_destroy(pf); err_mdiobus_create: enetc_free_msix(priv); +err_init_port_rss: +err_init_port_rfs: err_alloc_msix: enetc_free_si_resources(priv); err_alloc_si_res: From 8fd54a73b7cda11548154451bdb4bde6d8ff74c7 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 4 Feb 2021 18:33:51 +0200 Subject: [PATCH 20/45] net: dsa: call teardown method on probe failure Since teardown is supposed to undo the effects of the setup method, it should be called in the error path for dsa_switch_setup, not just in dsa_switch_teardown. Fixes: 5e3f847a02aa ("net: dsa: Add teardown callback for drivers") Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20210204163351.2929670-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/dsa2.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index a47e0f9b20d0a..a04fd637b4cdc 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -462,20 +462,23 @@ static int dsa_switch_setup(struct dsa_switch *ds) ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); if (!ds->slave_mii_bus) { err = -ENOMEM; - goto unregister_notifier; + goto teardown; } dsa_slave_mii_bus_init(ds); err = mdiobus_register(ds->slave_mii_bus); if (err < 0) - goto unregister_notifier; + goto teardown; } ds->setup = true; return 0; +teardown: + if (ds->ops->teardown) + ds->ops->teardown(ds); unregister_notifier: dsa_switch_unregister_notifier(ds); unregister_devlink_ports: From 647b8dd5184665432cc8a2b5bca46a201f690c37 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Thu, 4 Feb 2021 20:50:34 +0300 Subject: [PATCH 21/45] selftests: txtimestamp: fix compilation issue PACKET_TX_TIMESTAMP is defined in if_packet.h but it is not included in test. Include it instead of otherwise the error of redefinition arrives. Also fix the compiler warning about ambiguous control flow by adding explicit braces. Fixes: 8fe2f761cae9 ("net-timestamp: expand documentation") Suggested-by: Willem de Bruijn Signed-off-by: Vadim Fedorenko Acked-by: Willem de Bruijn Link: https://lore.kernel.org/r/1612461034-24524-1-git-send-email-vfedorenko@novek.ru Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/txtimestamp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/txtimestamp.c b/tools/testing/selftests/net/txtimestamp.c index 490a8cca708a8..fabb1d555ee5c 100644 --- a/tools/testing/selftests/net/txtimestamp.c +++ b/tools/testing/selftests/net/txtimestamp.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -34,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -495,12 +495,12 @@ static void do_test(int family, unsigned int report_opt) total_len = cfg_payload_len; if (cfg_use_pf_packet || cfg_proto == SOCK_RAW) { total_len += sizeof(struct udphdr); - if (cfg_use_pf_packet || cfg_ipproto == IPPROTO_RAW) + if (cfg_use_pf_packet || cfg_ipproto == IPPROTO_RAW) { if (family == PF_INET) total_len += sizeof(struct iphdr); else total_len += sizeof(struct ipv6hdr); - + } /* special case, only rawv6_sendmsg: * pass proto in sin6_port if not connected * also see ANK comment in net/ipv4/raw.c From 8dc1c444df193701910f5e80b5d4caaf705a8fb0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Feb 2021 13:31:46 -0800 Subject: [PATCH 22/45] net: gro: do not keep too many GRO packets in napi->rx_list Commit c80794323e82 ("net: Fix packet reordering caused by GRO and listified RX cooperation") had the unfortunate effect of adding latencies in common workloads. Before the patch, GRO packets were immediately passed to upper stacks. After the patch, we can accumulate quite a lot of GRO packets (depdending on NAPI budget). My fix is counting in napi->rx_count number of segments instead of number of logical packets. Fixes: c80794323e82 ("net: Fix packet reordering caused by GRO and listified RX cooperation") Signed-off-by: Eric Dumazet Bisected-by: John Sperbeck Tested-by: Jian Yang Cc: Maxim Mikityanskiy Reviewed-by: Saeed Mahameed Reviewed-by: Edward Cree Reviewed-by: Alexander Lobakin Link: https://lore.kernel.org/r/20210204213146.4192368-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index a979b86dbacda..449b45b843d40 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5735,10 +5735,11 @@ static void gro_normal_list(struct napi_struct *napi) /* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, * pass the whole batch up to the stack. */ -static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb) +static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs) { list_add_tail(&skb->list, &napi->rx_list); - if (++napi->rx_count >= gro_normal_batch) + napi->rx_count += segs; + if (napi->rx_count >= gro_normal_batch) gro_normal_list(napi); } @@ -5777,7 +5778,7 @@ static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) } out: - gro_normal_one(napi, skb); + gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); return NET_RX_SUCCESS; } @@ -6067,7 +6068,7 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi, { switch (ret) { case GRO_NORMAL: - gro_normal_one(napi, skb); + gro_normal_one(napi, skb, 1); break; case GRO_DROP: @@ -6155,7 +6156,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, __skb_push(skb, ETH_HLEN); skb->protocol = eth_type_trans(skb, skb->dev); if (ret == GRO_NORMAL) - gro_normal_one(napi, skb); + gro_normal_one(napi, skb, 1); break; case GRO_DROP: From 275a9c72b420e5051b0e92e49b26bef06c196f29 Mon Sep 17 00:00:00 2001 From: Camelia Groza Date: Thu, 4 Feb 2021 18:49:26 +0200 Subject: [PATCH 23/45] dpaa_eth: reserve space for the xdp_frame under the A050385 erratum When the erratum workaround is triggered, the newly created xdp_frame structure is stored at the start of the newly allocated buffer. Avoid the structure from being overwritten by explicitly reserving enough space in the buffer for storing it. Account for the fact that the structure's size might increase in time by aligning the headroom to DPAA_FD_DATA_ALIGNMENT bytes, thus guaranteeing the data's alignment. Fixes: ae680bcbd06a ("dpaa_eth: implement the A050385 erratum workaround for XDP") Signed-off-by: Camelia Groza Acked-by: Maciej Fijalkowski Acked-by: Madalin Bucur Signed-off-by: Jakub Kicinski --- .../net/ethernet/freescale/dpaa/dpaa_eth.c | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 4360ce4d3fb6a..f3a879937d8d8 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2182,6 +2182,7 @@ static int dpaa_a050385_wa_xdpf(struct dpaa_priv *priv, struct xdp_frame *new_xdpf, *xdpf = *init_xdpf; void *new_buff; struct page *p; + int headroom; /* Check the data alignment and make sure the headroom is large * enough to store the xdpf backpointer. Use an aligned headroom @@ -2197,19 +2198,34 @@ static int dpaa_a050385_wa_xdpf(struct dpaa_priv *priv, return 0; } + /* The new xdp_frame is stored in the new buffer. Reserve enough space + * in the headroom for storing it along with the driver's private + * info. The headroom needs to be aligned to DPAA_FD_DATA_ALIGNMENT to + * guarantee the data's alignment in the buffer. + */ + headroom = ALIGN(sizeof(*new_xdpf) + priv->tx_headroom, + DPAA_FD_DATA_ALIGNMENT); + + /* Assure the extended headroom and data don't overflow the buffer, + * while maintaining the mandatory tailroom. + */ + if (headroom + xdpf->len > DPAA_BP_RAW_SIZE - + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) + return -ENOMEM; + p = dev_alloc_pages(0); if (unlikely(!p)) return -ENOMEM; /* Copy the data to the new buffer at a properly aligned offset */ new_buff = page_address(p); - memcpy(new_buff + priv->tx_headroom, xdpf->data, xdpf->len); + memcpy(new_buff + headroom, xdpf->data, xdpf->len); /* Create an XDP frame around the new buffer in a similar fashion * to xdp_convert_buff_to_frame. */ new_xdpf = new_buff; - new_xdpf->data = new_buff + priv->tx_headroom; + new_xdpf->data = new_buff + headroom; new_xdpf->len = xdpf->len; new_xdpf->headroom = priv->tx_headroom; new_xdpf->frame_sz = DPAA_BP_RAW_SIZE; From c2b0e8455eb76135f505dda81a8869e60f37a861 Mon Sep 17 00:00:00 2001 From: Camelia Groza Date: Thu, 4 Feb 2021 18:49:27 +0200 Subject: [PATCH 24/45] dpaa_eth: reduce data alignment requirements for the A050385 erratum The 256 byte data alignment is required for preventing DMA transaction splits when crossing 4K page boundaries. Since XDP deals only with page sized buffers or less, this restriction isn't needed. Instead, the data only needs to be aligned to 64 bytes to prevent DMA transaction splits. These lessened restrictions can increase performance by widening the pool of permitted data alignments and preventing unnecessary realignments. Fixes: ae680bcbd06a ("dpaa_eth: implement the A050385 erratum workaround for XDP") Signed-off-by: Camelia Groza Acked-by: Maciej Fijalkowski Acked-by: Madalin Bucur Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index f3a879937d8d8..2a2c7db234078 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2192,7 +2192,7 @@ static int dpaa_a050385_wa_xdpf(struct dpaa_priv *priv, * byte frame headroom. If the XDP program uses all of it, copy the * data to a new buffer and make room for storing the backpointer. */ - if (PTR_IS_ALIGNED(xdpf->data, DPAA_A050385_ALIGN) && + if (PTR_IS_ALIGNED(xdpf->data, DPAA_FD_DATA_ALIGNMENT) && xdpf->headroom >= priv->tx_headroom) { xdpf->headroom = priv->tx_headroom; return 0; From 0a9946cca1a30b7236a86757da9df2222eb73ee0 Mon Sep 17 00:00:00 2001 From: Camelia Groza Date: Thu, 4 Feb 2021 18:49:28 +0200 Subject: [PATCH 25/45] dpaa_eth: try to move the data in place for the A050385 erratum The XDP frame's headroom might be large enough to accommodate the xdpf backpointer as well as shifting the data to an aligned address. Try this first before resorting to allocating a new buffer and copying the data. Suggested-by: Maciej Fijalkowski Signed-off-by: Camelia Groza Acked-by: Maciej Fijalkowski Acked-by: Madalin Bucur Signed-off-by: Jakub Kicinski --- .../net/ethernet/freescale/dpaa/dpaa_eth.c | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 2a2c7db234078..6faa20bed4885 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2180,8 +2180,9 @@ static int dpaa_a050385_wa_xdpf(struct dpaa_priv *priv, struct xdp_frame **init_xdpf) { struct xdp_frame *new_xdpf, *xdpf = *init_xdpf; - void *new_buff; + void *new_buff, *aligned_data; struct page *p; + u32 data_shift; int headroom; /* Check the data alignment and make sure the headroom is large @@ -2198,6 +2199,23 @@ static int dpaa_a050385_wa_xdpf(struct dpaa_priv *priv, return 0; } + /* Try to move the data inside the buffer just enough to align it and + * store the xdpf backpointer. If the available headroom isn't large + * enough, resort to allocating a new buffer and copying the data. + */ + aligned_data = PTR_ALIGN_DOWN(xdpf->data, DPAA_FD_DATA_ALIGNMENT); + data_shift = xdpf->data - aligned_data; + + /* The XDP frame's headroom needs to be large enough to accommodate + * shifting the data as well as storing the xdpf backpointer. + */ + if (xdpf->headroom >= data_shift + priv->tx_headroom) { + memmove(aligned_data, xdpf->data, xdpf->len); + xdpf->data = aligned_data; + xdpf->headroom = priv->tx_headroom; + return 0; + } + /* The new xdp_frame is stored in the new buffer. Reserve enough space * in the headroom for storing it along with the driver's private * info. The headroom needs to be aligned to DPAA_FD_DATA_ALIGNMENT to From f317e2ea8c88737aa36228167b2292baef3f0430 Mon Sep 17 00:00:00 2001 From: Mohammad Athari Bin Ismail Date: Thu, 4 Feb 2021 22:03:16 +0800 Subject: [PATCH 26/45] net: stmmac: set TxQ mode back to DCB after disabling CBS When disable CBS, mode_to_use parameter is not updated even the operation mode of Tx Queue is changed to Data Centre Bridging (DCB). Therefore, when tc_setup_cbs() function is called to re-enable CBS, the operation mode of Tx Queue remains at DCB, which causing CBS fails to work. This patch updates the value of mode_to_use parameter to MTL_QUEUE_DCB after operation mode of Tx Queue is changed to DCB in stmmac_dma_qmode() callback function. Fixes: 1f705bc61aee ("net: stmmac: Add support for CBS QDISC") Suggested-by: Vinicius Costa Gomes Signed-off-by: Mohammad Athari Bin Ismail Signed-off-by: Song, Yoong Siang Reviewed-by: Jesse Brandeburg Acked-by: Vinicius Costa Gomes Link: https://lore.kernel.org/r/1612447396-20351-1-git-send-email-yoong.siang.song@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c index 8ed3b2c834a09..56985542e2029 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c @@ -324,7 +324,12 @@ static int tc_setup_cbs(struct stmmac_priv *priv, priv->plat->tx_queues_cfg[queue].mode_to_use = MTL_QUEUE_AVB; } else if (!qopt->enable) { - return stmmac_dma_qmode(priv, priv->ioaddr, queue, MTL_QUEUE_DCB); + ret = stmmac_dma_qmode(priv, priv->ioaddr, queue, + MTL_QUEUE_DCB); + if (ret) + return ret; + + priv->plat->tx_queues_cfg[queue].mode_to_use = MTL_QUEUE_DCB; } /* Port Transmit Rate and Speed Divider */ From ef66a1eace968ff22a35f45e6e8ec36b668b6116 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 2 Feb 2021 21:08:02 -0800 Subject: [PATCH 27/45] ibmvnic: Clear failover_pending if unable to schedule Normally we clear the failover_pending flag when processing the reset. But if we are unable to schedule a failover reset we must clear the flag ourselves. We could fail to schedule the reset if we are in PROBING state (eg: when booting via kexec) or because we could not allocate memory. Thanks to Cris Forno for helping isolate the problem and for testing. Fixes: 1d8504937478 ("powerpc/vnic: Extend "failover pending" window") Signed-off-by: Sukadev Bhattiprolu Tested-by: Cristobal Forno Link: https://lore.kernel.org/r/20210203050802.680772-1-sukadev@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index f79034c786c84..a536fdbf05e19 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -4918,7 +4918,22 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, complete(&adapter->init_done); adapter->init_done_rc = -EIO; } - ibmvnic_reset(adapter, VNIC_RESET_FAILOVER); + rc = ibmvnic_reset(adapter, VNIC_RESET_FAILOVER); + if (rc && rc != -EBUSY) { + /* We were unable to schedule the failover + * reset either because the adapter was still + * probing (eg: during kexec) or we could not + * allocate memory. Clear the failover_pending + * flag since no one else will. We ignore + * EBUSY because it means either FAILOVER reset + * is already scheduled or the adapter is + * being removed. + */ + netdev_err(netdev, + "Error %ld scheduling failover reset\n", + rc); + adapter->failover_pending = false; + } break; case IBMVNIC_CRQ_INIT_COMPLETE: dev_info(dev, "Partner initialization complete\n"); From 5d1cbcc990f18edaddddef26677073c4e6fad7b7 Mon Sep 17 00:00:00 2001 From: Norbert Slusarek Date: Fri, 5 Feb 2021 13:12:06 +0100 Subject: [PATCH 28/45] net/vmw_vsock: fix NULL pointer dereference In vsock_stream_connect(), a thread will enter schedule_timeout(). While being scheduled out, another thread can enter vsock_stream_connect() as well and set vsk->transport to NULL. In case a signal was sent, the first thread can leave schedule_timeout() and vsock_transport_cancel_pkt() will be called right after. Inside vsock_transport_cancel_pkt(), a null dereference will happen on transport->cancel_pkt. Fixes: c0cfa2d8a788 ("vsock: add multi-transports support") Signed-off-by: Norbert Slusarek Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/trinity-c2d6cede-bfb1-44e2-85af-1fbc7f541715-1612535117028@3c-app-gmx-bap12 Signed-off-by: Jakub Kicinski --- net/vmw_vsock/af_vsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 6894f21dc1475..cb81cfb47a78c 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1233,7 +1233,7 @@ static int vsock_transport_cancel_pkt(struct vsock_sock *vsk) { const struct vsock_transport *transport = vsk->transport; - if (!transport->cancel_pkt) + if (!transport || !transport->cancel_pkt) return -EOPNOTSUPP; return transport->cancel_pkt(vsk); From 3d0bc44d39bca615b72637e340317b7899b7f911 Mon Sep 17 00:00:00 2001 From: Norbert Slusarek Date: Fri, 5 Feb 2021 13:14:05 +0100 Subject: [PATCH 29/45] net/vmw_vsock: improve locking in vsock_connect_timeout() A possible locking issue in vsock_connect_timeout() was recognized by Eric Dumazet which might cause a null pointer dereference in vsock_transport_cancel_pkt(). This patch assures that vsock_transport_cancel_pkt() will be called within the lock, so a race condition won't occur which could result in vsk->transport to be set to NULL. Fixes: 380feae0def7 ("vsock: cancel packets when failing to connect") Reported-by: Eric Dumazet Signed-off-by: Norbert Slusarek Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/trinity-f8e0937a-cf0e-4d80-a76e-d9a958ba3ef1-1612535522360@3c-app-gmx-bap12 Signed-off-by: Jakub Kicinski --- net/vmw_vsock/af_vsock.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index cb81cfb47a78c..4ea301fc2bf0d 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1243,7 +1243,6 @@ static void vsock_connect_timeout(struct work_struct *work) { struct sock *sk; struct vsock_sock *vsk; - int cancel = 0; vsk = container_of(work, struct vsock_sock, connect_work.work); sk = sk_vsock(vsk); @@ -1254,11 +1253,9 @@ static void vsock_connect_timeout(struct work_struct *work) sk->sk_state = TCP_CLOSE; sk->sk_err = ETIMEDOUT; sk->sk_error_report(sk); - cancel = 1; + vsock_transport_cancel_pkt(vsk); } release_sock(sk); - if (cancel) - vsock_transport_cancel_pkt(vsk); sock_put(sk); } From 225353c070fda18a23785e34e1eec2be508a3a3c Mon Sep 17 00:00:00 2001 From: Shay Agroskin Date: Fri, 5 Feb 2021 21:51:14 +0200 Subject: [PATCH 30/45] net: ena: Update XDP verdict upon failure The verdict returned from ena_xdp_execute() is used to determine the fate of the RX buffer's page. In case of XDP Redirect/TX error the verdict should be set to XDP_ABORTED, otherwise the page won't be freed. Fixes: a318c70ad152 ("net: ena: introduce XDP redirect implementation") Signed-off-by: Shay Agroskin Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 06596fa1f9fea..a0596c073dddc 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -404,6 +404,7 @@ static int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) if (unlikely(!xdpf)) { trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); xdp_stat = &rx_ring->rx_stats.xdp_aborted; + verdict = XDP_ABORTED; break; } @@ -424,7 +425,10 @@ static int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) xdp_stat = &rx_ring->rx_stats.xdp_redirect; break; } - fallthrough; + trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); + xdp_stat = &rx_ring->rx_stats.xdp_aborted; + verdict = XDP_ABORTED; + break; case XDP_ABORTED: trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); xdp_stat = &rx_ring->rx_stats.xdp_aborted; From af8085f3a4712c57d0dd415ad543bac85780375c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 5 Feb 2021 11:36:30 +1100 Subject: [PATCH 31/45] net: fix iteration for sctp transport seq_files The sctp transport seq_file iterators take a reference to the transport in the ->start and ->next functions and releases the reference in the ->show function. The preferred handling for such resources is to release them in the subsequent ->next or ->stop function call. Since Commit 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code and interface") there is no guarantee that ->show will be called after ->next, so this function can now leak references. So move the sctp_transport_put() call to ->next and ->stop. Fixes: 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code and interface") Reported-by: Xin Long Signed-off-by: NeilBrown Acked-by: Marcelo Ricardo Leitner Signed-off-by: Jakub Kicinski --- net/sctp/proc.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/net/sctp/proc.c b/net/sctp/proc.c index f7da88ae20a57..982a87b3e11f8 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -215,6 +215,12 @@ static void sctp_transport_seq_stop(struct seq_file *seq, void *v) { struct sctp_ht_iter *iter = seq->private; + if (v && v != SEQ_START_TOKEN) { + struct sctp_transport *transport = v; + + sctp_transport_put(transport); + } + sctp_transport_walk_stop(&iter->hti); } @@ -222,6 +228,12 @@ static void *sctp_transport_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sctp_ht_iter *iter = seq->private; + if (v && v != SEQ_START_TOKEN) { + struct sctp_transport *transport = v; + + sctp_transport_put(transport); + } + ++*pos; return sctp_transport_get_next(seq_file_net(seq), &iter->hti); @@ -277,8 +289,6 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) sk->sk_rcvbuf); seq_printf(seq, "\n"); - sctp_transport_put(transport); - return 0; } @@ -354,8 +364,6 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); } - sctp_transport_put(transport); - return 0; } From ce7536bc7398e2ae552d2fabb7e0e371a9f1fe46 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 8 Feb 2021 15:44:54 +0100 Subject: [PATCH 32/45] vsock/virtio: update credit only if socket is not closed If the socket is closed or is being released, some resources used by virtio_transport_space_update() such as 'vsk->trans' may be released. To avoid a use after free bug we should only update the available credit when we are sure the socket is still open and we have the lock held. Fixes: 06a8fc78367d ("VSOCK: Introduce virtio_vsock_common.ko") Signed-off-by: Stefano Garzarella Acked-by: Michael S. Tsirkin Link: https://lore.kernel.org/r/20210208144454.84438-1-sgarzare@redhat.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/virtio_transport_common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 5956939eebb78..e4370b1b74947 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1130,8 +1130,6 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, vsk = vsock_sk(sk); - space_available = virtio_transport_space_update(sk, pkt); - lock_sock(sk); /* Check if sk has been closed before lock_sock */ @@ -1142,6 +1140,8 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, goto free_pkt; } + space_available = virtio_transport_space_update(sk, pkt); + /* Update CID in case it has changed after a transport reset event */ vsk->local_addr.svm_cid = dst.svm_cid; From 07998281c268592963e1cd623fe6ab0270b65ae4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 5 Feb 2021 12:56:43 +0100 Subject: [PATCH 33/45] netfilter: conntrack: skip identical origin tuple in same zone only The origin skip check needs to re-test the zone. Else, we might skip a colliding tuple in the reply direction. This only occurs when using 'directional zones' where origin tuples reside in different zones but the reply tuples share the same zone. This causes the new conntrack entry to be dropped at confirmation time because NAT clash resolution was elided. Fixes: 4e35c1cb9460240 ("netfilter: nf_nat: skip nat clash resolution for same-origin entries") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 234b7cab37c30..ff0168736f6ea 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1229,7 +1229,8 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, * Let nf_ct_resolve_clash() deal with this later. */ if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) && + nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) continue; NF_CT_STAT_INC_ATOMIC(net, found); From 664899e85c1312e51d2761e7f8b2f25d053e8489 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 8 Feb 2021 13:20:47 +0100 Subject: [PATCH 34/45] netfilter: nftables: relax check for stateful expressions in set definition Restore the original behaviour where users are allowed to add an element with any stateful expression if the set definition specifies no stateful expressions. Make sure upper maximum number of stateful expressions of NFT_SET_EXPR_MAX is not reached. Fixes: 8cfd9b0f8515 ("netfilter: nftables: generalize set expressions support") Fixes: 48b0ae046ee9 ("netfilter: nftables: netlink support for several set element expressions") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 43fe80f10313c..8ee9f40cc0ea2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5281,6 +5281,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr_array[NFT_SET_EXPR_MAX] = {}; struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; u8 genmask = nft_genmask_next(ctx->net); + u32 flags = 0, size = 0, num_exprs = 0; struct nft_set_ext_tmpl tmpl; struct nft_set_ext *ext, *ext2; struct nft_set_elem elem; @@ -5290,7 +5291,6 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_data_desc desc; enum nft_registers dreg; struct nft_trans *trans; - u32 flags = 0, size = 0; u64 timeout; u64 expiration; int err, i; @@ -5356,7 +5356,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_EXPR]) { struct nft_expr *expr; - if (set->num_exprs != 1) + if (set->num_exprs && set->num_exprs != 1) return -EOPNOTSUPP; expr = nft_set_elem_expr_alloc(ctx, set, @@ -5365,8 +5365,9 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return PTR_ERR(expr); expr_array[0] = expr; + num_exprs = 1; - if (set->exprs[0] && set->exprs[0]->ops != expr->ops) { + if (set->num_exprs && set->exprs[0]->ops != expr->ops) { err = -EOPNOTSUPP; goto err_set_elem_expr; } @@ -5375,12 +5376,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nlattr *tmp; int left; - if (set->num_exprs == 0) - return -EOPNOTSUPP; - i = 0; nla_for_each_nested(tmp, nla[NFTA_SET_ELEM_EXPRESSIONS], left) { - if (i == set->num_exprs) { + if (i == NFT_SET_EXPR_MAX || + (set->num_exprs && set->num_exprs == i)) { err = -E2BIG; goto err_set_elem_expr; } @@ -5394,14 +5393,15 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err_set_elem_expr; } expr_array[i] = expr; + num_exprs++; - if (expr->ops != set->exprs[i]->ops) { + if (set->num_exprs && expr->ops != set->exprs[i]->ops) { err = -EOPNOTSUPP; goto err_set_elem_expr; } i++; } - if (set->num_exprs != i) { + if (set->num_exprs && set->num_exprs != i) { err = -EOPNOTSUPP; goto err_set_elem_expr; } @@ -5409,6 +5409,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = nft_set_elem_expr_clone(ctx, set, expr_array); if (err < 0) goto err_set_elem_expr_clone; + + num_exprs = set->num_exprs; } err = nft_setelem_parse_key(ctx, set, &elem.key.val, @@ -5433,8 +5435,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); } - if (set->num_exprs) { - for (i = 0; i < set->num_exprs; i++) + if (num_exprs) { + for (i = 0; i < num_exprs; i++) size += expr_array[i]->ops->size; nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPRESSIONS, @@ -5522,7 +5524,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, *nft_set_ext_obj(ext) = obj; obj->use++; } - for (i = 0; i < set->num_exprs; i++) + for (i = 0; i < num_exprs; i++) nft_set_elem_expr_setup(ext, i, expr_array); trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); @@ -5584,7 +5586,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err_parse_key: nft_data_release(&elem.key.val, NFT_DATA_VALUE); err_set_elem_expr: - for (i = 0; i < set->num_exprs && expr_array[i]; i++) + for (i = 0; i < num_exprs && expr_array[i]; i++) nft_expr_destroy(ctx, expr_array[i]); err_set_elem_expr_clone: return err; From 3aa6bce9af0e25b735c9c1263739a5639a336ae8 Mon Sep 17 00:00:00 2001 From: Edwin Peer Date: Fri, 5 Feb 2021 17:37:32 -0800 Subject: [PATCH 35/45] net: watchdog: hold device global xmit lock during tx disable Prevent netif_tx_disable() running concurrently with dev_watchdog() by taking the device global xmit lock. Otherwise, the recommended: netif_carrier_off(dev); netif_tx_disable(dev); driver shutdown sequence can happen after the watchdog has already checked carrier, resulting in possible false alarms. This is because netif_tx_lock() only sets the frozen bit without maintaining the locks on the individual queues. Fixes: c3f26a269c24 ("netdev: Fix lockdep warnings in multiqueue configurations.") Signed-off-by: Edwin Peer Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 259be67644e35..5ff27c12ce688 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4352,6 +4352,7 @@ static inline void netif_tx_disable(struct net_device *dev) local_bh_disable(); cpu = smp_processor_id(); + spin_lock(&dev->tx_global_lock); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); @@ -4359,6 +4360,7 @@ static inline void netif_tx_disable(struct net_device *dev) netif_tx_stop_queue(txq); __netif_tx_unlock(txq); } + spin_unlock(&dev->tx_global_lock); local_bh_enable(); } From b2bdba1cbc84cadb14393d0101a5bfd38d342e0a Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sat, 6 Feb 2021 22:47:33 +0100 Subject: [PATCH 36/45] bridge: mrp: Fix the usage of br_mrp_port_switchdev_set_state The function br_mrp_port_switchdev_set_state was called both with MRP port state and STP port state, which is an issue because they don't match exactly. Therefore, update the function to be used only with STP port state and use the id SWITCHDEV_ATTR_ID_PORT_STP_STATE. The choice of using STP over MRP is that the drivers already implement SWITCHDEV_ATTR_ID_PORT_STP_STATE and already in SW we update the port STP state. Fixes: 9a9f26e8f7ea30 ("bridge: mrp: Connect MRP API with the switchdev API") Fixes: fadd409136f0f2 ("bridge: switchdev: mrp: Implement MRP API for switchdev") Fixes: 2f1a11ae11d222 ("bridge: mrp: Add MRP interface.") Reported-by: Rasmus Villemoes Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/br_mrp.c | 9 ++++++--- net/bridge/br_mrp_switchdev.c | 7 +++---- net/bridge/br_private_mrp.h | 3 +-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index cec2c4e4561d0..5aeae6ad17b37 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -557,19 +557,22 @@ int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance) int br_mrp_set_port_state(struct net_bridge_port *p, enum br_mrp_port_state_type state) { + u32 port_state; + if (!p || !(p->flags & BR_MRP_AWARE)) return -EINVAL; spin_lock_bh(&p->br->lock); if (state == BR_MRP_PORT_STATE_FORWARDING) - p->state = BR_STATE_FORWARDING; + port_state = BR_STATE_FORWARDING; else - p->state = BR_STATE_BLOCKING; + port_state = BR_STATE_BLOCKING; + p->state = port_state; spin_unlock_bh(&p->br->lock); - br_mrp_port_switchdev_set_state(p, state); + br_mrp_port_switchdev_set_state(p, port_state); return 0; } diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c index ed547e03ace17..75a7e8d0a2685 100644 --- a/net/bridge/br_mrp_switchdev.c +++ b/net/bridge/br_mrp_switchdev.c @@ -169,13 +169,12 @@ int br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp, return err; } -int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, - enum br_mrp_port_state_type state) +int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, u32 state) { struct switchdev_attr attr = { .orig_dev = p->dev, - .id = SWITCHDEV_ATTR_ID_MRP_PORT_STATE, - .u.mrp_port_state = state, + .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE, + .u.stp_state = state, }; int err; diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h index 32a48e5418dac..2514954c14316 100644 --- a/net/bridge/br_private_mrp.h +++ b/net/bridge/br_private_mrp.h @@ -72,8 +72,7 @@ int br_mrp_switchdev_set_ring_state(struct net_bridge *br, struct br_mrp *mrp, int br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp, u32 interval, u8 max_miss, u32 period, bool monitor); -int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, - enum br_mrp_port_state_type state); +int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, u32 state); int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, enum br_mrp_port_role_type role); int br_mrp_switchdev_set_in_role(struct net_bridge *br, struct br_mrp *mrp, From 059d2a1004981dce19f0127dabc1b4ec927d202a Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sat, 6 Feb 2021 22:47:34 +0100 Subject: [PATCH 37/45] switchdev: mrp: Remove SWITCHDEV_ATTR_ID_MRP_PORT_STAT Now that MRP started to use also SWITCHDEV_ATTR_ID_PORT_STP_STATE to notify HW, then SWITCHDEV_ATTR_ID_MRP_PORT_STAT is not used anywhere else, therefore we can remove it. Fixes: c284b545900830 ("switchdev: mrp: Extend switchdev API to offload MRP") Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/net/switchdev.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 99cd538d65191..afdf8bd1b4fe5 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -42,7 +42,6 @@ enum switchdev_attr_id { SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED, SWITCHDEV_ATTR_ID_BRIDGE_MROUTER, #if IS_ENABLED(CONFIG_BRIDGE_MRP) - SWITCHDEV_ATTR_ID_MRP_PORT_STATE, SWITCHDEV_ATTR_ID_MRP_PORT_ROLE, #endif }; @@ -62,7 +61,6 @@ struct switchdev_attr { u16 vlan_protocol; /* BRIDGE_VLAN_PROTOCOL */ bool mc_disabled; /* MC_DISABLED */ #if IS_ENABLED(CONFIG_BRIDGE_MRP) - u8 mrp_port_state; /* MRP_PORT_STATE */ u8 mrp_port_role; /* MRP_PORT_ROLE */ #endif } u; From eb4733d7cffc547e08fe5a216e4f03663bb71108 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 8 Feb 2021 19:36:27 +0200 Subject: [PATCH 38/45] net: dsa: felix: implement port flushing on .phylink_mac_link_down There are several issues which may be seen when the link goes down while forwarding traffic, all of which can be attributed to the fact that the port flushing procedure from the reference manual was not closely followed. With flow control enabled on both the ingress port and the egress port, it may happen when a link goes down that Ethernet packets are in flight. In flow control mode, frames are held back and not dropped. When there is enough traffic in flight (example: iperf3 TCP), then the ingress port might enter congestion and never exit that state. This is a problem, because it is the egress port's link that went down, and that has caused the inability of the ingress port to send packets to any other port. This is solved by flushing the egress port's queues when it goes down. There is also a problem when performing stream splitting for IEEE 802.1CB traffic (not yet upstream, but a sort of multicast, basically). There, if one port from the destination ports mask goes down, splitting the stream towards the other destinations will no longer be performed. This can be traced down to this line: ocelot_port_writel(ocelot_port, 0, DEV_MAC_ENA_CFG); which should have been instead, as per the reference manual: ocelot_port_rmwl(ocelot_port, 0, DEV_MAC_ENA_CFG_RX_ENA, DEV_MAC_ENA_CFG); Basically only DEV_MAC_ENA_CFG_RX_ENA should be disabled, but not DEV_MAC_ENA_CFG_TX_ENA - I don't have further insight into why that is the case, but apparently multicasting to several ports will cause issues if at least one of them doesn't have DEV_MAC_ENA_CFG_TX_ENA set. I am not sure what the state of the Ocelot VSC7514 driver is, but probably not as bad as Felix/Seville, since VSC7514 uses phylib and has the following in ocelot_adjust_link: if (!phydev->link) return; therefore the port is not really put down when the link is lost, unlike the DSA drivers which use .phylink_mac_link_down for that. Nonetheless, I put ocelot_port_flush() in the common ocelot.c because it needs to access some registers from drivers/net/ethernet/mscc/ocelot_rew.h which are not exported in include/soc/mscc/ and a bugfix patch should probably not move headers around. Fixes: bdeced75b13f ("net: dsa: felix: Add PCS operations for PHYLINK") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix.c | 17 ++++++++- drivers/net/ethernet/mscc/ocelot.c | 54 +++++++++++++++++++++++++++ drivers/net/ethernet/mscc/ocelot_io.c | 8 ++++ include/soc/mscc/ocelot.h | 2 + 4 files changed, 80 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 7dc230677b78e..45fdb1256dbfe 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -233,9 +233,24 @@ static void felix_phylink_mac_link_down(struct dsa_switch *ds, int port, { struct ocelot *ocelot = ds->priv; struct ocelot_port *ocelot_port = ocelot->ports[port]; + int err; + + ocelot_port_rmwl(ocelot_port, 0, DEV_MAC_ENA_CFG_RX_ENA, + DEV_MAC_ENA_CFG); - ocelot_port_writel(ocelot_port, 0, DEV_MAC_ENA_CFG); ocelot_fields_write(ocelot, port, QSYS_SWITCH_PORT_MODE_PORT_ENA, 0); + + err = ocelot_port_flush(ocelot, port); + if (err) + dev_err(ocelot->dev, "failed to flush port %d: %d\n", + port, err); + + /* Put the port in reset. */ + ocelot_port_writel(ocelot_port, + DEV_CLOCK_CFG_MAC_TX_RST | + DEV_CLOCK_CFG_MAC_RX_RST | + DEV_CLOCK_CFG_LINK_SPEED(OCELOT_SPEED_1000), + DEV_CLOCK_CFG); } static void felix_phylink_mac_link_up(struct dsa_switch *ds, int port, diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index ff87a0bc089cf..c072eb5c07646 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -375,6 +375,60 @@ static void ocelot_vlan_init(struct ocelot *ocelot) } } +static u32 ocelot_read_eq_avail(struct ocelot *ocelot, int port) +{ + return ocelot_read_rix(ocelot, QSYS_SW_STATUS, port); +} + +int ocelot_port_flush(struct ocelot *ocelot, int port) +{ + int err, val; + + /* Disable dequeuing from the egress queues */ + ocelot_rmw_rix(ocelot, QSYS_PORT_MODE_DEQUEUE_DIS, + QSYS_PORT_MODE_DEQUEUE_DIS, + QSYS_PORT_MODE, port); + + /* Disable flow control */ + ocelot_fields_write(ocelot, port, SYS_PAUSE_CFG_PAUSE_ENA, 0); + + /* Disable priority flow control */ + ocelot_fields_write(ocelot, port, + QSYS_SWITCH_PORT_MODE_TX_PFC_ENA, 0); + + /* Wait at least the time it takes to receive a frame of maximum length + * at the port. + * Worst-case delays for 10 kilobyte jumbo frames are: + * 8 ms on a 10M port + * 800 μs on a 100M port + * 80 μs on a 1G port + * 32 μs on a 2.5G port + */ + usleep_range(8000, 10000); + + /* Disable half duplex backpressure. */ + ocelot_rmw_rix(ocelot, 0, SYS_FRONT_PORT_MODE_HDX_MODE, + SYS_FRONT_PORT_MODE, port); + + /* Flush the queues associated with the port. */ + ocelot_rmw_gix(ocelot, REW_PORT_CFG_FLUSH_ENA, REW_PORT_CFG_FLUSH_ENA, + REW_PORT_CFG, port); + + /* Enable dequeuing from the egress queues. */ + ocelot_rmw_rix(ocelot, 0, QSYS_PORT_MODE_DEQUEUE_DIS, QSYS_PORT_MODE, + port); + + /* Wait until flushing is complete. */ + err = read_poll_timeout(ocelot_read_eq_avail, val, !val, + 100, 2000000, false, ocelot, port); + + /* Clear flushing again. */ + ocelot_rmw_gix(ocelot, 0, REW_PORT_CFG_FLUSH_ENA, REW_PORT_CFG, port); + + return err; +} +EXPORT_SYMBOL(ocelot_port_flush); + void ocelot_adjust_link(struct ocelot *ocelot, int port, struct phy_device *phydev) { diff --git a/drivers/net/ethernet/mscc/ocelot_io.c b/drivers/net/ethernet/mscc/ocelot_io.c index 0acb459484185..ea4e83410fe4d 100644 --- a/drivers/net/ethernet/mscc/ocelot_io.c +++ b/drivers/net/ethernet/mscc/ocelot_io.c @@ -71,6 +71,14 @@ void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg) } EXPORT_SYMBOL(ocelot_port_writel); +void ocelot_port_rmwl(struct ocelot_port *port, u32 val, u32 mask, u32 reg) +{ + u32 cur = ocelot_port_readl(port, reg); + + ocelot_port_writel(port, (cur & (~mask)) | val, reg); +} +EXPORT_SYMBOL(ocelot_port_rmwl); + u32 __ocelot_target_read_ix(struct ocelot *ocelot, enum ocelot_target target, u32 reg, u32 offset) { diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 2f4cd3288bccc..c34b9ccb64722 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -709,6 +709,7 @@ struct ocelot_policer { /* I/O */ u32 ocelot_port_readl(struct ocelot_port *port, u32 reg); void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg); +void ocelot_port_rmwl(struct ocelot_port *port, u32 val, u32 mask, u32 reg); u32 __ocelot_read_ix(struct ocelot *ocelot, u32 reg, u32 offset); void __ocelot_write_ix(struct ocelot *ocelot, u32 val, u32 reg, u32 offset); void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask, u32 reg, @@ -737,6 +738,7 @@ int ocelot_get_sset_count(struct ocelot *ocelot, int port, int sset); int ocelot_get_ts_info(struct ocelot *ocelot, int port, struct ethtool_ts_info *info); void ocelot_set_ageing_time(struct ocelot *ocelot, unsigned int msecs); +int ocelot_port_flush(struct ocelot *ocelot, int port); void ocelot_adjust_link(struct ocelot *ocelot, int port, struct phy_device *phydev); int ocelot_port_vlan_filtering(struct ocelot *ocelot, int port, bool enabled, From 67a69f84cab60484f02eb8cbc7a76edffbb28a25 Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Tue, 9 Feb 2021 17:03:05 +0800 Subject: [PATCH 39/45] net: hns3: add a check for queue_id in hclge_reset_vf_queue() The queue_id is received from vf, if use it directly, an out-of-bound issue may be caused, so add a check for this queue_id before using it in hclge_reset_vf_queue(). Fixes: 1a426f8b40fc ("net: hns3: fix the VF queue reset flow error") Signed-off-by: Yufeng Mo Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index c242883fea5db..48549db23c524 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -9813,12 +9813,19 @@ int hclge_reset_tqp(struct hnae3_handle *handle, u16 queue_id) void hclge_reset_vf_queue(struct hclge_vport *vport, u16 queue_id) { + struct hnae3_handle *handle = &vport->nic; struct hclge_dev *hdev = vport->back; int reset_try_times = 0; int reset_status; u16 queue_gid; int ret; + if (queue_id >= handle->kinfo.num_tqps) { + dev_warn(&hdev->pdev->dev, "Invalid vf queue id(%u)\n", + queue_id); + return; + } + queue_gid = hclge_covert_handle_qid_global(&vport->nic, queue_id); ret = hclge_send_reset_tqp_cmd(hdev, queue_gid, true); From 326334aad024a60f46dc5e7dbe1efe32da3ca66f Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Tue, 9 Feb 2021 17:03:06 +0800 Subject: [PATCH 40/45] net: hns3: add a check for tqp_index in hclge_get_ring_chain_from_mbx() The tqp_index is received from vf, if use it directly, an out-of-bound issue may be caused, so add a check for this tqp_index before using it in hclge_get_ring_chain_from_mbx(). Fixes: 84e095d64ed9 ("net: hns3: Change PF to add ring-vect binding & resetQ to mailbox") Signed-off-by: Yufeng Mo Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- .../ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c index 754c09ada9019..ea2dea9902831 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c @@ -158,21 +158,31 @@ static int hclge_get_ring_chain_from_mbx( struct hclge_vport *vport) { struct hnae3_ring_chain_node *cur_chain, *new_chain; + struct hclge_dev *hdev = vport->back; int ring_num; - int i = 0; + int i; ring_num = req->msg.ring_num; if (ring_num > HCLGE_MBX_MAX_RING_CHAIN_PARAM_NUM) return -ENOMEM; + for (i = 0; i < ring_num; i++) { + if (req->msg.param[i].tqp_index >= vport->nic.kinfo.rss_size) { + dev_err(&hdev->pdev->dev, "tqp index(%u) is out of range(0-%u)\n", + req->msg.param[i].tqp_index, + vport->nic.kinfo.rss_size - 1); + return -EINVAL; + } + } + hnae3_set_bit(ring_chain->flag, HNAE3_RING_TYPE_B, - req->msg.param[i].ring_type); + req->msg.param[0].ring_type); ring_chain->tqp_index = hclge_get_queue_id(vport->nic.kinfo.tqp - [req->msg.param[i].tqp_index]); + [req->msg.param[0].tqp_index]); hnae3_set_field(ring_chain->int_gl_idx, HNAE3_RING_GL_IDX_M, - HNAE3_RING_GL_IDX_S, req->msg.param[i].int_gl_index); + HNAE3_RING_GL_IDX_S, req->msg.param[0].int_gl_index); cur_chain = ring_chain; From 532cfc0df1e4d68e74522ef4a0dcbf6ebbe68287 Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Tue, 9 Feb 2021 17:03:07 +0800 Subject: [PATCH 41/45] net: hns3: add a check for index in hclge_get_rss_key() The index is received from vf, if use it directly, an out-of-bound issue may be caused, so add a check for this index before using it in hclge_get_rss_key(). Fixes: a638b1d8cc87 ("net: hns3: fix get VF RSS issue") Signed-off-by: Yufeng Mo Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c index ea2dea9902831..ffb416e088a97 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c @@ -607,6 +607,17 @@ static void hclge_get_rss_key(struct hclge_vport *vport, index = mbx_req->msg.data[0]; + /* Check the query index of rss_hash_key from VF, make sure no + * more than the size of rss_hash_key. + */ + if (((index + 1) * HCLGE_RSS_MBX_RESP_LEN) > + sizeof(vport[0].rss_hash_key)) { + dev_warn(&hdev->pdev->dev, + "failed to get the rss hash key, the index(%u) invalid !\n", + index); + return; + } + memcpy(resp_msg->data, &hdev->vport[0].rss_hash_key[index * HCLGE_RSS_MBX_RESP_LEN], HCLGE_RSS_MBX_RESP_LEN); From 1c5fae9c9a092574398a17facc31c533791ef232 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 9 Feb 2021 09:52:19 +0100 Subject: [PATCH 42/45] vsock: fix locking in vsock_shutdown() In vsock_shutdown() we touched some socket fields without holding the socket lock, such as 'state' and 'sk_flags'. Also, after the introduction of multi-transport, we are accessing 'vsk->transport' in vsock_send_shutdown() without holding the lock and this call can be made while the connection is in progress, so the transport can change in the meantime. To avoid issues, we hold the socket lock when we enter in vsock_shutdown() and release it when we leave. Among the transports that implement the 'shutdown' callback, only hyperv_transport acquired the lock. Since the caller now holds it, we no longer take it. Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 8 +++++--- net/vmw_vsock/hyperv_transport.c | 4 ---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 4ea301fc2bf0d..5546710d8ac1a 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -943,10 +943,12 @@ static int vsock_shutdown(struct socket *sock, int mode) */ sk = sock->sk; + + lock_sock(sk); if (sock->state == SS_UNCONNECTED) { err = -ENOTCONN; if (sk->sk_type == SOCK_STREAM) - return err; + goto out; } else { sock->state = SS_DISCONNECTING; err = 0; @@ -955,10 +957,8 @@ static int vsock_shutdown(struct socket *sock, int mode) /* Receive and send shutdowns are treated alike. */ mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN); if (mode) { - lock_sock(sk); sk->sk_shutdown |= mode; sk->sk_state_change(sk); - release_sock(sk); if (sk->sk_type == SOCK_STREAM) { sock_reset_flag(sk, SOCK_DONE); @@ -966,6 +966,8 @@ static int vsock_shutdown(struct socket *sock, int mode) } } +out: + release_sock(sk); return err; } diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 630b851f8150f..cc3bae2659e79 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -474,14 +474,10 @@ static void hvs_shutdown_lock_held(struct hvsock *hvs, int mode) static int hvs_shutdown(struct vsock_sock *vsk, int mode) { - struct sock *sk = sk_vsock(vsk); - if (!(mode & SEND_SHUTDOWN)) return 0; - lock_sock(sk); hvs_shutdown_lock_held(vsk->trans, mode); - release_sock(sk); return 0; } From ee114dd64c0071500345439fc79dd5e0f9d106ed Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 5 Feb 2021 17:20:14 +0100 Subject: [PATCH 43/45] bpf: Fix verifier jsgt branch analysis on max bound Fix incorrect is_branch{32,64}_taken() analysis for the jsgt case. The return code for both will tell the caller whether a given conditional jump is taken or not, e.g. 1 means branch will be taken [for the involved registers] and the goto target will be executed, 0 means branch will not be taken and instead we fall-through to the next insn, and last but not least a -1 denotes that it is not known at verification time whether a branch will be taken or not. Now while the jsgt has the branch-taken case correct with reg->s32_min_value > sval, the branch-not-taken case is off-by-one when testing for reg->s32_max_value < sval since the branch will also be taken for reg->s32_max_value == sval. The jgt branch analysis, for example, gets this right. Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") Fixes: 4f7b3e82589e ("bpf: improve verifier branch analysis") Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e7368c5eacb7b..67ea0ac3333c6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6877,7 +6877,7 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) case BPF_JSGT: if (reg->s32_min_value > sval) return 1; - else if (reg->s32_max_value < sval) + else if (reg->s32_max_value <= sval) return 0; break; case BPF_JLT: @@ -6950,7 +6950,7 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) case BPF_JSGT: if (reg->smin_value > sval) return 1; - else if (reg->smax_value < sval) + else if (reg->smax_value <= sval) return 0; break; case BPF_JLT: From fd675184fc7abfd1e1c52d23e8e900676b5a1c1a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 5 Feb 2021 20:48:21 +0100 Subject: [PATCH 44/45] bpf: Fix verifier jmp32 pruning decision logic Anatoly has been fuzzing with kBdysch harness and reported a hang in one of the outcomes: func#0 @0 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 0: (b7) r0 = 808464450 1: R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R10=fp0 1: (b4) w4 = 808464432 2: R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R4_w=invP808464432 R10=fp0 2: (9c) w4 %= w0 3: R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R10=fp0 3: (66) if w4 s> 0x30303030 goto pc+0 R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff),s32_max_value=808464432) R10=fp0 4: R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff),s32_max_value=808464432) R10=fp0 4: (7f) r0 >>= r0 5: R0_w=invP(id=0) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff),s32_max_value=808464432) R10=fp0 5: (9c) w4 %= w0 6: R0_w=invP(id=0) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 6: (66) if w0 s> 0x3030 goto pc+0 R0_w=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 7: R0=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4=invP(id=0) R10=fp0 7: (d6) if w0 s<= 0x303030 goto pc+1 9: R0=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4=invP(id=0) R10=fp0 9: (95) exit propagating r0 from 6 to 7: safe 4: R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umin_value=808464433,umax_value=2147483647,var_off=(0x0; 0x7fffffff)) R10=fp0 4: (7f) r0 >>= r0 5: R0_w=invP(id=0) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umin_value=808464433,umax_value=2147483647,var_off=(0x0; 0x7fffffff)) R10=fp0 5: (9c) w4 %= w0 6: R0_w=invP(id=0) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 6: (66) if w0 s> 0x3030 goto pc+0 R0_w=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 propagating r0 7: safe propagating r0 from 6 to 7: safe processed 15 insns (limit 1000000) max_states_per_insn 0 total_states 1 peak_states 1 mark_read 1 The underlying program was xlated as follows: # bpftool p d x i 10 0: (b7) r0 = 808464450 1: (b4) w4 = 808464432 2: (bc) w0 = w0 3: (15) if r0 == 0x0 goto pc+1 4: (9c) w4 %= w0 5: (66) if w4 s> 0x30303030 goto pc+0 6: (7f) r0 >>= r0 7: (bc) w0 = w0 8: (15) if r0 == 0x0 goto pc+1 9: (9c) w4 %= w0 10: (66) if w0 s> 0x3030 goto pc+0 11: (d6) if w0 s<= 0x303030 goto pc+1 12: (05) goto pc-1 13: (95) exit The verifier rewrote original instructions it recognized as dead code with 'goto pc-1', but reality differs from verifier simulation in that we are actually able to trigger a hang due to hitting the 'goto pc-1' instructions. Taking a closer look at the verifier analysis, the reason is that it misjudges its pruning decision at the first 'from 6 to 7: safe' occasion. What happens is that while both old/cur registers are marked as precise, they get misjudged for the jmp32 case as range_within() yields true, meaning that the prior verification path with a wider register bound could be verified successfully and therefore the current path with a narrower register bound is deemed safe as well whereas in reality it's not. R0 old/cur path's bounds compare as follows: old: smin_value=0x8000000000000000,smax_value=0x7fffffffffffffff,umin_value=0x0,umax_value=0xffffffffffffffff,var_off=(0x0; 0xffffffffffffffff) cur: smin_value=0x8000000000000000,smax_value=0x7fffffff7fffffff,umin_value=0x0,umax_value=0xffffffff7fffffff,var_off=(0x0; 0xffffffff7fffffff) old: s32_min_value=0x80000000,s32_max_value=0x00003030,u32_min_value=0x00000000,u32_max_value=0xffffffff cur: s32_min_value=0x00003031,s32_max_value=0x7fffffff,u32_min_value=0x00003031,u32_max_value=0x7fffffff The 64 bit bounds generally look okay and while the information that got propagated from 32 to 64 bit looks correct as well, it's not precise enough for judging a conditional jmp32. Given the latter only operates on subregisters we also need to take these into account as well for a range_within() probe in order to be able to prune paths. Extending the range_within() constraint to both bounds will be able to tell us that the old signed 32 bit bounds are not wider than the cur signed 32 bit bounds. With the fix in place, the program will now verify the 'goto' branch case as it should have been: [...] 6: R0_w=invP(id=0) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 6: (66) if w0 s> 0x3030 goto pc+0 R0_w=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 7: R0=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4=invP(id=0) R10=fp0 7: (d6) if w0 s<= 0x303030 goto pc+1 9: R0=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4=invP(id=0) R10=fp0 9: (95) exit 7: R0_w=invP(id=0,smax_value=9223372034707292159,umax_value=18446744071562067967,var_off=(0x0; 0xffffffff7fffffff),s32_min_value=12337,u32_min_value=12337,u32_max_value=2147483647) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 7: (d6) if w0 s<= 0x303030 goto pc+1 R0_w=invP(id=0,smax_value=9223372034707292159,umax_value=18446744071562067967,var_off=(0x0; 0xffffffff7fffffff),s32_min_value=3158065,u32_min_value=3158065,u32_max_value=2147483647) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 8: R0_w=invP(id=0,smax_value=9223372034707292159,umax_value=18446744071562067967,var_off=(0x0; 0xffffffff7fffffff),s32_min_value=3158065,u32_min_value=3158065,u32_max_value=2147483647) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 8: (30) r0 = *(u8 *)skb[808464432] BPF_LD_[ABS|IND] uses reserved fields processed 11 insns (limit 1000000) max_states_per_insn 1 total_states 1 peak_states 1 mark_read 1 The bug is quite subtle in the sense that when verifier would determine that a given branch is dead code, it would (here: wrongly) remove these instructions from the program and hard-wire the taken branch for privileged programs instead of the 'goto pc-1' rewrites which will cause hard to debug problems. Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") Reported-by: Anatoly Trosinenko Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 67ea0ac3333c6..24c0e9224f3c5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8590,7 +8590,11 @@ static bool range_within(struct bpf_reg_state *old, return old->umin_value <= cur->umin_value && old->umax_value >= cur->umax_value && old->smin_value <= cur->smin_value && - old->smax_value >= cur->smax_value; + old->smax_value >= cur->smax_value && + old->u32_min_value <= cur->u32_min_value && + old->u32_max_value >= cur->u32_max_value && + old->s32_min_value <= cur->s32_min_value && + old->s32_max_value >= cur->s32_max_value; } /* Maximum number of register states that can exist at once */ From e88b2c6e5a4d9ce30d75391e4d950da74bb2bd90 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Feb 2021 18:46:10 +0000 Subject: [PATCH 45/45] bpf: Fix 32 bit src register truncation on div/mod While reviewing a different fix, John and I noticed an oddity in one of the BPF program dumps that stood out, for example: # bpftool p d x i 13 0: (b7) r0 = 808464450 1: (b4) w4 = 808464432 2: (bc) w0 = w0 3: (15) if r0 == 0x0 goto pc+1 4: (9c) w4 %= w0 [...] In line 2 we noticed that the mov32 would 32 bit truncate the original src register for the div/mod operation. While for the two operations the dst register is typically marked unknown e.g. from adjust_scalar_min_max_vals() the src register is not, and thus verifier keeps tracking original bounds, simplified: 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 0: (b7) r0 = -1 1: R0_w=invP-1 R1=ctx(id=0,off=0,imm=0) R10=fp0 1: (b7) r1 = -1 2: R0_w=invP-1 R1_w=invP-1 R10=fp0 2: (3c) w0 /= w1 3: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1_w=invP-1 R10=fp0 3: (77) r1 >>= 32 4: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1_w=invP4294967295 R10=fp0 4: (bf) r0 = r1 5: R0_w=invP4294967295 R1_w=invP4294967295 R10=fp0 5: (95) exit processed 6 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 Runtime result of r0 at exit is 0 instead of expected -1. Remove the verifier mov32 src rewrite in div/mod and replace it with a jmp32 test instead. After the fix, we result in the following code generation when having dividend r1 and divisor r6: div, 64 bit: div, 32 bit: 0: (b7) r6 = 8 0: (b7) r6 = 8 1: (b7) r1 = 8 1: (b7) r1 = 8 2: (55) if r6 != 0x0 goto pc+2 2: (56) if w6 != 0x0 goto pc+2 3: (ac) w1 ^= w1 3: (ac) w1 ^= w1 4: (05) goto pc+1 4: (05) goto pc+1 5: (3f) r1 /= r6 5: (3c) w1 /= w6 6: (b7) r0 = 0 6: (b7) r0 = 0 7: (95) exit 7: (95) exit mod, 64 bit: mod, 32 bit: 0: (b7) r6 = 8 0: (b7) r6 = 8 1: (b7) r1 = 8 1: (b7) r1 = 8 2: (15) if r6 == 0x0 goto pc+1 2: (16) if w6 == 0x0 goto pc+1 3: (9f) r1 %= r6 3: (9c) w1 %= w6 4: (b7) r0 = 0 4: (b7) r0 = 0 5: (95) exit 5: (95) exit x86 in particular can throw a 'divide error' exception for div instruction not only for divisor being zero, but also for the case when the quotient is too large for the designated register. For the edx:eax and rdx:rax dividend pair it is not an issue in x86 BPF JIT since we always zero edx (rdx). Hence really the only protection needed is against divisor being zero. Fixes: 68fda450a7df ("bpf: fix 32-bit divide by zero") Co-developed-by: John Fastabend Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 24c0e9224f3c5..37581919e050c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11003,30 +11003,28 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code == (BPF_ALU | BPF_MOD | BPF_X) || insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; - struct bpf_insn mask_and_div[] = { - BPF_MOV32_REG(insn->src_reg, insn->src_reg), + bool isdiv = BPF_OP(insn->code) == BPF_DIV; + struct bpf_insn *patchlet; + struct bpf_insn chk_and_div[] = { /* Rx div 0 -> 0 */ - BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2), + BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JNE | BPF_K, insn->src_reg, + 0, 2, 0), BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg), BPF_JMP_IMM(BPF_JA, 0, 0, 1), *insn, }; - struct bpf_insn mask_and_mod[] = { - BPF_MOV32_REG(insn->src_reg, insn->src_reg), + struct bpf_insn chk_and_mod[] = { /* Rx mod 0 -> Rx */ - BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1), + BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JEQ | BPF_K, insn->src_reg, + 0, 1, 0), *insn, }; - struct bpf_insn *patchlet; - if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || - insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { - patchlet = mask_and_div + (is64 ? 1 : 0); - cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0); - } else { - patchlet = mask_and_mod + (is64 ? 1 : 0); - cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0); - } + patchlet = isdiv ? chk_and_div : chk_and_mod; + cnt = isdiv ? ARRAY_SIZE(chk_and_div) : + ARRAY_SIZE(chk_and_mod); new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); if (!new_prog)