From 548b0c5de7619ef53bbde5590700693f2f6d2a56 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 2 Feb 2025 17:04:13 +0100 Subject: [PATCH 01/33] batman-adv: Ignore own maximum aggregation size during RX An OGMv1 and OGMv2 packet receive processing were not only limited by the number of bytes in the received packet but also by the nodes maximum aggregation packet size limit. But this limit is relevant for TX and not for RX. It must not be enforced by batadv_(i)v_ogm_aggr_packet to avoid loss of information in case of a different limit for sender and receiver. This has a minor side effect for B.A.T.M.A.N. IV because the batadv_iv_ogm_aggr_packet is also used for the preprocessing for the TX. But since the aggregation code itself will not allow more than BATADV_MAX_AGGREGATION_BYTES bytes, this check was never triggering (in this context) prior of removing it. Cc: stable@vger.kernel.org Fixes: c6c8fea29769 ("net: Add batman-adv meshing protocol") Fixes: 9323158ef9f4 ("batman-adv: OGMv2 - implement originators logic") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_iv_ogm.c | 3 +-- net/batman-adv/bat_v_ogm.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 07ae5dd1f150b..b12645949ae5a 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -325,8 +325,7 @@ batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len, /* check if there is enough space for the optional TVLV */ next_buff_pos += ntohs(ogm_packet->tvlv_len); - return (next_buff_pos <= packet_len) && - (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES); + return next_buff_pos <= packet_len; } /* send a batman ogm to a given interface */ diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index e503ee0d896bd..8f89ffe6020ce 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -839,8 +839,7 @@ batadv_v_ogm_aggr_packet(int buff_pos, int packet_len, /* check if there is enough space for the optional TVLV */ next_buff_pos += ntohs(ogm2_packet->tvlv_len); - return (next_buff_pos <= packet_len) && - (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES); + return next_buff_pos <= packet_len; } /** From 5eddd76ec2fd1988f0a3450fde9730b10dd22992 Mon Sep 17 00:00:00 2001 From: Alexandre Cassen Date: Wed, 19 Feb 2025 12:20:37 +0200 Subject: [PATCH 02/33] xfrm: fix tunnel mode TX datapath in packet offload mode Packets that match the output xfrm policy are delivered to the netstack. In IPsec packet mode for tunnel mode, the HW is responsible for building the hard header and outer IP header. In such a situation, the inner header may refer to a network that is not directly reachable by the host, resulting in a failed neighbor resolution. The packet is then dropped. xfrm policy defines the netdevice to use for xmit so we can send packets directly to it. Makes direct xmit exclusive to tunnel mode, since some rules may apply in transport mode. Fixes: f8a70afafc17 ("xfrm: add TX datapath support for IPsec packet offload mode") Signed-off-by: Alexandre Cassen Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_output.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index f7abd42c077d5..e5b3e343a5ec1 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -612,6 +612,40 @@ int xfrm_output_resume(struct sock *sk, struct sk_buff *skb, int err) } EXPORT_SYMBOL_GPL(xfrm_output_resume); +static int xfrm_dev_direct_output(struct sock *sk, struct xfrm_state *x, + struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct net *net = xs_net(x); + int err; + + dst = skb_dst_pop(skb); + if (!dst) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); + kfree_skb(skb); + return -EHOSTUNREACH; + } + skb_dst_set(skb, dst); + nf_reset_ct(skb); + + err = skb_dst(skb)->ops->local_out(net, sk, skb); + if (unlikely(err != 1)) { + kfree_skb(skb); + return err; + } + + /* In transport mode, network destination is + * directly reachable, while in tunnel mode, + * inner packet network may not be. In packet + * offload type, HW is responsible for hard + * header packet mangling so directly xmit skb + * to netdevice. + */ + skb->dev = x->xso.dev; + __skb_push(skb, skb->dev->hard_header_len); + return dev_queue_xmit(skb); +} + static int xfrm_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { return xfrm_output_resume(sk, skb, 1); @@ -735,6 +769,13 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) return -EHOSTUNREACH; } + /* Exclusive direct xmit for tunnel mode, as + * some filtering or matching rules may apply + * in transport mode. + */ + if (x->props.mode == XFRM_MODE_TUNNEL) + return xfrm_dev_direct_output(sk, x, skb); + return xfrm_output_resume(sk, skb, 0); } From 0aae2867aa6067f73d066bc98385e23c8454a1d7 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Wed, 19 Feb 2025 12:52:48 +0200 Subject: [PATCH 03/33] xfrm_output: Force software GSO only in tunnel mode The cited commit fixed a software GSO bug with VXLAN + IPSec in tunnel mode. Unfortunately, it is slightly broader than necessary, as it also severely affects performance for Geneve + IPSec transport mode over a device capable of both HW GSO and IPSec crypto offload. In this case, xfrm_output unnecessarily triggers software GSO instead of letting the HW do it. In simple iperf3 tests over Geneve + IPSec transport mode over a back-2-back pair of NICs with MTU 1500, the performance was observed to be up to 6x worse when doing software GSO compared to leaving it to the hardware. This commit makes xfrm_output only trigger software GSO in crypto offload cases for already encapsulated packets in tunnel mode, as not doing so would then cause the inner tunnel skb->inner_networking_header to be overwritten and break software GSO for that packet later if the device turns out to not be capable of HW GSO. Taking a closer look at the conditions for the original bug, to better understand the reasons for this change: - vxlan_build_skb -> iptunnel_handle_offloads sets inner_protocol and inner network header. - then, udp_tunnel_xmit_skb -> ip_tunnel_xmit adds outer transport and network headers. - later in the xmit path, xfrm_output -> xfrm_outer_mode_output -> xfrm4_prepare_output -> xfrm4_tunnel_encap_add overwrites the inner network header with the one set in ip_tunnel_xmit before adding the second outer header. - __dev_queue_xmit -> validate_xmit_skb checks whether GSO segmentation needs to happen based on dev features. In the original bug, the hw couldn't segment the packets, so skb_gso_segment was invoked. - deep in the .gso_segment callback machinery, __skb_udp_tunnel_segment tries to use the wrong inner network header, expecting the one set in iptunnel_handle_offloads but getting the one set by xfrm instead. - a bit later, ipv6_gso_segment accesses the wrong memory based on that wrong inner network header. With the new change, the original bug (or similar ones) cannot happen again, as xfrm will now trigger software GSO before applying a tunnel. This concern doesn't exist in packet offload mode, when the HW adds encapsulation headers. For the non-offloaded packets (crypto in SW), software GSO is still done unconditionally in the else branch. Reviewed-by: Dragos Tatulea Reviewed-by: Yael Chemla Reviewed-by: Leon Romanovsky Fixes: a204aef9fd77 ("xfrm: call xfrm_output_gso when inner_protocol is set in xfrm_output") Signed-off-by: Cosmin Ratiu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index e5b3e343a5ec1..3cabc87978dd1 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -799,7 +799,7 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) skb->encapsulation = 1; if (skb_is_gso(skb)) { - if (skb->inner_protocol) + if (skb->inner_protocol && x->props.mode == XFRM_MODE_TUNNEL) return xfrm_output_gso(net, sk, skb); skb_shinfo(skb)->gso_type |= SKB_GSO_ESP; From 72d061ee630d0dbb45c2920d8d19b3861c413e54 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 10 Mar 2025 22:46:56 +0300 Subject: [PATCH 04/33] Bluetooth: Fix error code in chan_alloc_skb_cb() The chan_alloc_skb_cb() function is supposed to return error pointers on error. Returning NULL will lead to a NULL dereference. Fixes: 6b8d4a6a0314 ("Bluetooth: 6LoWPAN: Use connected oriented channel instead of fixed one") Signed-off-by: Dan Carpenter Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/6lowpan.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 50cfec8ccac4f..3c29778171c58 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -825,11 +825,16 @@ static struct sk_buff *chan_alloc_skb_cb(struct l2cap_chan *chan, unsigned long hdr_len, unsigned long len, int nb) { + struct sk_buff *skb; + /* Note that we must allocate using GFP_ATOMIC here as * this function is called originally from netdev hard xmit * function in atomic context. */ - return bt_skb_alloc(hdr_len + len, GFP_ATOMIC); + skb = bt_skb_alloc(hdr_len + len, GFP_ATOMIC); + if (!skb) + return ERR_PTR(-ENOMEM); + return skb; } static void chan_suspend_cb(struct l2cap_chan *chan) From f6685a96c8c8a07e260e39bac86d4163cfb38a4d Mon Sep 17 00:00:00 2001 From: Arkadiusz Bokowy Date: Wed, 12 Mar 2025 20:09:43 +0100 Subject: [PATCH 05/33] Bluetooth: hci_event: Fix connection regression between LE and non-LE adapters Due to a typo during defining HCI errors it is not possible to connect LE-capable device with BR/EDR only adapter. The connection is terminated by the LE adapter because the invalid LL params error code is treated as unsupported remote feature. Fixes: 79c0868ad65a ("Bluetooth: hci_event: Use HCI error defines instead of magic values") Signed-off-by: Arkadiusz Bokowy Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 0d51970d809fd..3ec915738112b 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -683,7 +683,7 @@ enum { #define HCI_ERROR_REMOTE_POWER_OFF 0x15 #define HCI_ERROR_LOCAL_HOST_TERM 0x16 #define HCI_ERROR_PAIRING_NOT_ALLOWED 0x18 -#define HCI_ERROR_UNSUPPORTED_REMOTE_FEATURE 0x1e +#define HCI_ERROR_UNSUPPORTED_REMOTE_FEATURE 0x1a #define HCI_ERROR_INVALID_LL_PARAMS 0x1e #define HCI_ERROR_UNSPECIFIED 0x1f #define HCI_ERROR_ADVERTISING_TIMEOUT 0x3c From 1d22a122ffb116c3cf78053e812b8b21f8852ee9 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 18 Feb 2025 23:32:28 +0900 Subject: [PATCH 06/33] can: ucan: fix out of bound read in strscpy() source Commit 7fdaf8966aae ("can: ucan: use strscpy() to instead of strncpy()") unintentionally introduced a one byte out of bound read on strscpy()'s source argument (which is kind of ironic knowing that strscpy() is meant to be a more secure alternative :)). Let's consider below buffers: dest[len + 1]; /* will be NUL terminated */ src[len]; /* may not be NUL terminated */ When doing: strncpy(dest, src, len); dest[len] = '\0'; strncpy() will read up to len bytes from src. On the other hand: strscpy(dest, src, len + 1); will read up to len + 1 bytes from src, that is to say, an out of bound read of one byte will occur on src if it is not NUL terminated. Note that the src[len] byte is never copied, but strscpy() still needs to read it to check whether a truncation occurred or not. This exact pattern happened in ucan. The root cause is that the source is not NUL terminated. Instead of doing a copy in a local buffer, directly NUL terminate it as soon as usb_control_msg() returns. With this, the local firmware_str[] variable can be removed. On top of this do a couple refactors: - ucan_ctl_payload->raw is only used for the firmware string, so rename it to ucan_ctl_payload->fw_str and change its type from u8 to char. - ucan_device_request_in() is only used to retrieve the firmware string, so rename it to ucan_get_fw_str() and refactor it to make it directly handle all the string termination logic. Reported-by: syzbot+d7d8c418e8317899e88c@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-can/67b323a4.050a0220.173698.002b.GAE@google.com/ Fixes: 7fdaf8966aae ("can: ucan: use strscpy() to instead of strncpy()") Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250218143515.627682-2-mailhol.vincent@wanadoo.fr Cc: stable@vger.kernel.org Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/ucan.c | 43 ++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/drivers/net/can/usb/ucan.c b/drivers/net/can/usb/ucan.c index 39a63b7313a46..07406daf7c88e 100644 --- a/drivers/net/can/usb/ucan.c +++ b/drivers/net/can/usb/ucan.c @@ -186,7 +186,7 @@ union ucan_ctl_payload { */ struct ucan_ctl_cmd_get_protocol_version cmd_get_protocol_version; - u8 raw[128]; + u8 fw_str[128]; } __packed; enum { @@ -424,18 +424,20 @@ static int ucan_ctrl_command_out(struct ucan_priv *up, UCAN_USB_CTL_PIPE_TIMEOUT); } -static int ucan_device_request_in(struct ucan_priv *up, - u8 cmd, u16 subcmd, u16 datalen) +static void ucan_get_fw_str(struct ucan_priv *up, char *fw_str, size_t size) { - return usb_control_msg(up->udev, - usb_rcvctrlpipe(up->udev, 0), - cmd, - USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, - subcmd, - 0, - up->ctl_msg_buffer, - datalen, - UCAN_USB_CTL_PIPE_TIMEOUT); + int ret; + + ret = usb_control_msg(up->udev, usb_rcvctrlpipe(up->udev, 0), + UCAN_DEVICE_GET_FW_STRING, + USB_DIR_IN | USB_TYPE_VENDOR | + USB_RECIP_DEVICE, + 0, 0, fw_str, size - 1, + UCAN_USB_CTL_PIPE_TIMEOUT); + if (ret > 0) + fw_str[ret] = '\0'; + else + strscpy(fw_str, "unknown", size); } /* Parse the device information structure reported by the device and @@ -1314,7 +1316,6 @@ static int ucan_probe(struct usb_interface *intf, u8 in_ep_addr; u8 out_ep_addr; union ucan_ctl_payload *ctl_msg_buffer; - char firmware_str[sizeof(union ucan_ctl_payload) + 1]; udev = interface_to_usbdev(intf); @@ -1527,17 +1528,6 @@ static int ucan_probe(struct usb_interface *intf, */ ucan_parse_device_info(up, &ctl_msg_buffer->cmd_get_device_info); - /* just print some device information - if available */ - ret = ucan_device_request_in(up, UCAN_DEVICE_GET_FW_STRING, 0, - sizeof(union ucan_ctl_payload)); - if (ret > 0) { - /* copy string while ensuring zero termination */ - strscpy(firmware_str, up->ctl_msg_buffer->raw, - sizeof(union ucan_ctl_payload) + 1); - } else { - strcpy(firmware_str, "unknown"); - } - /* device is compatible, reset it */ ret = ucan_ctrl_command_out(up, UCAN_COMMAND_RESET, 0, 0); if (ret < 0) @@ -1555,7 +1545,10 @@ static int ucan_probe(struct usb_interface *intf, /* initialisation complete, log device info */ netdev_info(up->netdev, "registered device\n"); - netdev_info(up->netdev, "firmware string: %s\n", firmware_str); + ucan_get_fw_str(up, up->ctl_msg_buffer->fw_str, + sizeof(up->ctl_msg_buffer->fw_str)); + netdev_info(up->netdev, "firmware string: %s\n", + up->ctl_msg_buffer->fw_str); /* success */ return 0; From 80b5f90158d1364cbd80ad82852a757fc0692bf2 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 10 Mar 2025 15:33:53 +0100 Subject: [PATCH 07/33] can: statistics: use atomic access in hot path In can_send() and can_receive() CAN messages and CAN filter matches are counted to be visible in the CAN procfs files. KCSAN detected a data race within can_send() when two CAN frames have been generated by a timer event writing to the same CAN netdevice at the same time. Use atomic operations to access the statistics in the hot path to fix the KCSAN complaint. Reported-by: syzbot+78ce4489b812515d5e4d@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67cd717d.050a0220.e1a89.0006.GAE@google.com Signed-off-by: Oliver Hartkopp Reviewed-by: Vincent Mailhol Link: https://patch.msgid.link/20250310143353.3242-1-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- net/can/af_can.c | 12 ++++++------ net/can/af_can.h | 12 ++++++------ net/can/proc.c | 46 +++++++++++++++++++++++++++------------------- 3 files changed, 39 insertions(+), 31 deletions(-) diff --git a/net/can/af_can.c b/net/can/af_can.c index 01f3fbb3b67dc..65230e81fa08c 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -287,8 +287,8 @@ int can_send(struct sk_buff *skb, int loop) netif_rx(newskb); /* update statistics */ - pkg_stats->tx_frames++; - pkg_stats->tx_frames_delta++; + atomic_long_inc(&pkg_stats->tx_frames); + atomic_long_inc(&pkg_stats->tx_frames_delta); return 0; @@ -647,8 +647,8 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev) int matches; /* update statistics */ - pkg_stats->rx_frames++; - pkg_stats->rx_frames_delta++; + atomic_long_inc(&pkg_stats->rx_frames); + atomic_long_inc(&pkg_stats->rx_frames_delta); /* create non-zero unique skb identifier together with *skb */ while (!(can_skb_prv(skb)->skbcnt)) @@ -669,8 +669,8 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev) consume_skb(skb); if (matches > 0) { - pkg_stats->matches++; - pkg_stats->matches_delta++; + atomic_long_inc(&pkg_stats->matches); + atomic_long_inc(&pkg_stats->matches_delta); } } diff --git a/net/can/af_can.h b/net/can/af_can.h index 7c2d9161e2245..22f3352c77fec 100644 --- a/net/can/af_can.h +++ b/net/can/af_can.h @@ -66,9 +66,9 @@ struct receiver { struct can_pkg_stats { unsigned long jiffies_init; - unsigned long rx_frames; - unsigned long tx_frames; - unsigned long matches; + atomic_long_t rx_frames; + atomic_long_t tx_frames; + atomic_long_t matches; unsigned long total_rx_rate; unsigned long total_tx_rate; @@ -82,9 +82,9 @@ struct can_pkg_stats { unsigned long max_tx_rate; unsigned long max_rx_match_ratio; - unsigned long rx_frames_delta; - unsigned long tx_frames_delta; - unsigned long matches_delta; + atomic_long_t rx_frames_delta; + atomic_long_t tx_frames_delta; + atomic_long_t matches_delta; }; /* persistent statistics */ diff --git a/net/can/proc.c b/net/can/proc.c index bbce97825f13f..25fdf060e30d0 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -118,6 +118,13 @@ void can_stat_update(struct timer_list *t) struct can_pkg_stats *pkg_stats = net->can.pkg_stats; unsigned long j = jiffies; /* snapshot */ + long rx_frames = atomic_long_read(&pkg_stats->rx_frames); + long tx_frames = atomic_long_read(&pkg_stats->tx_frames); + long matches = atomic_long_read(&pkg_stats->matches); + long rx_frames_delta = atomic_long_read(&pkg_stats->rx_frames_delta); + long tx_frames_delta = atomic_long_read(&pkg_stats->tx_frames_delta); + long matches_delta = atomic_long_read(&pkg_stats->matches_delta); + /* restart counting in timer context on user request */ if (user_reset) can_init_stats(net); @@ -127,35 +134,33 @@ void can_stat_update(struct timer_list *t) can_init_stats(net); /* prevent overflow in calc_rate() */ - if (pkg_stats->rx_frames > (ULONG_MAX / HZ)) + if (rx_frames > (LONG_MAX / HZ)) can_init_stats(net); /* prevent overflow in calc_rate() */ - if (pkg_stats->tx_frames > (ULONG_MAX / HZ)) + if (tx_frames > (LONG_MAX / HZ)) can_init_stats(net); /* matches overflow - very improbable */ - if (pkg_stats->matches > (ULONG_MAX / 100)) + if (matches > (LONG_MAX / 100)) can_init_stats(net); /* calc total values */ - if (pkg_stats->rx_frames) - pkg_stats->total_rx_match_ratio = (pkg_stats->matches * 100) / - pkg_stats->rx_frames; + if (rx_frames) + pkg_stats->total_rx_match_ratio = (matches * 100) / rx_frames; pkg_stats->total_tx_rate = calc_rate(pkg_stats->jiffies_init, j, - pkg_stats->tx_frames); + tx_frames); pkg_stats->total_rx_rate = calc_rate(pkg_stats->jiffies_init, j, - pkg_stats->rx_frames); + rx_frames); /* calc current values */ - if (pkg_stats->rx_frames_delta) + if (rx_frames_delta) pkg_stats->current_rx_match_ratio = - (pkg_stats->matches_delta * 100) / - pkg_stats->rx_frames_delta; + (matches_delta * 100) / rx_frames_delta; - pkg_stats->current_tx_rate = calc_rate(0, HZ, pkg_stats->tx_frames_delta); - pkg_stats->current_rx_rate = calc_rate(0, HZ, pkg_stats->rx_frames_delta); + pkg_stats->current_tx_rate = calc_rate(0, HZ, tx_frames_delta); + pkg_stats->current_rx_rate = calc_rate(0, HZ, rx_frames_delta); /* check / update maximum values */ if (pkg_stats->max_tx_rate < pkg_stats->current_tx_rate) @@ -168,9 +173,9 @@ void can_stat_update(struct timer_list *t) pkg_stats->max_rx_match_ratio = pkg_stats->current_rx_match_ratio; /* clear values for 'current rate' calculation */ - pkg_stats->tx_frames_delta = 0; - pkg_stats->rx_frames_delta = 0; - pkg_stats->matches_delta = 0; + atomic_long_set(&pkg_stats->tx_frames_delta, 0); + atomic_long_set(&pkg_stats->rx_frames_delta, 0); + atomic_long_set(&pkg_stats->matches_delta, 0); /* restart timer (one second) */ mod_timer(&net->can.stattimer, round_jiffies(jiffies + HZ)); @@ -214,9 +219,12 @@ static int can_stats_proc_show(struct seq_file *m, void *v) struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; seq_putc(m, '\n'); - seq_printf(m, " %8ld transmitted frames (TXF)\n", pkg_stats->tx_frames); - seq_printf(m, " %8ld received frames (RXF)\n", pkg_stats->rx_frames); - seq_printf(m, " %8ld matched frames (RXMF)\n", pkg_stats->matches); + seq_printf(m, " %8ld transmitted frames (TXF)\n", + atomic_long_read(&pkg_stats->tx_frames)); + seq_printf(m, " %8ld received frames (RXF)\n", + atomic_long_read(&pkg_stats->rx_frames)); + seq_printf(m, " %8ld matched frames (RXMF)\n", + atomic_long_read(&pkg_stats->matches)); seq_putc(m, '\n'); From 51f6fc9eb1d77ae5cacc796fc043dedc1f0f0073 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Fri, 7 Mar 2025 17:03:26 +0000 Subject: [PATCH 08/33] dt-bindings: can: renesas,rcar-canfd: Fix typo in pattern properties for R-Car V4M The Renesas R-Car V4M(R8A779H0) SoC, supports up to four channels. Fix the typo 5->4 in pattern properties. Fixes: ced52c6ed257 ("dt-bindings: can: renesas,rcar-canfd: Document R-Car V4M support") Cc: stable@vger.kernel.org Reviewed-by: Geert Uytterhoeven Acked-by: "Rob Herring (Arm)" Signed-off-by: Biju Das Link: https://patch.msgid.link/20250307170330.173425-2-biju.das.jz@bp.renesas.com Signed-off-by: Marc Kleine-Budde --- .../devicetree/bindings/net/can/renesas,rcar-canfd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/can/renesas,rcar-canfd.yaml b/Documentation/devicetree/bindings/net/can/renesas,rcar-canfd.yaml index 7c5ac5d2e880b..f6884f6e59e74 100644 --- a/Documentation/devicetree/bindings/net/can/renesas,rcar-canfd.yaml +++ b/Documentation/devicetree/bindings/net/can/renesas,rcar-canfd.yaml @@ -170,7 +170,7 @@ allOf: const: renesas,r8a779h0-canfd then: patternProperties: - "^channel[5-7]$": false + "^channel[4-7]$": false else: if: not: From 1dba0a37644ed3022558165bbb5cb9bda540eaf7 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Fri, 7 Mar 2025 17:03:27 +0000 Subject: [PATCH 09/33] can: rcar_canfd: Fix page entries in the AFL list There are a total of 96 AFL pages and each page has 16 entries with registers CFDGAFLIDr, CFDGAFLMr, CFDGAFLP0r, CFDGAFLP1r holding the rule entries (r = 0..15). Currently, RCANFD_GAFL* macros use a start variable to find AFL entries, which is incorrect as the testing on RZ/G3E shows ch1 and ch4 gets a start value of 0 and the register contents are overwritten. Fix this issue by using rule_entry corresponding to the channel to find the page entries in the AFL list. Fixes: dd3bd23eb438 ("can: rcar_canfd: Add Renesas R-Car CAN FD driver") Cc: stable@vger.kernel.org Signed-off-by: Biju Das Tested-by: Geert Uytterhoeven Link: https://patch.msgid.link/20250307170330.173425-3-biju.das.jz@bp.renesas.com Signed-off-by: Marc Kleine-Budde --- drivers/net/can/rcar/rcar_canfd.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c index df1a5d0b37b22..aa3df0d05b853 100644 --- a/drivers/net/can/rcar/rcar_canfd.c +++ b/drivers/net/can/rcar/rcar_canfd.c @@ -787,22 +787,14 @@ static void rcar_canfd_configure_controller(struct rcar_canfd_global *gpriv) } static void rcar_canfd_configure_afl_rules(struct rcar_canfd_global *gpriv, - u32 ch) + u32 ch, u32 rule_entry) { - u32 cfg; - int offset, start, page, num_rules = RCANFD_CHANNEL_NUMRULES; + int offset, page, num_rules = RCANFD_CHANNEL_NUMRULES; + u32 rule_entry_index = rule_entry % 16; u32 ridx = ch + RCANFD_RFFIFO_IDX; - if (ch == 0) { - start = 0; /* Channel 0 always starts from 0th rule */ - } else { - /* Get number of Channel 0 rules and adjust */ - cfg = rcar_canfd_read(gpriv->base, RCANFD_GAFLCFG(ch)); - start = RCANFD_GAFLCFG_GETRNC(gpriv, 0, cfg); - } - /* Enable write access to entry */ - page = RCANFD_GAFL_PAGENUM(start); + page = RCANFD_GAFL_PAGENUM(rule_entry); rcar_canfd_set_bit(gpriv->base, RCANFD_GAFLECTR, (RCANFD_GAFLECTR_AFLPN(gpriv, page) | RCANFD_GAFLECTR_AFLDAE)); @@ -818,13 +810,13 @@ static void rcar_canfd_configure_afl_rules(struct rcar_canfd_global *gpriv, offset = RCANFD_C_GAFL_OFFSET; /* Accept all IDs */ - rcar_canfd_write(gpriv->base, RCANFD_GAFLID(offset, start), 0); + rcar_canfd_write(gpriv->base, RCANFD_GAFLID(offset, rule_entry_index), 0); /* IDE or RTR is not considered for matching */ - rcar_canfd_write(gpriv->base, RCANFD_GAFLM(offset, start), 0); + rcar_canfd_write(gpriv->base, RCANFD_GAFLM(offset, rule_entry_index), 0); /* Any data length accepted */ - rcar_canfd_write(gpriv->base, RCANFD_GAFLP0(offset, start), 0); + rcar_canfd_write(gpriv->base, RCANFD_GAFLP0(offset, rule_entry_index), 0); /* Place the msg in corresponding Rx FIFO entry */ - rcar_canfd_set_bit(gpriv->base, RCANFD_GAFLP1(offset, start), + rcar_canfd_set_bit(gpriv->base, RCANFD_GAFLP1(offset, rule_entry_index), RCANFD_GAFLP1_GAFLFDP(ridx)); /* Disable write access to page */ @@ -1851,6 +1843,7 @@ static int rcar_canfd_probe(struct platform_device *pdev) unsigned long channels_mask = 0; int err, ch_irq, g_irq; int g_err_irq, g_recc_irq; + u32 rule_entry = 0; bool fdmode = true; /* CAN FD only mode - default */ char name[9] = "channelX"; int i; @@ -2023,7 +2016,8 @@ static int rcar_canfd_probe(struct platform_device *pdev) rcar_canfd_configure_tx(gpriv, ch); /* Configure receive rules */ - rcar_canfd_configure_afl_rules(gpriv, ch); + rcar_canfd_configure_afl_rules(gpriv, ch, rule_entry); + rule_entry += RCANFD_CHANNEL_NUMRULES; } /* Configure common interrupts */ From fd99d6ed20234b83d65b9c5417794343577cf3e5 Mon Sep 17 00:00:00 2001 From: Haibo Chen Date: Fri, 14 Mar 2025 19:01:44 +0800 Subject: [PATCH 10/33] can: flexcan: only change CAN state when link up in system PM After a suspend/resume cycle on a down interface, it will come up as ERROR-ACTIVE. $ ip -details -s -s a s dev flexcan0 3: flexcan0: mtu 16 qdisc pfifo_fast state DOWN group default qlen 10 link/can promiscuity 0 allmulti 0 minmtu 0 maxmtu 0 can state STOPPED (berr-counter tx 0 rx 0) restart-ms 1000 $ sudo systemctl suspend $ ip -details -s -s a s dev flexcan0 3: flexcan0: mtu 16 qdisc pfifo_fast state DOWN group default qlen 10 link/can promiscuity 0 allmulti 0 minmtu 0 maxmtu 0 can state ERROR-ACTIVE (berr-counter tx 0 rx 0) restart-ms 1000 And only set CAN state to CAN_STATE_ERROR_ACTIVE when resume process has no issue, otherwise keep in CAN_STATE_SLEEPING as suspend did. Fixes: 4de349e786a3 ("can: flexcan: fix resume function") Cc: stable@vger.kernel.org Signed-off-by: Haibo Chen Link: https://patch.msgid.link/20250314110145.899179-1-haibo.chen@nxp.com Reported-by: Marc Kleine-Budde Closes: https://lore.kernel.org/all/20250314-married-polar-elephant-b15594-mkl@pengutronix.de [mkl: add newlines] Signed-off-by: Marc Kleine-Budde --- drivers/net/can/flexcan/flexcan-core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/can/flexcan/flexcan-core.c b/drivers/net/can/flexcan/flexcan-core.c index ac1a860986df6..3a71fd2357222 100644 --- a/drivers/net/can/flexcan/flexcan-core.c +++ b/drivers/net/can/flexcan/flexcan-core.c @@ -2266,8 +2266,9 @@ static int __maybe_unused flexcan_suspend(struct device *device) } netif_stop_queue(dev); netif_device_detach(dev); + + priv->can.state = CAN_STATE_SLEEPING; } - priv->can.state = CAN_STATE_SLEEPING; return 0; } @@ -2278,7 +2279,6 @@ static int __maybe_unused flexcan_resume(struct device *device) struct flexcan_priv *priv = netdev_priv(dev); int err; - priv->can.state = CAN_STATE_ERROR_ACTIVE; if (netif_running(dev)) { netif_device_attach(dev); netif_start_queue(dev); @@ -2298,6 +2298,8 @@ static int __maybe_unused flexcan_resume(struct device *device) flexcan_chip_interrupts_enable(dev); } + + priv->can.state = CAN_STATE_ERROR_ACTIVE; } return 0; From 5a19143124be42900b3fbc9ada3c919632eb45eb Mon Sep 17 00:00:00 2001 From: Haibo Chen Date: Fri, 14 Mar 2025 19:01:45 +0800 Subject: [PATCH 11/33] can: flexcan: disable transceiver during system PM During system PM, if no wakeup requirement, disable transceiver to save power. Fixes: 4de349e786a3 ("can: flexcan: fix resume function") Cc: stable@vger.kernel.org Reviewed-by: Frank Li Signed-off-by: Haibo Chen Link: https://patch.msgid.link/20250314110145.899179-2-haibo.chen@nxp.com [mkl: add newlines] Signed-off-by: Marc Kleine-Budde --- drivers/net/can/flexcan/flexcan-core.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/flexcan/flexcan-core.c b/drivers/net/can/flexcan/flexcan-core.c index 3a71fd2357222..b080740bcb104 100644 --- a/drivers/net/can/flexcan/flexcan-core.c +++ b/drivers/net/can/flexcan/flexcan-core.c @@ -2260,6 +2260,10 @@ static int __maybe_unused flexcan_suspend(struct device *device) flexcan_chip_interrupts_disable(dev); + err = flexcan_transceiver_disable(priv); + if (err) + return err; + err = pinctrl_pm_select_sleep_state(device); if (err) return err; @@ -2292,10 +2296,16 @@ static int __maybe_unused flexcan_resume(struct device *device) if (err) return err; - err = flexcan_chip_start(dev); + err = flexcan_transceiver_enable(priv); if (err) return err; + err = flexcan_chip_start(dev); + if (err) { + flexcan_transceiver_disable(priv); + return err; + } + flexcan_chip_interrupts_enable(dev); } From 5f079290e5913a0060e059500b7d440990ac1066 Mon Sep 17 00:00:00 2001 From: Vignesh Raghavendra Date: Tue, 11 Mar 2025 21:12:59 +0530 Subject: [PATCH 12/33] net: ethernet: ti: am65-cpsw: Fix NAPI registration sequence Registering the interrupts for TX or RX DMA Channels prior to registering their respective NAPI callbacks can result in a NULL pointer dereference. This is seen in practice as a random occurrence since it depends on the randomness associated with the generation of traffic by Linux and the reception of traffic from the wire. Fixes: 681eb2beb3ef ("net: ethernet: ti: am65-cpsw: ensure proper channel cleanup in error path") Signed-off-by: Vignesh Raghavendra Co-developed-by: Siddharth Vadapalli Signed-off-by: Siddharth Vadapalli Reviewed-by: Alexander Sverdlin Reviewed-by: Roger Quadros Link: https://patch.msgid.link/20250311154259.102865-1-s-vadapalli@ti.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 32 +++++++++++++----------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 2806238629f82..bef734c6e5c2b 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -2306,14 +2306,18 @@ static void am65_cpsw_nuss_remove_tx_chns(struct am65_cpsw_common *common) static int am65_cpsw_nuss_ndev_add_tx_napi(struct am65_cpsw_common *common) { struct device *dev = common->dev; + struct am65_cpsw_tx_chn *tx_chn; int i, ret = 0; for (i = 0; i < common->tx_ch_num; i++) { - struct am65_cpsw_tx_chn *tx_chn = &common->tx_chns[i]; + tx_chn = &common->tx_chns[i]; hrtimer_init(&tx_chn->tx_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); tx_chn->tx_hrtimer.function = &am65_cpsw_nuss_tx_timer_callback; + netif_napi_add_tx(common->dma_ndev, &tx_chn->napi_tx, + am65_cpsw_nuss_tx_poll); + ret = devm_request_irq(dev, tx_chn->irq, am65_cpsw_nuss_tx_irq, IRQF_TRIGGER_HIGH, @@ -2323,19 +2327,16 @@ static int am65_cpsw_nuss_ndev_add_tx_napi(struct am65_cpsw_common *common) tx_chn->id, tx_chn->irq, ret); goto err; } - - netif_napi_add_tx(common->dma_ndev, &tx_chn->napi_tx, - am65_cpsw_nuss_tx_poll); } return 0; err: - for (--i ; i >= 0 ; i--) { - struct am65_cpsw_tx_chn *tx_chn = &common->tx_chns[i]; - - netif_napi_del(&tx_chn->napi_tx); + netif_napi_del(&tx_chn->napi_tx); + for (--i; i >= 0; i--) { + tx_chn = &common->tx_chns[i]; devm_free_irq(dev, tx_chn->irq, tx_chn); + netif_napi_del(&tx_chn->napi_tx); } return ret; @@ -2569,6 +2570,9 @@ static int am65_cpsw_nuss_init_rx_chns(struct am65_cpsw_common *common) HRTIMER_MODE_REL_PINNED); flow->rx_hrtimer.function = &am65_cpsw_nuss_rx_timer_callback; + netif_napi_add(common->dma_ndev, &flow->napi_rx, + am65_cpsw_nuss_rx_poll); + ret = devm_request_irq(dev, flow->irq, am65_cpsw_nuss_rx_irq, IRQF_TRIGGER_HIGH, @@ -2577,11 +2581,8 @@ static int am65_cpsw_nuss_init_rx_chns(struct am65_cpsw_common *common) dev_err(dev, "failure requesting rx %d irq %u, %d\n", i, flow->irq, ret); flow->irq = -EINVAL; - goto err_flow; + goto err_request_irq; } - - netif_napi_add(common->dma_ndev, &flow->napi_rx, - am65_cpsw_nuss_rx_poll); } /* setup classifier to route priorities to flows */ @@ -2589,11 +2590,14 @@ static int am65_cpsw_nuss_init_rx_chns(struct am65_cpsw_common *common) return 0; +err_request_irq: + netif_napi_del(&flow->napi_rx); + err_flow: - for (--i; i >= 0 ; i--) { + for (--i; i >= 0; i--) { flow = &rx_chn->flows[i]; - netif_napi_del(&flow->napi_rx); devm_free_irq(dev, flow->irq, flow); + netif_napi_del(&flow->napi_rx); } err: From 2fc8a346625eb1abfe202062c7e6a13d76cde5ea Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Tue, 11 Mar 2025 13:12:54 -0700 Subject: [PATCH 13/33] net: mana: Support holes in device list reply msg According to GDMA protocol, holes (zeros) are allowed at the beginning or middle of the gdma_list_devices_resp message. The existing code cannot properly handle this, and may miss some devices in the list. To fix, scan the entire list until the num_of_devs are found, or until the end of the list. Cc: stable@vger.kernel.org Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)") Signed-off-by: Haiyang Zhang Reviewed-by: Long Li Reviewed-by: Shradha Gupta Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/1741723974-1534-1-git-send-email-haiyangz@microsoft.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microsoft/mana/gdma_main.c | 14 ++++++++++---- include/net/mana/gdma.h | 11 +++++++---- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 11457b6296cc0..638ef64d639f3 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -134,9 +134,10 @@ static int mana_gd_detect_devices(struct pci_dev *pdev) struct gdma_list_devices_resp resp = {}; struct gdma_general_req req = {}; struct gdma_dev_id dev; - u32 i, max_num_devs; + int found_dev = 0; u16 dev_type; int err; + u32 i; mana_gd_init_req_hdr(&req.hdr, GDMA_LIST_DEVICES, sizeof(req), sizeof(resp)); @@ -148,12 +149,17 @@ static int mana_gd_detect_devices(struct pci_dev *pdev) return err ? err : -EPROTO; } - max_num_devs = min_t(u32, MAX_NUM_GDMA_DEVICES, resp.num_of_devs); - - for (i = 0; i < max_num_devs; i++) { + for (i = 0; i < GDMA_DEV_LIST_SIZE && + found_dev < resp.num_of_devs; i++) { dev = resp.devs[i]; dev_type = dev.type; + /* Skip empty devices */ + if (dev.as_uint32 == 0) + continue; + + found_dev++; + /* HWC is already detected in mana_hwc_create_channel(). */ if (dev_type == GDMA_DEVICE_HWC) continue; diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 90f56656b5724..62e9d76738628 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -408,8 +408,6 @@ struct gdma_context { struct gdma_dev mana_ib; }; -#define MAX_NUM_GDMA_DEVICES 4 - static inline bool mana_gd_is_mana(struct gdma_dev *gd) { return gd->dev_id.type == GDMA_DEVICE_MANA; @@ -556,11 +554,15 @@ enum { #define GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG BIT(3) #define GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT BIT(5) +/* Driver can handle holes (zeros) in the device list */ +#define GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11) + #define GDMA_DRV_CAP_FLAGS1 \ (GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \ GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \ GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG | \ - GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT) + GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \ + GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP) #define GDMA_DRV_CAP_FLAGS2 0 @@ -621,11 +623,12 @@ struct gdma_query_max_resources_resp { }; /* HW DATA */ /* GDMA_LIST_DEVICES */ +#define GDMA_DEV_LIST_SIZE 64 struct gdma_list_devices_resp { struct gdma_resp_hdr hdr; u32 num_of_devs; u32 reserved; - struct gdma_dev_id devs[64]; + struct gdma_dev_id devs[GDMA_DEV_LIST_SIZE]; }; /* HW DATA */ /* GDMA_REGISTER_DEVICE */ From daa624d3c2ddffdcbad140a9625a4064371db44f Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 11 Mar 2025 22:25:30 +0100 Subject: [PATCH 14/33] net: ipv6: fix TCP GSO segmentation with NAT When updating the source/destination address, the TCP/UDP checksum needs to be updated as well. Fixes: bee88cd5bd83 ("net: add support for segmenting TCP fraglist GSO packets") Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/20250311212530.91519-1-nbd@nbd.name Signed-off-by: Paolo Abeni --- net/ipv6/tcpv6_offload.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index a45bf17cb2a17..ae2da28f9dfb1 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -94,14 +94,23 @@ INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff) } static void __tcpv6_gso_segment_csum(struct sk_buff *seg, + struct in6_addr *oldip, + const struct in6_addr *newip, __be16 *oldport, __be16 newport) { - struct tcphdr *th; + struct tcphdr *th = tcp_hdr(seg); + + if (!ipv6_addr_equal(oldip, newip)) { + inet_proto_csum_replace16(&th->check, seg, + oldip->s6_addr32, + newip->s6_addr32, + true); + *oldip = *newip; + } if (*oldport == newport) return; - th = tcp_hdr(seg); inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false); *oldport = newport; } @@ -129,10 +138,10 @@ static struct sk_buff *__tcpv6_gso_segment_list_csum(struct sk_buff *segs) th2 = tcp_hdr(seg); iph2 = ipv6_hdr(seg); - iph2->saddr = iph->saddr; - iph2->daddr = iph->daddr; - __tcpv6_gso_segment_csum(seg, &th2->source, th->source); - __tcpv6_gso_segment_csum(seg, &th2->dest, th->dest); + __tcpv6_gso_segment_csum(seg, &iph2->saddr, &iph->saddr, + &th2->source, th->source); + __tcpv6_gso_segment_csum(seg, &iph2->daddr, &iph->daddr, + &th2->dest, th->dest); } return segs; From 9740890ee20e01f99ff1dde84c63dcf089fabb98 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 11 Mar 2025 18:03:25 -0700 Subject: [PATCH 15/33] ipv6: Fix memleak of nhc_pcpu_rth_output in fib_check_nh_v6_gw(). fib_check_nh_v6_gw() expects that fib6_nh_init() cleans up everything when it fails. Commit 7dd73168e273 ("ipv6: Always allocate pcpu memory in a fib6_nh") moved fib_nh_common_init() before alloc_percpu_gfp() within fib6_nh_init() but forgot to add cleanup for fib6_nh->nh_common.nhc_pcpu_rth_output in case it fails to allocate fib6_nh->rt6i_pcpu, resulting in memleak. Let's call fib_nh_common_release() and clear nhc_pcpu_rth_output in the error path. Note that we can remove the fib6_nh_release() call in nh_create_ipv6() later in net-next.git. Fixes: 7dd73168e273 ("ipv6: Always allocate pcpu memory in a fib6_nh") Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250312010333.56001-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/ipv6/route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ef2d23a1e3d53..bc6bcf5d7133e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3644,7 +3644,8 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, in6_dev_put(idev); if (err) { - lwtstate_put(fib6_nh->fib_nh_lws); + fib_nh_common_release(&fib6_nh->nh_common); + fib6_nh->nh_common.nhc_pcpu_rth_output = NULL; fib6_nh->fib_nh_lws = NULL; netdev_put(dev, dev_tracker); } From 9a81fc3480bf5dbe2bf80e278c440770f6ba2692 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 11 Mar 2025 18:38:48 -0700 Subject: [PATCH 16/33] ipv6: Set errno after ip_fib_metrics_init() in ip6_route_info_create(). While creating a new IPv6, we could get a weird -ENOMEM when RTA_NH_ID is set and either of the conditions below is true: 1) CONFIG_IPV6_SUBTREES is enabled and rtm_src_len is specified 2) nexthop_get() fails e.g.) # strace ip -6 route add fe80::dead:beef:dead:beef nhid 1 from :: recvmsg(3, {msg_iov=[{iov_base=[...[ {error=-ENOMEM, msg=[... [...]]}, [{nla_len=49, nla_type=NLMSGERR_ATTR_MSG}, "Nexthops can not be used with so"...] ]], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 148 Let's set err explicitly after ip_fib_metrics_init() in ip6_route_info_create(). Fixes: f88d8ea67fbd ("ipv6: Plumb support for nexthop object in a fib6_info") Signed-off-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20250312013854.61125-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/ipv6/route.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index bc6bcf5d7133e..15ce21afc8c62 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3803,10 +3803,12 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (nh) { if (rt->fib6_src.plen) { NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); + err = -EINVAL; goto out_free; } if (!nexthop_get(nh)) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + err = -ENOENT; goto out_free; } rt->nh = nh; From f3b97b7d4bf316c3991e5634c9f4847c2df35478 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Wed, 12 Mar 2025 10:52:49 +0100 Subject: [PATCH 17/33] devlink: fix xa_alloc_cyclic() error handling In case of returning 1 from xa_alloc_cyclic() (wrapping) ERR_PTR(1) will be returned, which will cause IS_ERR() to be false. Which can lead to dereference not allocated pointer (rel). Fix it by checking if err is lower than zero. This wasn't found in real usecase, only noticed. Credit to Pierre. Fixes: c137743bce02 ("devlink: introduce object and nested devlink relationship infra") Signed-off-by: Michal Swiatkowski Reviewed-by: Andrew Lunn Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/devlink/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/devlink/core.c b/net/devlink/core.c index f49cd83f1955f..7203c39532fcc 100644 --- a/net/devlink/core.c +++ b/net/devlink/core.c @@ -117,7 +117,7 @@ static struct devlink_rel *devlink_rel_alloc(void) err = xa_alloc_cyclic(&devlink_rels, &rel->index, rel, xa_limit_32b, &next, GFP_KERNEL); - if (err) { + if (err < 0) { kfree(rel); return ERR_PTR(err); } From 3614bf90130d60f191a5fe218d04f6251c678e13 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Wed, 12 Mar 2025 10:52:50 +0100 Subject: [PATCH 18/33] dpll: fix xa_alloc_cyclic() error handling In case of returning 1 from xa_alloc_cyclic() (wrapping) ERR_PTR(1) will be returned, which will cause IS_ERR() to be false. Which can lead to dereference not allocated pointer (pin). Fix it by checking if err is lower than zero. This wasn't found in real usecase, only noticed. Credit to Pierre. Fixes: 97f265ef7f5b ("dpll: allocate pin ids in cycle") Signed-off-by: Michal Swiatkowski Reviewed-by: Vadim Fedorenko Reviewed-by: Arkadiusz Kubalewski Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/dpll/dpll_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c index 32019dc33cca7..1877201d1aa9f 100644 --- a/drivers/dpll/dpll_core.c +++ b/drivers/dpll/dpll_core.c @@ -505,7 +505,7 @@ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module, xa_init_flags(&pin->parent_refs, XA_FLAGS_ALLOC); ret = xa_alloc_cyclic(&dpll_pin_xa, &pin->id, pin, xa_limit_32b, &dpll_pin_xa_id, GFP_KERNEL); - if (ret) + if (ret < 0) goto err_xa_alloc; return pin; err_xa_alloc: From 3178d2b048365fe2c078cd53f85f2abf1487733b Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Wed, 12 Mar 2025 10:52:51 +0100 Subject: [PATCH 19/33] phy: fix xa_alloc_cyclic() error handling xa_alloc_cyclic() can return 1, which isn't an error. To prevent situation when the caller of this function will treat it as no error do a check only for negative here. Fixes: 384968786909 ("net: phy: Introduce ethernet link topology representation") Signed-off-by: Michal Swiatkowski Reviewed-by: Maxime Chevallier Signed-off-by: David S. Miller --- drivers/net/phy/phy_link_topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/phy_link_topology.c b/drivers/net/phy/phy_link_topology.c index 4a5d73002a1a8..0e9e987f37dd8 100644 --- a/drivers/net/phy/phy_link_topology.c +++ b/drivers/net/phy/phy_link_topology.c @@ -73,7 +73,7 @@ int phy_link_topo_add_phy(struct net_device *dev, xa_limit_32b, &topo->next_phy_index, GFP_KERNEL); - if (ret) + if (ret < 0) goto err; return 0; From acf10a8c0b3a36d5ff35f91585e9755ea0b62ac3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 12 Mar 2025 14:10:40 +0100 Subject: [PATCH 20/33] selftests: drv-net: use defer in the ping test Make sure the test cleans up after itself. The XDP off statements at the end of the test may not be reached. Fixes: 75cc19c8ff89 ("selftests: drv-net: add xdp cases for ping.py") Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250312131040.660386-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/testing/selftests/drivers/net/ping.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/drivers/net/ping.py b/tools/testing/selftests/drivers/net/ping.py index 93f4b411b378f..fc69bfcc37c46 100755 --- a/tools/testing/selftests/drivers/net/ping.py +++ b/tools/testing/selftests/drivers/net/ping.py @@ -7,7 +7,7 @@ from lib.py import ksft_eq, KsftSkipEx, KsftFailEx from lib.py import EthtoolFamily, NetDrvEpEnv from lib.py import bkg, cmd, wait_port_listen, rand_port -from lib.py import ethtool, ip +from lib.py import defer, ethtool, ip remote_ifname="" no_sleep=False @@ -60,6 +60,7 @@ def _set_xdp_generic_sb_on(cfg) -> None: prog = test_dir + "/../../net/lib/xdp_dummy.bpf.o" cmd(f"ip link set dev {remote_ifname} mtu 1500", shell=True, host=cfg.remote) cmd(f"ip link set dev {cfg.ifname} mtu 1500 xdpgeneric obj {prog} sec xdp", shell=True) + defer(cmd, f"ip link set dev {cfg.ifname} xdpgeneric off") if no_sleep != True: time.sleep(10) @@ -68,7 +69,9 @@ def _set_xdp_generic_mb_on(cfg) -> None: test_dir = os.path.dirname(os.path.realpath(__file__)) prog = test_dir + "/../../net/lib/xdp_dummy.bpf.o" cmd(f"ip link set dev {remote_ifname} mtu 9000", shell=True, host=cfg.remote) + defer(ip, f"link set dev {remote_ifname} mtu 1500", host=cfg.remote) ip("link set dev %s mtu 9000 xdpgeneric obj %s sec xdp.frags" % (cfg.ifname, prog)) + defer(ip, f"link set dev {cfg.ifname} mtu 1500 xdpgeneric off") if no_sleep != True: time.sleep(10) @@ -78,6 +81,7 @@ def _set_xdp_native_sb_on(cfg) -> None: prog = test_dir + "/../../net/lib/xdp_dummy.bpf.o" cmd(f"ip link set dev {remote_ifname} mtu 1500", shell=True, host=cfg.remote) cmd(f"ip -j link set dev {cfg.ifname} mtu 1500 xdp obj {prog} sec xdp", shell=True) + defer(ip, f"link set dev {cfg.ifname} mtu 1500 xdp off") xdp_info = ip("-d link show %s" % (cfg.ifname), json=True)[0] if xdp_info['xdp']['mode'] != 1: """ @@ -94,10 +98,11 @@ def _set_xdp_native_mb_on(cfg) -> None: test_dir = os.path.dirname(os.path.realpath(__file__)) prog = test_dir + "/../../net/lib/xdp_dummy.bpf.o" cmd(f"ip link set dev {remote_ifname} mtu 9000", shell=True, host=cfg.remote) + defer(ip, f"link set dev {remote_ifname} mtu 1500", host=cfg.remote) try: cmd(f"ip link set dev {cfg.ifname} mtu 9000 xdp obj {prog} sec xdp.frags", shell=True) + defer(ip, f"link set dev {cfg.ifname} mtu 1500 xdp off") except Exception as e: - cmd(f"ip link set dev {remote_ifname} mtu 1500", shell=True, host=cfg.remote) raise KsftSkipEx('device does not support native-multi-buffer XDP') if no_sleep != True: @@ -111,6 +116,7 @@ def _set_xdp_offload_on(cfg) -> None: cmd(f"ip link set dev {cfg.ifname} xdpoffload obj {prog} sec xdp", shell=True) except Exception as e: raise KsftSkipEx('device does not support offloaded XDP') + defer(ip, f"link set dev {cfg.ifname} xdpoffload off") cmd(f"ip link set dev {remote_ifname} mtu 1500", shell=True, host=cfg.remote) if no_sleep != True: @@ -157,7 +163,6 @@ def test_xdp_generic_sb(cfg, netnl) -> None: _test_v4(cfg) _test_v6(cfg) _test_tcp(cfg) - ip("link set dev %s xdpgeneric off" % cfg.ifname) def test_xdp_generic_mb(cfg, netnl) -> None: _set_xdp_generic_mb_on(cfg) @@ -169,7 +174,6 @@ def test_xdp_generic_mb(cfg, netnl) -> None: _test_v4(cfg) _test_v6(cfg) _test_tcp(cfg) - ip("link set dev %s xdpgeneric off" % cfg.ifname) def test_xdp_native_sb(cfg, netnl) -> None: _set_xdp_native_sb_on(cfg) @@ -181,7 +185,6 @@ def test_xdp_native_sb(cfg, netnl) -> None: _test_v4(cfg) _test_v6(cfg) _test_tcp(cfg) - ip("link set dev %s xdp off" % cfg.ifname) def test_xdp_native_mb(cfg, netnl) -> None: _set_xdp_native_mb_on(cfg) @@ -193,14 +196,12 @@ def test_xdp_native_mb(cfg, netnl) -> None: _test_v4(cfg) _test_v6(cfg) _test_tcp(cfg) - ip("link set dev %s xdp off" % cfg.ifname) def test_xdp_offload(cfg, netnl) -> None: _set_xdp_offload_on(cfg) _test_v4(cfg) _test_v6(cfg) _test_tcp(cfg) - ip("link set dev %s xdpoffload off" % cfg.ifname) def main() -> None: with NetDrvEpEnv(__file__) as cfg: @@ -213,7 +214,6 @@ def main() -> None: test_xdp_native_mb, test_xdp_offload], args=(cfg, EthtoolFamily())) - set_interface_init(cfg) ksft_exit() From c9cb135bc604ad6e457e06c19e0857c9cd0eef7f Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 12 Mar 2025 19:43:09 +0000 Subject: [PATCH 21/33] net: stmmac: dwc-qos-eth: use devm_kzalloc() for AXI data Everywhere else in the driver uses devm_kzalloc() when allocating the AXI data, so there is no kfree() of this structure. However, dwc-qos-eth uses kzalloc(), which leads to this memory being leaked. Switch to use devm_kzalloc(). Fixes: d8256121a91a ("stmmac: adding new glue driver dwmac-dwc-qos-eth") Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/E1tsRyv-0064nU-O9@rmk-PC.armlinux.org.uk Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c index bd4eb187f8c64..b5a7e05ab7a7e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c @@ -46,7 +46,9 @@ static int dwc_eth_dwmac_config_dt(struct platform_device *pdev, u32 a_index = 0; if (!plat_dat->axi) { - plat_dat->axi = kzalloc(sizeof(struct stmmac_axi), GFP_KERNEL); + plat_dat->axi = devm_kzalloc(&pdev->dev, + sizeof(struct stmmac_axi), + GFP_KERNEL); if (!plat_dat->axi) return -ENOMEM; From 559847f56769037e5b2e0474d3dbff985b98083d Mon Sep 17 00:00:00 2001 From: Gavrilov Ilia Date: Thu, 13 Mar 2025 08:50:08 +0000 Subject: [PATCH 22/33] xsk: fix an integer overflow in xp_create_and_assign_umem() Since the i and pool->chunk_size variables are of type 'u32', their product can wrap around and then be cast to 'u64'. This can lead to two different XDP buffers pointing to the same memory area. Found by InfoTeCS on behalf of Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 94033cd8e73b ("xsk: Optimize for aligned case") Cc: stable@vger.kernel.org Signed-off-by: Ilia Gavrilov Link: https://patch.msgid.link/20250313085007.3116044-1-Ilia.Gavrilov@infotecs.ru Signed-off-by: Paolo Abeni --- net/xdp/xsk_buff_pool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 1f7975b496579..d158cb6dd3919 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -105,7 +105,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, if (pool->unaligned) pool->free_heads[i] = xskb; else - xp_init_xskb_addr(xskb, pool, i * pool->chunk_size); + xp_init_xskb_addr(xskb, pool, (u64)i * pool->chunk_size); } return pool; From f3009d0d6ab78053117f8857b921a8237f4d17b3 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 14 Mar 2025 13:10:57 +0300 Subject: [PATCH 23/33] net: atm: fix use after free in lec_send() The ->send() operation frees skb so save the length before calling ->send() to avoid a use after free. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Dan Carpenter Reviewed-by: Simon Horman Link: https://patch.msgid.link/c751531d-4af4-42fe-affe-6104b34b791d@stanley.mountain Signed-off-by: Paolo Abeni --- net/atm/lec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/atm/lec.c b/net/atm/lec.c index ffef658862db1..a948dd47c3f34 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -181,6 +181,7 @@ static void lec_send(struct atm_vcc *vcc, struct sk_buff *skb) { struct net_device *dev = skb->dev; + unsigned int len = skb->len; ATM_SKB(skb)->vcc = vcc; atm_account_tx(vcc, skb); @@ -191,7 +192,7 @@ lec_send(struct atm_vcc *vcc, struct sk_buff *skb) } dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; + dev->stats.tx_bytes += len; } static void lec_tx_timeout(struct net_device *dev, unsigned int txqueue) From 47a9b5e52abd2b717dfc8b9460589f89936d93cf Mon Sep 17 00:00:00 2001 From: MD Danish Anwar Date: Fri, 14 Mar 2025 15:57:21 +0530 Subject: [PATCH 24/33] net: ti: icssg-prueth: Add lock to stats Currently the API emac_update_hardware_stats() reads different ICSSG stats without any lock protection. This API gets called by .ndo_get_stats64() which is only under RCU protection and nothing else. Add lock to this API so that the reading of statistics happens during lock. Fixes: c1e10d5dc7a1 ("net: ti: icssg-prueth: Add ICSSG Stats") Signed-off-by: MD Danish Anwar Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250314102721.1394366-1-danishanwar@ti.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/ti/icssg/icssg_prueth.c | 1 + drivers/net/ethernet/ti/icssg/icssg_prueth.h | 2 ++ drivers/net/ethernet/ti/icssg/icssg_stats.c | 4 ++++ 3 files changed, 7 insertions(+) diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index 00ed978605478..9a75733e3f8fb 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -1679,6 +1679,7 @@ static int prueth_probe(struct platform_device *pdev) } spin_lock_init(&prueth->vtbl_lock); + spin_lock_init(&prueth->stats_lock); /* setup netdev interfaces */ if (eth0_node) { ret = prueth_netdev_init(prueth, eth0_node); diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.h b/drivers/net/ethernet/ti/icssg/icssg_prueth.h index 329b46e9ee530..f41786b05741f 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.h +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.h @@ -305,6 +305,8 @@ struct prueth { int default_vlan; /** @vtbl_lock: Lock for vtbl in shared memory */ spinlock_t vtbl_lock; + /** @stats_lock: Lock for reading icssg stats */ + spinlock_t stats_lock; }; struct emac_tx_ts_response { diff --git a/drivers/net/ethernet/ti/icssg/icssg_stats.c b/drivers/net/ethernet/ti/icssg/icssg_stats.c index 8800bd3a8d074..6f0edae38ea24 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_stats.c +++ b/drivers/net/ethernet/ti/icssg/icssg_stats.c @@ -26,6 +26,8 @@ void emac_update_hardware_stats(struct prueth_emac *emac) u32 val, reg; int i; + spin_lock(&prueth->stats_lock); + for (i = 0; i < ARRAY_SIZE(icssg_all_miig_stats); i++) { regmap_read(prueth->miig_rt, base + icssg_all_miig_stats[i].offset, @@ -51,6 +53,8 @@ void emac_update_hardware_stats(struct prueth_emac *emac) emac->pa_stats[i] += val; } } + + spin_unlock(&prueth->stats_lock); } void icssg_stats_work_handler(struct work_struct *work) From 986ffb3a57c5650fb8bf6d59a8f0f07046abfeb6 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Fri, 14 Mar 2025 13:00:46 +0100 Subject: [PATCH 25/33] net: lwtunnel: fix recursion loops This patch acts as a parachute, catch all solution, by detecting recursion loops in lwtunnel users and taking care of them (e.g., a loop between routes, a loop within the same route, etc). In general, such loops are the consequence of pathological configurations. Each lwtunnel user is still free to catch such loops early and do whatever they want with them. It will be the case in a separate patch for, e.g., seg6 and seg6_local, in order to provide drop reasons and update statistics. Another example of a lwtunnel user taking care of loops is ioam6, which has valid use cases that include loops (e.g., inline mode), and which is addressed by the next patch in this series. Overall, this patch acts as a last resort to catch loops and drop packets, since we don't want to leak something unintentionally because of a pathological configuration in lwtunnels. The solution in this patch reuses dev_xmit_recursion(), dev_xmit_recursion_inc(), and dev_xmit_recursion_dec(), which seems fine considering the context. Closes: https://lore.kernel.org/netdev/2bc9e2079e864a9290561894d2a602d6@akamai.com/ Closes: https://lore.kernel.org/netdev/Z7NKYMY7fJT5cYWu@shredder/ Fixes: ffce41962ef6 ("lwtunnel: support dst output redirect function") Fixes: 2536862311d2 ("lwt: Add support to redirect dst.input") Fixes: 14972cbd34ff ("net: lwtunnel: Handle fragmentation") Signed-off-by: Justin Iurman Link: https://patch.msgid.link/20250314120048.12569-2-justin.iurman@uliege.be Signed-off-by: Paolo Abeni --- net/core/lwtunnel.c | 65 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 12 deletions(-) diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 711cd3b4347a7..4417a18b3e951 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -23,6 +23,8 @@ #include #include +#include "dev.h" + DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled); EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled); @@ -325,13 +327,23 @@ EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap); int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; - int ret = -EINVAL; + struct dst_entry *dst; + int ret; + + if (dev_xmit_recursion()) { + net_crit_ratelimited("%s(): recursion limit reached on datapath\n", + __func__); + ret = -ENETDOWN; + goto drop; + } - if (!dst) + dst = skb_dst(skb); + if (!dst) { + ret = -EINVAL; goto drop; + } lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || @@ -341,8 +353,11 @@ int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); - if (likely(ops && ops->output)) + if (likely(ops && ops->output)) { + dev_xmit_recursion_inc(); ret = ops->output(net, sk, skb); + dev_xmit_recursion_dec(); + } rcu_read_unlock(); if (ret == -EOPNOTSUPP) @@ -359,13 +374,23 @@ EXPORT_SYMBOL_GPL(lwtunnel_output); int lwtunnel_xmit(struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; - int ret = -EINVAL; + struct dst_entry *dst; + int ret; + + if (dev_xmit_recursion()) { + net_crit_ratelimited("%s(): recursion limit reached on datapath\n", + __func__); + ret = -ENETDOWN; + goto drop; + } - if (!dst) + dst = skb_dst(skb); + if (!dst) { + ret = -EINVAL; goto drop; + } lwtstate = dst->lwtstate; @@ -376,8 +401,11 @@ int lwtunnel_xmit(struct sk_buff *skb) ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); - if (likely(ops && ops->xmit)) + if (likely(ops && ops->xmit)) { + dev_xmit_recursion_inc(); ret = ops->xmit(skb); + dev_xmit_recursion_dec(); + } rcu_read_unlock(); if (ret == -EOPNOTSUPP) @@ -394,13 +422,23 @@ EXPORT_SYMBOL_GPL(lwtunnel_xmit); int lwtunnel_input(struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; - int ret = -EINVAL; + struct dst_entry *dst; + int ret; - if (!dst) + if (dev_xmit_recursion()) { + net_crit_ratelimited("%s(): recursion limit reached on datapath\n", + __func__); + ret = -ENETDOWN; goto drop; + } + + dst = skb_dst(skb); + if (!dst) { + ret = -EINVAL; + goto drop; + } lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || @@ -410,8 +448,11 @@ int lwtunnel_input(struct sk_buff *skb) ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); - if (likely(ops && ops->input)) + if (likely(ops && ops->input)) { + dev_xmit_recursion_inc(); ret = ops->input(skb); + dev_xmit_recursion_dec(); + } rcu_read_unlock(); if (ret == -EOPNOTSUPP) From 3e7a60b368eadf6c30a4a79dea1eb8f88b6d620d Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Fri, 14 Mar 2025 13:00:47 +0100 Subject: [PATCH 26/33] net: ipv6: ioam6: fix lwtunnel_output() loop Fix the lwtunnel_output() reentry loop in ioam6_iptunnel when the destination is the same after transformation. Note that a check on the destination address was already performed, but it was not enough. This is the example of a lwtunnel user taking care of loops without relying only on the last resort detection offered by lwtunnel. Fixes: 8cb3bf8bff3c ("ipv6: ioam: Add support for the ip6ip6 encapsulation") Signed-off-by: Justin Iurman Link: https://patch.msgid.link/20250314120048.12569-3-justin.iurman@uliege.be Signed-off-by: Paolo Abeni --- net/ipv6/ioam6_iptunnel.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index 2c383c12a4315..09065187378e6 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -337,7 +337,6 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb), *cache_dst = NULL; - struct in6_addr orig_daddr; struct ioam6_lwt *ilwt; int err = -EINVAL; u32 pkt_cnt; @@ -352,8 +351,6 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (pkt_cnt % ilwt->freq.n >= ilwt->freq.k) goto out; - orig_daddr = ipv6_hdr(skb)->daddr; - local_bh_disable(); cache_dst = dst_cache_get(&ilwt->cache); local_bh_enable(); @@ -422,7 +419,10 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } - if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) { + /* avoid lwtunnel_output() reentry loop when destination is the same + * after transformation (e.g., with the inline mode) + */ + if (dst->lwtstate != cache_dst->lwtstate) { skb_dst_drop(skb); skb_dst_set(skb, cache_dst); return dst_output(net, sk, skb); From 3ed61b8938c66680e13a1d1929afb9b145c26a86 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Fri, 14 Mar 2025 13:00:48 +0100 Subject: [PATCH 27/33] selftests: net: test for lwtunnel dst ref loops As recently specified by commit 0ea09cbf8350 ("docs: netdev: add a note on selftest posting") in net-next, the selftest is therefore shipped in this series. However, this selftest does not really test this series. It needs this series to avoid crashing the kernel. What it really tests, thanks to kmemleak, is what was fixed by the following commits: - commit c71a192976de ("net: ipv6: fix dst refleaks in rpl, seg6 and ioam6 lwtunnels") - commit 92191dd10730 ("net: ipv6: fix dst ref loops in rpl, seg6 and ioam6 lwtunnels") - commit c64a0727f9b1 ("net: ipv6: fix dst ref loop on input in seg6 lwt") - commit 13e55fbaec17 ("net: ipv6: fix dst ref loop on input in rpl lwt") - commit 0e7633d7b95b ("net: ipv6: fix dst ref loop in ila lwtunnel") - commit 5da15a9c11c1 ("net: ipv6: fix missing dst ref drop in ila lwtunnel") Signed-off-by: Justin Iurman Link: https://patch.msgid.link/20250314120048.12569-4-justin.iurman@uliege.be Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/config | 2 + .../selftests/net/lwt_dst_cache_ref_loop.sh | 246 ++++++++++++++++++ 3 files changed, 249 insertions(+) create mode 100755 tools/testing/selftests/net/lwt_dst_cache_ref_loop.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 5916f3b81c39f..843ab747645d0 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -101,6 +101,7 @@ TEST_PROGS += vlan_bridge_binding.sh TEST_PROGS += bpf_offload.py TEST_PROGS += ipv6_route_update_soft_lockup.sh TEST_PROGS += busy_poll_test.sh +TEST_PROGS += lwt_dst_cache_ref_loop.sh # YNL files, must be before "include ..lib.mk" YNL_GEN_FILES := busy_poller netlink-dumps diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index 5b9baf7089506..61e5116987f3e 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -107,3 +107,5 @@ CONFIG_XFRM_INTERFACE=m CONFIG_XFRM_USER=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_IPV6_ILA=m +CONFIG_IPV6_RPL_LWTUNNEL=y diff --git a/tools/testing/selftests/net/lwt_dst_cache_ref_loop.sh b/tools/testing/selftests/net/lwt_dst_cache_ref_loop.sh new file mode 100755 index 0000000000000..881eb399798f7 --- /dev/null +++ b/tools/testing/selftests/net/lwt_dst_cache_ref_loop.sh @@ -0,0 +1,246 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Author: Justin Iurman +# +# WARNING +# ------- +# This is just a dummy script that triggers encap cases with possible dst cache +# reference loops in affected lwt users (see list below). Some cases are +# pathological configurations for simplicity, others are valid. Overall, we +# don't want this issue to happen, no matter what. In order to catch any +# reference loops, kmemleak MUST be used. The results alone are always blindly +# successful, don't rely on them. Note that the following tests may crash the +# kernel if the fix to prevent lwtunnel_{input|output|xmit}() reentry loops is +# not present. +# +# Affected lwt users so far (please update accordingly if needed): +# - ila_lwt (output only) +# - ioam6_iptunnel (output only) +# - rpl_iptunnel (both input and output) +# - seg6_iptunnel (both input and output) + +source lib.sh + +check_compatibility() +{ + setup_ns tmp_node &>/dev/null + if [ $? != 0 ]; then + echo "SKIP: Cannot create netns." + exit $ksft_skip + fi + + ip link add name veth0 netns $tmp_node type veth \ + peer name veth1 netns $tmp_node &>/dev/null + local ret=$? + + ip -netns $tmp_node link set veth0 up &>/dev/null + ret=$((ret + $?)) + + ip -netns $tmp_node link set veth1 up &>/dev/null + ret=$((ret + $?)) + + if [ $ret != 0 ]; then + echo "SKIP: Cannot configure links." + cleanup_ns $tmp_node + exit $ksft_skip + fi + + lsmod 2>/dev/null | grep -q "ila" + ila_lsmod=$? + [ $ila_lsmod != 0 ] && modprobe ila &>/dev/null + + ip -netns $tmp_node route add 2001:db8:1::/64 \ + encap ila 1:2:3:4 csum-mode no-action ident-type luid \ + hook-type output \ + dev veth0 &>/dev/null + + ip -netns $tmp_node route add 2001:db8:2::/64 \ + encap ioam6 trace prealloc type 0x800000 ns 0 size 4 \ + dev veth0 &>/dev/null + + ip -netns $tmp_node route add 2001:db8:3::/64 \ + encap rpl segs 2001:db8:3::1 dev veth0 &>/dev/null + + ip -netns $tmp_node route add 2001:db8:4::/64 \ + encap seg6 mode inline segs 2001:db8:4::1 dev veth0 &>/dev/null + + ip -netns $tmp_node -6 route 2>/dev/null | grep -q "encap ila" + skip_ila=$? + + ip -netns $tmp_node -6 route 2>/dev/null | grep -q "encap ioam6" + skip_ioam6=$? + + ip -netns $tmp_node -6 route 2>/dev/null | grep -q "encap rpl" + skip_rpl=$? + + ip -netns $tmp_node -6 route 2>/dev/null | grep -q "encap seg6" + skip_seg6=$? + + cleanup_ns $tmp_node +} + +setup() +{ + setup_ns alpha beta gamma &>/dev/null + + ip link add name veth-alpha netns $alpha type veth \ + peer name veth-betaL netns $beta &>/dev/null + + ip link add name veth-betaR netns $beta type veth \ + peer name veth-gamma netns $gamma &>/dev/null + + ip -netns $alpha link set veth-alpha name veth0 &>/dev/null + ip -netns $beta link set veth-betaL name veth0 &>/dev/null + ip -netns $beta link set veth-betaR name veth1 &>/dev/null + ip -netns $gamma link set veth-gamma name veth0 &>/dev/null + + ip -netns $alpha addr add 2001:db8:1::2/64 dev veth0 &>/dev/null + ip -netns $alpha link set veth0 up &>/dev/null + ip -netns $alpha link set lo up &>/dev/null + ip -netns $alpha route add 2001:db8:2::/64 \ + via 2001:db8:1::1 dev veth0 &>/dev/null + + ip -netns $beta addr add 2001:db8:1::1/64 dev veth0 &>/dev/null + ip -netns $beta addr add 2001:db8:2::1/64 dev veth1 &>/dev/null + ip -netns $beta link set veth0 up &>/dev/null + ip -netns $beta link set veth1 up &>/dev/null + ip -netns $beta link set lo up &>/dev/null + ip -netns $beta route del 2001:db8:2::/64 + ip -netns $beta route add 2001:db8:2::/64 dev veth1 + ip netns exec $beta \ + sysctl -wq net.ipv6.conf.all.forwarding=1 &>/dev/null + + ip -netns $gamma addr add 2001:db8:2::2/64 dev veth0 &>/dev/null + ip -netns $gamma link set veth0 up &>/dev/null + ip -netns $gamma link set lo up &>/dev/null + ip -netns $gamma route add 2001:db8:1::/64 \ + via 2001:db8:2::1 dev veth0 &>/dev/null + + sleep 1 + + ip netns exec $alpha ping6 -c 5 -W 1 2001:db8:2::2 &>/dev/null + if [ $? != 0 ]; then + echo "SKIP: Setup failed." + exit $ksft_skip + fi + + sleep 1 +} + +cleanup() +{ + cleanup_ns $alpha $beta $gamma + [ $ila_lsmod != 0 ] && modprobe -r ila &>/dev/null +} + +run_ila() +{ + if [ $skip_ila != 0 ]; then + echo "SKIP: ila (output)" + return + fi + + ip -netns $beta route del 2001:db8:2::/64 + ip -netns $beta route add 2001:db8:2:0:0:0:0:2/128 \ + encap ila 2001:db8:2:0 csum-mode no-action ident-type luid \ + hook-type output \ + dev veth1 &>/dev/null + sleep 1 + + echo "TEST: ila (output)" + ip netns exec $beta ping6 -c 2 -W 1 2001:db8:2::2 &>/dev/null + sleep 1 + + ip -netns $beta route del 2001:db8:2:0:0:0:0:2/128 + ip -netns $beta route add 2001:db8:2::/64 dev veth1 + sleep 1 +} + +run_ioam6() +{ + if [ $skip_ioam6 != 0 ]; then + echo "SKIP: ioam6 (output)" + return + fi + + ip -netns $beta route change 2001:db8:2::/64 \ + encap ioam6 trace prealloc type 0x800000 ns 1 size 4 \ + dev veth1 &>/dev/null + sleep 1 + + echo "TEST: ioam6 (output)" + ip netns exec $beta ping6 -c 2 -W 1 2001:db8:2::2 &>/dev/null + sleep 1 +} + +run_rpl() +{ + if [ $skip_rpl != 0 ]; then + echo "SKIP: rpl (input)" + echo "SKIP: rpl (output)" + return + fi + + ip -netns $beta route change 2001:db8:2::/64 \ + encap rpl segs 2001:db8:2::2 \ + dev veth1 &>/dev/null + sleep 1 + + echo "TEST: rpl (input)" + ip netns exec $alpha ping6 -c 2 -W 1 2001:db8:2::2 &>/dev/null + sleep 1 + + echo "TEST: rpl (output)" + ip netns exec $beta ping6 -c 2 -W 1 2001:db8:2::2 &>/dev/null + sleep 1 +} + +run_seg6() +{ + if [ $skip_seg6 != 0 ]; then + echo "SKIP: seg6 (input)" + echo "SKIP: seg6 (output)" + return + fi + + ip -netns $beta route change 2001:db8:2::/64 \ + encap seg6 mode inline segs 2001:db8:2::2 \ + dev veth1 &>/dev/null + sleep 1 + + echo "TEST: seg6 (input)" + ip netns exec $alpha ping6 -c 2 -W 1 2001:db8:2::2 &>/dev/null + sleep 1 + + echo "TEST: seg6 (output)" + ip netns exec $beta ping6 -c 2 -W 1 2001:db8:2::2 &>/dev/null + sleep 1 +} + +run() +{ + run_ila + run_ioam6 + run_rpl + run_seg6 +} + +if [ "$(id -u)" -ne 0 ]; then + echo "SKIP: Need root privileges." + exit $ksft_skip +fi + +if [ ! -x "$(command -v ip)" ]; then + echo "SKIP: Could not run test without ip tool." + exit $ksft_skip +fi + +check_compatibility + +trap cleanup EXIT + +setup +run + +exit $ksft_pass From 2c1f97a52cb827a5f2768e67a9dddffae1ed47ab Mon Sep 17 00:00:00 2001 From: Arthur Mongodin Date: Fri, 14 Mar 2025 21:11:31 +0100 Subject: [PATCH 28/33] mptcp: Fix data stream corruption in the address announcement Because of the size restriction in the TCP options space, the MPTCP ADD_ADDR option is exclusive and cannot be sent with other MPTCP ones. For this reason, in the linked mptcp_out_options structure, group of fields linked to different options are part of the same union. There is a case where the mptcp_pm_add_addr_signal() function can modify opts->addr, but not ended up sending an ADD_ADDR. Later on, back in mptcp_established_options, other options will be sent, but with unexpected data written in other fields due to the union, e.g. in opts->ext_copy. This could lead to a data stream corruption in the next packet. Using an intermediate variable, prevents from corrupting previously established DSS option. The assignment of the ADD_ADDR option parameters is now done once we are sure this ADD_ADDR option can be set in the packet, e.g. after having dropped other suboptions. Fixes: 1bff1e43a30e ("mptcp: optimize out option generation") Cc: stable@vger.kernel.org Suggested-by: Paolo Abeni Signed-off-by: Arthur Mongodin Reviewed-by: Matthieu Baerts (NGI0) [ Matt: the commit message has been updated: long lines splits and some clarifications. ] Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250314-net-mptcp-fix-data-stream-corr-sockopt-v1-1-122dbb249db3@kernel.org Signed-off-by: Paolo Abeni --- net/mptcp/options.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index fd2de185bc939..23949ae2a3a8d 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -651,6 +651,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * struct mptcp_sock *msk = mptcp_sk(subflow->conn); bool drop_other_suboptions = false; unsigned int opt_size = *size; + struct mptcp_addr_info addr; bool echo; int len; @@ -659,7 +660,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * */ if (!mptcp_pm_should_add_signal(msk) || (opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) || - !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &opts->addr, + !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &addr, &echo, &drop_other_suboptions)) return false; @@ -672,7 +673,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * else if (opts->suboptions & OPTION_MPTCP_DSS) return false; - len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port); + len = mptcp_add_addr_len(addr.family, echo, !!addr.port); if (remaining < len) return false; @@ -689,6 +690,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * opts->ahmac = 0; *size -= opt_size; } + opts->addr = addr; opts->suboptions |= OPTION_MPTCP_ADD_ADDR; if (!echo) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRTX); From 23b763302ce068d7c97441e75434dc9f903adc7d Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 14 Mar 2025 22:41:54 +0100 Subject: [PATCH 29/33] tools headers: Sync uapi/asm-generic/socket.h with the kernel sources This also fixes a wrong definitions for SCM_TS_OPT_ID & SO_RCVPRIORITY. Accidentally found while working on another patchset. Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Vadim Fedorenko Cc: Willem de Bruijn Cc: Jason Xing Cc: Anna Emese Nyiri Cc: Kuniyuki Iwashima Cc: Paolo Abeni Fixes: a89568e9be75 ("selftests: txtimestamp: add SCM_TS_OPT_ID test") Fixes: e45469e594b2 ("sock: Introduce SO_RCVPRIORITY socket option") Link: https://lore.kernel.org/netdev/20250314195257.34854-1-kuniyu@amazon.com/ Reviewed-by: Kuniyuki Iwashima Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Willem de Bruijn Reviewed-by: Jason Xing Link: https://patch.msgid.link/20250314214155.16046-1-aleksandr.mikhalitsyn@canonical.com Signed-off-by: Paolo Abeni --- tools/include/uapi/asm-generic/socket.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/tools/include/uapi/asm-generic/socket.h b/tools/include/uapi/asm-generic/socket.h index ffff554a52300..aa5016ff3d911 100644 --- a/tools/include/uapi/asm-generic/socket.h +++ b/tools/include/uapi/asm-generic/socket.h @@ -119,14 +119,31 @@ #define SO_DETACH_REUSEPORT_BPF 68 +#define SO_PREFER_BUSY_POLL 69 +#define SO_BUSY_POLL_BUDGET 70 + +#define SO_NETNS_COOKIE 71 + +#define SO_BUF_LOCK 72 + +#define SO_RESERVE_MEM 73 + +#define SO_TXREHASH 74 + #define SO_RCVMARK 75 #define SO_PASSPIDFD 76 #define SO_PEERPIDFD 77 -#define SCM_TS_OPT_ID 78 +#define SO_DEVMEM_LINEAR 78 +#define SCM_DEVMEM_LINEAR SO_DEVMEM_LINEAR +#define SO_DEVMEM_DMABUF 79 +#define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF +#define SO_DEVMEM_DONTNEED 80 + +#define SCM_TS_OPT_ID 81 -#define SO_RCVPRIORITY 79 +#define SO_RCVPRIORITY 82 #if !defined(__KERNEL__) From 90a7138619a0c55e2aefaad27b12ffc2ddbeed78 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Sun, 16 Mar 2025 00:51:13 +0800 Subject: [PATCH 30/33] net/neighbor: add missing policy for NDTPA_QUEUE_LENBYTES Previous commit 8b5c171bb3dc ("neigh: new unresolved queue limits") introduces new netlink attribute NDTPA_QUEUE_LENBYTES to represent approximative value for deprecated QUEUE_LEN. However, it forgot to add the associated nla_policy in nl_ntbl_parm_policy array. Fix it with one simple NLA_U32 type policy. Fixes: 8b5c171bb3dc ("neigh: new unresolved queue limits") Signed-off-by: Lin Ma Link: https://patch.msgid.link/20250315165113.37600-1-linma@zju.edu.cn Signed-off-by: Paolo Abeni --- net/core/neighbour.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index bd0251bd74a1f..1a620f903c56e 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2250,6 +2250,7 @@ static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = { static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_IFINDEX] = { .type = NLA_U32 }, [NDTPA_QUEUE_LEN] = { .type = NLA_U32 }, + [NDTPA_QUEUE_LENBYTES] = { .type = NLA_U32 }, [NDTPA_PROXY_QLEN] = { .type = NLA_U32 }, [NDTPA_APP_PROBES] = { .type = NLA_U32 }, [NDTPA_UCAST_PROBES] = { .type = NLA_U32 }, From 355d940f4d5ae56ed082a455ce2cc737c7e898e8 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 19 Mar 2025 22:26:45 +0100 Subject: [PATCH 31/33] Revert "selftests: Add IPv6 link-local address generation tests for GRE devices." This reverts commit 6f50175ccad4278ed3a9394c00b797b75441bd6e. Commit 183185a18ff9 ("gre: Fix IPv6 link-local address generation.") is going to be reverted. So let's revert the corresponding kselftest first. Signed-off-by: Guillaume Nault Link: https://patch.msgid.link/259a9e98f7f1be7ce02b53d0b4afb7c18a8ff747.1742418408.git.gnault@redhat.com Acked-by: Stanislav Fomichev Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/Makefile | 1 - .../testing/selftests/net/gre_ipv6_lladdr.sh | 177 ------------------ 2 files changed, 178 deletions(-) delete mode 100755 tools/testing/selftests/net/gre_ipv6_lladdr.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 843ab747645d0..8f32b4f01aee1 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -31,7 +31,6 @@ TEST_PROGS += veth.sh TEST_PROGS += ioam6.sh TEST_PROGS += gro.sh TEST_PROGS += gre_gso.sh -TEST_PROGS += gre_ipv6_lladdr.sh TEST_PROGS += cmsg_so_mark.sh TEST_PROGS += cmsg_so_priority.sh TEST_PROGS += cmsg_time.sh cmsg_ipv6.sh diff --git a/tools/testing/selftests/net/gre_ipv6_lladdr.sh b/tools/testing/selftests/net/gre_ipv6_lladdr.sh deleted file mode 100755 index 5b34f6e1f8314..0000000000000 --- a/tools/testing/selftests/net/gre_ipv6_lladdr.sh +++ /dev/null @@ -1,177 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -source ./lib.sh - -PAUSE_ON_FAIL="no" - -# The trap function handler -# -exit_cleanup_all() -{ - cleanup_all_ns - - exit "${EXIT_STATUS}" -} - -# Add fake IPv4 and IPv6 networks on the loopback device, to be used as -# underlay by future GRE devices. -# -setup_basenet() -{ - ip -netns "${NS0}" link set dev lo up - ip -netns "${NS0}" address add dev lo 192.0.2.10/24 - ip -netns "${NS0}" address add dev lo 2001:db8::10/64 nodad -} - -# Check if network device has an IPv6 link-local address assigned. -# -# Parameters: -# -# * $1: The network device to test -# * $2: An extra regular expression that should be matched (to verify the -# presence of extra attributes) -# * $3: The expected return code from grep (to allow checking the absence of -# a link-local address) -# * $4: The user visible name for the scenario being tested -# -check_ipv6_ll_addr() -{ - local DEV="$1" - local EXTRA_MATCH="$2" - local XRET="$3" - local MSG="$4" - - RET=0 - set +e - ip -netns "${NS0}" -6 address show dev "${DEV}" scope link | grep "fe80::" | grep -q "${EXTRA_MATCH}" - check_err_fail "${XRET}" $? "" - log_test "${MSG}" - set -e -} - -# Create a GRE device and verify that it gets an IPv6 link-local address as -# expected. -# -# Parameters: -# -# * $1: The device type (gre, ip6gre, gretap or ip6gretap) -# * $2: The local underlay IP address (can be an IPv4, an IPv6 or "any") -# * $3: The remote underlay IP address (can be an IPv4, an IPv6 or "any") -# * $4: The IPv6 interface identifier generation mode to use for the GRE -# device (eui64, none, stable-privacy or random). -# -test_gre_device() -{ - local GRE_TYPE="$1" - local LOCAL_IP="$2" - local REMOTE_IP="$3" - local MODE="$4" - local ADDR_GEN_MODE - local MATCH_REGEXP - local MSG - - ip link add netns "${NS0}" name gretest type "${GRE_TYPE}" local "${LOCAL_IP}" remote "${REMOTE_IP}" - - case "${MODE}" in - "eui64") - ADDR_GEN_MODE=0 - MATCH_REGEXP="" - MSG="${GRE_TYPE}, mode: 0 (EUI64), ${LOCAL_IP} -> ${REMOTE_IP}" - XRET=0 - ;; - "none") - ADDR_GEN_MODE=1 - MATCH_REGEXP="" - MSG="${GRE_TYPE}, mode: 1 (none), ${LOCAL_IP} -> ${REMOTE_IP}" - XRET=1 # No link-local address should be generated - ;; - "stable-privacy") - ADDR_GEN_MODE=2 - MATCH_REGEXP="stable-privacy" - MSG="${GRE_TYPE}, mode: 2 (stable privacy), ${LOCAL_IP} -> ${REMOTE_IP}" - XRET=0 - # Initialise stable_secret (required for stable-privacy mode) - ip netns exec "${NS0}" sysctl -qw net.ipv6.conf.gretest.stable_secret="2001:db8::abcd" - ;; - "random") - ADDR_GEN_MODE=3 - MATCH_REGEXP="stable-privacy" - MSG="${GRE_TYPE}, mode: 3 (random), ${LOCAL_IP} -> ${REMOTE_IP}" - XRET=0 - ;; - esac - - # Check that IPv6 link-local address is generated when device goes up - ip netns exec "${NS0}" sysctl -qw net.ipv6.conf.gretest.addr_gen_mode="${ADDR_GEN_MODE}" - ip -netns "${NS0}" link set dev gretest up - check_ipv6_ll_addr gretest "${MATCH_REGEXP}" "${XRET}" "config: ${MSG}" - - # Now disable link-local address generation - ip -netns "${NS0}" link set dev gretest down - ip netns exec "${NS0}" sysctl -qw net.ipv6.conf.gretest.addr_gen_mode=1 - ip -netns "${NS0}" link set dev gretest up - - # Check that link-local address generation works when re-enabled while - # the device is already up - ip netns exec "${NS0}" sysctl -qw net.ipv6.conf.gretest.addr_gen_mode="${ADDR_GEN_MODE}" - check_ipv6_ll_addr gretest "${MATCH_REGEXP}" "${XRET}" "update: ${MSG}" - - ip -netns "${NS0}" link del dev gretest -} - -test_gre4() -{ - local GRE_TYPE - local MODE - - for GRE_TYPE in "gre" "gretap"; do - printf "\n####\nTesting IPv6 link-local address generation on ${GRE_TYPE} devices\n####\n\n" - - for MODE in "eui64" "none" "stable-privacy" "random"; do - test_gre_device "${GRE_TYPE}" 192.0.2.10 192.0.2.11 "${MODE}" - test_gre_device "${GRE_TYPE}" any 192.0.2.11 "${MODE}" - test_gre_device "${GRE_TYPE}" 192.0.2.10 any "${MODE}" - done - done -} - -test_gre6() -{ - local GRE_TYPE - local MODE - - for GRE_TYPE in "ip6gre" "ip6gretap"; do - printf "\n####\nTesting IPv6 link-local address generation on ${GRE_TYPE} devices\n####\n\n" - - for MODE in "eui64" "none" "stable-privacy" "random"; do - test_gre_device "${GRE_TYPE}" 2001:db8::10 2001:db8::11 "${MODE}" - test_gre_device "${GRE_TYPE}" any 2001:db8::11 "${MODE}" - test_gre_device "${GRE_TYPE}" 2001:db8::10 any "${MODE}" - done - done -} - -usage() -{ - echo "Usage: $0 [-p]" - exit 1 -} - -while getopts :p o -do - case $o in - p) PAUSE_ON_FAIL="yes";; - *) usage;; - esac -done - -setup_ns NS0 - -set -e -trap exit_cleanup_all EXIT - -setup_basenet - -test_gre4 -test_gre6 From fc486c2d060f67d672ddad81724f7c8a4d329570 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 19 Mar 2025 22:26:50 +0100 Subject: [PATCH 32/33] Revert "gre: Fix IPv6 link-local address generation." This reverts commit 183185a18ff96751db52a46ccf93fff3a1f42815. This patch broke net/forwarding/ip6gre_custom_multipath_hash.sh in some circumstances (https://lore.kernel.org/netdev/Z9RIyKZDNoka53EO@mini-arch/). Let's revert it while the problem is being investigated. Fixes: 183185a18ff9 ("gre: Fix IPv6 link-local address generation.") Signed-off-by: Guillaume Nault Link: https://patch.msgid.link/8b1ce738eb15dd841aab9ef888640cab4f6ccfea.1742418408.git.gnault@redhat.com Acked-by: Stanislav Fomichev Signed-off-by: Paolo Abeni --- net/ipv6/addrconf.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 8b6258819dade..ac8cc10765360 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3209,13 +3209,16 @@ static void add_v4_addrs(struct inet6_dev *idev) struct in6_addr addr; struct net_device *dev; struct net *net = dev_net(idev->dev); - int scope, plen; + int scope, plen, offset = 0; u32 pflags = 0; ASSERT_RTNL(); memset(&addr, 0, sizeof(struct in6_addr)); - memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4); + /* in case of IP6GRE the dev_addr is an IPv6 and therefore we use only the last 4 bytes */ + if (idev->dev->addr_len == sizeof(struct in6_addr)) + offset = sizeof(struct in6_addr) - 4; + memcpy(&addr.s6_addr32[3], idev->dev->dev_addr + offset, 4); if (!(idev->dev->flags & IFF_POINTOPOINT) && idev->dev->type == ARPHRD_SIT) { scope = IPV6_ADDR_COMPATv4; @@ -3526,13 +3529,7 @@ static void addrconf_gre_config(struct net_device *dev) return; } - /* Generate the IPv6 link-local address using addrconf_addr_gen(), - * unless we have an IPv4 GRE device not bound to an IP address and - * which is in EUI64 mode (as __ipv6_isatap_ifid() would fail in this - * case). Such devices fall back to add_v4_addrs() instead. - */ - if (!(dev->type == ARPHRD_IPGRE && *(__be32 *)dev->dev_addr == 0 && - idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)) { + if (dev->type == ARPHRD_ETHER) { addrconf_addr_gen(idev, true); return; } From feaee98c6c505494e2188e5c644b881f5c81ee59 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 12 Mar 2025 10:22:12 +0100 Subject: [PATCH 33/33] MAINTAINERS: Add Andrea Mayer as a maintainer of SRv6 Andrea has made significant contributions to SRv6 support in Linux. Acknowledge the work and on-going interest in Srv6 support with a maintainers entry for these files so hopefully he is included on patches going forward. Signed-off-by: David Ahern Acked-by: Andrea Mayer Link: https://patch.msgid.link/20250312092212.46299-1-dsahern@kernel.org Signed-off-by: Paolo Abeni --- MAINTAINERS | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 625f3758c8800..1ca95d95c4892 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16663,6 +16663,17 @@ F: net/mptcp/ F: tools/testing/selftests/bpf/*/*mptcp*.[ch] F: tools/testing/selftests/net/mptcp/ +NETWORKING [SRv6] +M: Andrea Mayer +L: netdev@vger.kernel.org +S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git +F: include/linux/seg6* +F: include/net/seg6* +F: include/uapi/linux/seg6* +F: net/ipv6/seg6* +F: tools/testing/selftests/net/srv6* + NETWORKING [TCP] M: Eric Dumazet M: Neal Cardwell