From b9495b564d91a0afe4125db64d2bc25c310bda9c Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 29 Mar 2024 15:42:18 +0000
Subject: [PATCH 1/8] net: move kick_defer_list_purge() to net/core/dev.h

kick_defer_list_purge() is defined in net/core/dev.c
and used from net/core/skbuff.c

Because we need softnet_data, include <linux/netdevice.h>
from net/core/dev.h.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/linux/netdevice.h | 1 -
 net/core/dev.h            | 6 +++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 55c7cf9404a4a..15e70527b703d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3288,7 +3288,6 @@ static inline void dev_xmit_recursion_dec(void)
         __this_cpu_dec(softnet_data.xmit.recursion);
 }
 
-void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
 void __netif_schedule(struct Qdisc *q);
 void netif_schedule_queue(struct netdev_queue *txq);
 
diff --git a/net/core/dev.h b/net/core/dev.h
index 2bcaf8eee50c1..9d0f8b4587f81 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -4,11 +4,9 @@
 
 #include <linux/types.h>
 #include <linux/rwsem.h>
+#include <linux/netdevice.h>
 
 struct net;
-struct net_device;
-struct netdev_bpf;
-struct netdev_phys_item_id;
 struct netlink_ext_ack;
 struct cpumask;
 
@@ -150,4 +148,6 @@ static inline void xdp_do_check_flushed(struct napi_struct *napi) { }
 #endif
 
 struct napi_struct *napi_by_id(unsigned int napi_id);
+void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
+
 #endif
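
The point of this first patch is one of code layout: a declaration used only inside
net/core belongs in the subsystem's private header (net/core/dev.h) rather than in the
public include/linux/netdevice.h. A minimal sketch of the same arrangement outside the
kernel, with invented names (foo_priv.h, struct foo_state and foo_kick_purge are
illustrative, not kernel APIs):

/* foo/foo_priv.h - private header of the foo/ directory (illustrative) */
#ifndef FOO_PRIV_H
#define FOO_PRIV_H

struct foo_state;       /* real definition lives in the public header */

/*
 * Shared by foo/core.c and foo/buff.c but not exported to the rest of
 * the tree, so it is declared here rather than in the public foo.h.
 */
void foo_kick_purge(struct foo_state *st, unsigned int cpu);

#endif /* FOO_PRIV_H */

Files inside foo/ include "foo_priv.h"; everything else keeps seeing only the public
header, which is what the patch above restores for kick_defer_list_purge().
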
From 2fe50a4d7225cf10748775e290361896637091a9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 29 Mar 2024 15:42:19 +0000
Subject: [PATCH 2/8] net: move dev_xmit_recursion() helpers to net/core/dev.h

Move dev_xmit_recursion() and friends to net/core/dev.h

They are only used from net/core/dev.c and net/core/filter.c.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/linux/netdevice.h | 17 -----------------
 net/core/dev.h            | 17 +++++++++++++++++
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 15e70527b703d..0cd9ee83f5541 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3271,23 +3271,6 @@ static inline int dev_recursion_level(void)
         return this_cpu_read(softnet_data.xmit.recursion);
 }
 
-#define XMIT_RECURSION_LIMIT    8
-static inline bool dev_xmit_recursion(void)
-{
-        return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
-                        XMIT_RECURSION_LIMIT);
-}
-
-static inline void dev_xmit_recursion_inc(void)
-{
-        __this_cpu_inc(softnet_data.xmit.recursion);
-}
-
-static inline void dev_xmit_recursion_dec(void)
-{
-        __this_cpu_dec(softnet_data.xmit.recursion);
-}
-
 void __netif_schedule(struct Qdisc *q);
 void netif_schedule_queue(struct netdev_queue *txq);
 
diff --git a/net/core/dev.h b/net/core/dev.h
index 9d0f8b4587f81..8572d2c8dc4ad 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -150,4 +150,21 @@ static inline void xdp_do_check_flushed(struct napi_struct *napi) { }
 struct napi_struct *napi_by_id(unsigned int napi_id);
 void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
 
+#define XMIT_RECURSION_LIMIT    8
+static inline bool dev_xmit_recursion(void)
+{
+        return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
+                        XMIT_RECURSION_LIMIT);
+}
+
+static inline void dev_xmit_recursion_inc(void)
+{
+        __this_cpu_inc(softnet_data.xmit.recursion);
+}
+
+static inline void dev_xmit_recursion_dec(void)
+{
+        __this_cpu_dec(softnet_data.xmit.recursion);
+}
+
 #endif
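
The helpers moved above implement a bounded re-entrancy guard around a per-CPU counter:
the transmit path increments the counter on entry, decrements it on exit, and bails out
once nesting exceeds XMIT_RECURSION_LIMIT. A rough user-space analogue, with a
thread-local int standing in for the per-CPU field (xmit_one and the other names are
invented for illustration, not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define XMIT_RECURSION_LIMIT 8

static _Thread_local int xmit_recursion;    /* stand-in for the per-CPU counter */

static bool xmit_recursion_exceeded(void)
{
        return xmit_recursion > XMIT_RECURSION_LIMIT;
}

/* Hypothetical transmit path: refuses to re-enter itself too deeply. */
static int xmit_one(int depth)
{
        if (xmit_recursion_exceeded())
                return -1;              /* the kernel would drop the packet here */

        xmit_recursion++;
        /* ... queue the packet; a misconfigured stack could re-enter here ... */
        int ret = depth ? xmit_one(depth - 1) : 0;
        xmit_recursion--;
        return ret;
}

int main(void)
{
        printf("shallow nesting: %d\n", xmit_one(3));    /* 0: allowed */
        printf("runaway nesting: %d\n", xmit_one(100));  /* -1: refused */
        return 0;
}

The guard trades a rare lost packet for protection against unbounded stack growth, which
is why the check compares against a small fixed limit rather than trying to be exact.
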
From 95e48d862ada73188be6d91a33c49d1712815bd2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 29 Mar 2024 15:42:20 +0000
Subject: [PATCH 3/8] net: enqueue_to_backlog() change vs not running device

If the device attached to the packet given to enqueue_to_backlog()
is not running, we drop the packet.

But we accidentally increase sd->dropped, giving false signals to admins:
sd->dropped should be reserved to cpu backlog pressure, not to temporary
glitches at device dismantles.

While we are at it, perform the netif_running() test before we get
the rps lock, and use REASON_DEV_READY drop reason instead of
NOT_SPECIFIED.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index c136e80dea618..4ad7836365e68 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4801,12 +4801,13 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
         unsigned long flags;
         unsigned int qlen;
 
-        reason = SKB_DROP_REASON_NOT_SPECIFIED;
+        reason = SKB_DROP_REASON_DEV_READY;
+        if (!netif_running(skb->dev))
+                goto bad_dev;
+
         sd = &per_cpu(softnet_data, cpu);
 
         backlog_lock_irq_save(sd, &flags);
-        if (!netif_running(skb->dev))
-                goto drop;
         qlen = skb_queue_len(&sd->input_pkt_queue);
         if (qlen <= READ_ONCE(net_hotdata.max_backlog) &&
             !skb_flow_limit(skb, qlen)) {
@@ -4827,10 +4828,10 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
         }
         reason = SKB_DROP_REASON_CPU_BACKLOG;
-drop:
         sd->dropped++;
         backlog_unlock_irq_restore(sd, &flags);
 
+bad_dev:
         dev_core_stats_rx_dropped_inc(skb->dev);
         kfree_skb_reason(skb, reason);
         return NET_RX_DROP;
 }
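
The shape of the change above is a common one: classify a doomed request before touching
any shared state, so the cheap device check neither takes the per-CPU lock nor inflates
the backlog-pressure counter. A small user-space sketch of that control flow (the struct,
the dev_running flag and the reason values are invented; they only loosely mirror
netif_running() and the two drop reasons):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

enum drop_reason { DROP_DEV_READY, DROP_CPU_BACKLOG };

struct backlog {
        pthread_mutex_t lock;
        unsigned int qlen, max, dropped;
};

static bool dev_running = true;          /* stand-in for netif_running() */

static int enqueue(struct backlog *b)
{
        enum drop_reason reason = DROP_DEV_READY;

        if (!dev_running)                /* cheap test first, lock not taken */
                goto bad_dev;

        pthread_mutex_lock(&b->lock);
        if (b->qlen < b->max) {
                b->qlen++;
                pthread_mutex_unlock(&b->lock);
                return 0;
        }
        reason = DROP_CPU_BACKLOG;
        b->dropped++;                    /* only genuine pressure counted here */
        pthread_mutex_unlock(&b->lock);
bad_dev:
        fprintf(stderr, "drop, reason=%d\n", reason);
        return -1;
}

int main(void)
{
        struct backlog b = { .lock = PTHREAD_MUTEX_INITIALIZER, .max = 1 };

        enqueue(&b);            /* queued */
        enqueue(&b);            /* queue full: counted as backlog pressure */
        dev_running = false;
        enqueue(&b);            /* device gone: dropped without taking the lock */
        return 0;
}
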
From a7ae7b0b2ea014ff3ed4be812c3efa1b1d86e153 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 29 Mar 2024 15:42:21 +0000
Subject: [PATCH 4/8] net: make softnet_data.dropped an atomic_t

If enqueue_to_backlog() has to drop a packet under extreme cpu backlog
pressure, it could do this without dirtying a cache line and potentially
slowing down the target cpu.

Move sd->dropped into a separate cache line, and make it atomic.

When not under pressure, this field is not touched; there is no need to
consume valuable space in a hot cache line.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/linux/netdevice.h |  3 ++-
 net/core/dev.c            | 13 +++++++++----
 net/core/net-procfs.c     |  3 ++-
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0cd9ee83f5541..be9f071a340d2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3237,10 +3237,11 @@ struct softnet_data {
         unsigned int            input_queue_tail;
 #endif
         unsigned int            received_rps;
-        unsigned int            dropped;
         struct sk_buff_head     input_pkt_queue;
         struct napi_struct      backlog;
 
+        atomic_t                dropped ____cacheline_aligned_in_smp;
+
         /* Another possibly contended cache line */
         spinlock_t              defer_lock ____cacheline_aligned_in_smp;
         int                     defer_count;
diff --git a/net/core/dev.c b/net/core/dev.c
index 4ad7836365e68..02c98f1152432 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4800,17 +4800,22 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
         struct softnet_data *sd;
         unsigned long flags;
         unsigned int qlen;
+        int max_backlog;
 
         reason = SKB_DROP_REASON_DEV_READY;
         if (!netif_running(skb->dev))
                 goto bad_dev;
 
+        reason = SKB_DROP_REASON_CPU_BACKLOG;
         sd = &per_cpu(softnet_data, cpu);
+        qlen = skb_queue_len_lockless(&sd->input_pkt_queue);
+        max_backlog = READ_ONCE(net_hotdata.max_backlog);
+        if (unlikely(qlen > max_backlog))
+                goto cpu_backlog_drop;
 
         backlog_lock_irq_save(sd, &flags);
         qlen = skb_queue_len(&sd->input_pkt_queue);
-        if (qlen <= READ_ONCE(net_hotdata.max_backlog) &&
-            !skb_flow_limit(skb, qlen)) {
+        if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) {
                 if (qlen) {
 enqueue:
                         __skb_queue_tail(&sd->input_pkt_queue, skb);
@@ -4826,11 +4831,11 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
                         napi_schedule_rps(sd);
                 goto enqueue;
         }
-        reason = SKB_DROP_REASON_CPU_BACKLOG;
-        sd->dropped++;
         backlog_unlock_irq_restore(sd, &flags);
 
+cpu_backlog_drop:
+        atomic_inc(&sd->dropped);
 bad_dev:
         dev_core_stats_rx_dropped_inc(skb->dev);
         kfree_skb_reason(skb, reason);
         return NET_RX_DROP;
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index a97eceb84e61e..fa6d3969734a6 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -144,7 +144,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
         seq_printf(seq,
                    "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x "
                    "%08x %08x\n",
-                   sd->processed, sd->dropped, sd->time_squeeze, 0,
+                   sd->processed, atomic_read(&sd->dropped),
+                   sd->time_squeeze, 0,
                    0, 0, 0, 0, /* was fastroute */
                    0,  /* was cpu_collision */
                    sd->received_rps, flow_limit_count,
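
Two ideas are combined in this patch: a counter that is only written on the failure path
can be an atomic parked on its own cache line, and an unlocked, possibly stale read of the
queue length is enough to reject a clearly overloaded queue before the lock is ever taken.
An illustrative C11 sketch, assuming a 64-byte cache line and using alignas/atomic_uint
where the kernel uses ____cacheline_aligned_in_smp and atomic_t:

#include <stdalign.h>
#include <stdatomic.h>
#include <stdio.h>

#define CACHE_LINE 64   /* assumed cache-line size */

struct backlog {
        /* hot fields, read on every enqueue */
        unsigned int qlen;
        unsigned int max;

        /* failure-path only: parked on its own line so drops performed by
         * other threads do not invalidate the hot line above
         */
        alignas(CACHE_LINE) atomic_uint dropped;
};

static int enqueue(struct backlog *b)
{
        /* racy, lock-free length check: good enough to reject an obviously
         * overloaded queue early (the kernel re-checks the exact length
         * under the lock before actually queueing)
         */
        if (b->qlen > b->max) {
                atomic_fetch_add_explicit(&b->dropped, 1, memory_order_relaxed);
                return -1;
        }
        /* ... locked path: re-read the length and enqueue ... */
        b->qlen++;
        return 0;
}

int main(void)
{
        struct backlog b = { .max = 2 };

        for (int i = 0; i < 5; i++)
                enqueue(&b);
        printf("dropped %u\n", atomic_load(&b.dropped));
        return 0;
}
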
From f7efd01fe21431a98677e5505e5de46649121ca7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 29 Mar 2024 15:42:22 +0000
Subject: [PATCH 5/8] net: enqueue_to_backlog() cleanup

We can remove a goto and a label by reversing a condition.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 02c98f1152432..0a8ccb0451c30 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4816,20 +4816,18 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
         backlog_lock_irq_save(sd, &flags);
         qlen = skb_queue_len(&sd->input_pkt_queue);
         if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) {
-                if (qlen) {
-enqueue:
-                        __skb_queue_tail(&sd->input_pkt_queue, skb);
-                        input_queue_tail_incr_save(sd, qtail);
-                        backlog_unlock_irq_restore(sd, &flags);
-                        return NET_RX_SUCCESS;
+                if (!qlen) {
+                        /* Schedule NAPI for backlog device. We can use
+                         * non atomic operation as we own the queue lock.
+                         */
+                        if (!__test_and_set_bit(NAPI_STATE_SCHED,
+                                                &sd->backlog.state))
+                                napi_schedule_rps(sd);
                 }
-
-                /* Schedule NAPI for backlog device
-                 * We can use non atomic operation since we own the queue lock
-                 */
-                if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
-                        napi_schedule_rps(sd);
-                goto enqueue;
+                __skb_queue_tail(&sd->input_pkt_queue, skb);
+                input_queue_tail_incr_save(sd, qtail);
+                backlog_unlock_irq_restore(sd, &flags);
+                return NET_RX_SUCCESS;
         }
 
         backlog_unlock_irq_restore(sd, &flags);

From 36b83ffcf209a2e6099dae1070df6a2001dfab27 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 29 Mar 2024 15:42:23 +0000
Subject: [PATCH 6/8] net: rps: change input_queue_tail_incr_save()

input_queue_tail_incr_save() increments the sd queue_tail
and saves it in the flow's last_qtail.

Two issues here:

- no lock protects the write on last_qtail, so we should use
  appropriate annotations.

- we can perform this write after releasing the per-cpu backlog lock,
  to decrease the lock hold duration (moving the cache line miss out
  of the critical section).

Also move input_queue_head_incr() and rps helpers to include/net/rps.h,
while adding an rps_ prefix to better reflect their role.

v2: Fixed a build issue (Jakub and kernel build bots)

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/linux/netdevice.h | 15 ---------------
 include/net/rps.h         | 23 +++++++++++++++++++++++
 net/core/dev.c            | 20 ++++++++++++--------
 3 files changed, 35 insertions(+), 23 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index be9f071a340d2..2c11c295cf64a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3250,21 +3250,6 @@ struct softnet_data {
         call_single_data_t      defer_csd;
 };
 
-static inline void input_queue_head_incr(struct softnet_data *sd)
-{
-#ifdef CONFIG_RPS
-        sd->input_queue_head++;
-#endif
-}
-
-static inline void input_queue_tail_incr_save(struct softnet_data *sd,
-                                              unsigned int *qtail)
-{
-#ifdef CONFIG_RPS
-        *qtail = ++sd->input_queue_tail;
-#endif
-}
-
 DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 
 static inline int dev_recursion_level(void)
diff --git a/include/net/rps.h b/include/net/rps.h
index 7660243e905b9..10ca25731c1ef 100644
--- a/include/net/rps.h
+++ b/include/net/rps.h
@@ -122,4 +122,27 @@ static inline void sock_rps_record_flow(const struct sock *sk)
 #endif
 }
 
+static inline u32 rps_input_queue_tail_incr(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+        return ++sd->input_queue_tail;
+#else
+        return 0;
+#endif
+}
+
+static inline void rps_input_queue_tail_save(u32 *dest, u32 tail)
+{
+#ifdef CONFIG_RPS
+        WRITE_ONCE(*dest, tail);
+#endif
+}
+
+static inline void rps_input_queue_head_incr(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+        sd->input_queue_head++;
+#endif
+}
+
 #endif /* _NET_RPS_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index 0a8ccb0451c30..79073bbc9a644 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4611,7 +4611,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
                 if (unlikely(tcpu != next_cpu) &&
                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
-                      rflow->last_qtail)) >= 0)) {
+                      READ_ONCE(rflow->last_qtail))) >= 0)) {
                         tcpu = next_cpu;
                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
                 }
@@ -4666,7 +4666,7 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
                 cpu = READ_ONCE(rflow->cpu);
                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
-                           rflow->last_qtail) <
+                           READ_ONCE(rflow->last_qtail)) <
                      (int)(10 * flow_table->mask)))
                         expire = false;
         }
@@ -4801,6 +4801,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
         unsigned long flags;
         unsigned int qlen;
         int max_backlog;
+        u32 tail;
 
         reason = SKB_DROP_REASON_DEV_READY;
         if (!netif_running(skb->dev))
@@ -4825,8 +4826,11 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
                                 napi_schedule_rps(sd);
                 }
                 __skb_queue_tail(&sd->input_pkt_queue, skb);
-                input_queue_tail_incr_save(sd, qtail);
+                tail = rps_input_queue_tail_incr(sd);
                 backlog_unlock_irq_restore(sd, &flags);
+
+                /* save the tail outside of the critical section */
+                rps_input_queue_tail_save(qtail, tail);
                 return NET_RX_SUCCESS;
         }
 
@@ -5904,7 +5908,7 @@ static void flush_backlog(struct work_struct *work)
                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
                         __skb_unlink(skb, &sd->input_pkt_queue);
                         dev_kfree_skb_irq(skb);
-                        input_queue_head_incr(sd);
+                        rps_input_queue_head_incr(sd);
                 }
         }
         backlog_unlock_irq_enable(sd);
@@ -5913,7 +5917,7 @@ static void flush_backlog(struct work_struct *work)
                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
                         __skb_unlink(skb, &sd->process_queue);
                         kfree_skb(skb);
-                        input_queue_head_incr(sd);
+                        rps_input_queue_head_incr(sd);
                 }
         }
         local_bh_enable();
@@ -6041,7 +6045,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
                         rcu_read_lock();
                         __netif_receive_skb(skb);
                         rcu_read_unlock();
-                        input_queue_head_incr(sd);
+                        rps_input_queue_head_incr(sd);
                         if (++work >= quota)
                                 return work;
 
@@ -11455,11 +11459,11 @@ static int dev_cpu_dead(unsigned int oldcpu)
         /* Process offline CPU's input_pkt_queue */
         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
                 netif_rx(skb);
-                input_queue_head_incr(oldsd);
+                rps_input_queue_head_incr(oldsd);
         }
         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
                 netif_rx(skb);
-                input_queue_head_incr(oldsd);
+                rps_input_queue_head_incr(oldsd);
         }
 
         return 0;
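
The tail helpers introduced above split one operation in two: the increment still happens
under the per-CPU backlog lock, while the store into the flow's last_qtail is performed
after the lock has been released and is annotated because its readers are lockless. A
user-space sketch of the same idea, with a C11 relaxed atomic store standing in for
WRITE_ONCE() (all names below are invented for illustration):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct queue {
        pthread_mutex_t lock;
        unsigned int tail;      /* only written under the lock */
};

/* Lock-free readers poll this; it only needs a WRITE_ONCE-like store. */
static _Atomic unsigned int last_qtail;

static void enqueue(struct queue *q)
{
        unsigned int tail;

        pthread_mutex_lock(&q->lock);
        /* ... add the element ... */
        tail = ++q->tail;               /* rps_input_queue_tail_incr() analogue */
        pthread_mutex_unlock(&q->lock);

        /* publish outside the critical section, shortening lock hold time */
        atomic_store_explicit(&last_qtail, tail, memory_order_relaxed);
}

int main(void)
{
        struct queue q = { .lock = PTHREAD_MUTEX_INITIALIZER };

        for (int i = 0; i < 3; i++)
                enqueue(&q);
        printf("last_qtail=%u\n", atomic_load(&last_qtail));
        return 0;
}

Moving the publication out of the critical section means any cache miss on the
destination line is paid outside the lock, which is exactly the argument made in the
commit message.
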
From c62fdf5b11ef12b89ac2450c25c12bffc3d924c5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 29 Mar 2024 15:42:24 +0000
Subject: [PATCH 7/8] net: rps: add rps_input_queue_head_add() helper

process_backlog() can batch increments of sd->input_queue_head,
saving some memory bandwidth.

Also add READ_ONCE()/WRITE_ONCE() annotations around
sd->input_queue_head accesses.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/rps.h |  9 +++++++--
 net/core/dev.c    | 13 ++++++++-----
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/include/net/rps.h b/include/net/rps.h
index 10ca25731c1ef..a93401d23d66e 100644
--- a/include/net/rps.h
+++ b/include/net/rps.h
@@ -138,11 +138,16 @@ static inline void rps_input_queue_tail_save(u32 *dest, u32 tail)
 #endif
 }
 
-static inline void rps_input_queue_head_incr(struct softnet_data *sd)
+static inline void rps_input_queue_head_add(struct softnet_data *sd, int val)
 {
 #ifdef CONFIG_RPS
-        sd->input_queue_head++;
+        WRITE_ONCE(sd->input_queue_head, sd->input_queue_head + val);
 #endif
 }
 
+static inline void rps_input_queue_head_incr(struct softnet_data *sd)
+{
+        rps_input_queue_head_add(sd, 1);
+}
+
 #endif /* _NET_RPS_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index 79073bbc9a644..818699dea9d70 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4528,7 +4528,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 out:
 #endif
                 rflow->last_qtail =
-                        per_cpu(softnet_data, next_cpu).input_queue_head;
+                        READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head);
         }
 
         rflow->cpu = next_cpu;
@@ -4610,7 +4610,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
                  */
                 if (unlikely(tcpu != next_cpu) &&
                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
-                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
+                     ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -
                       READ_ONCE(rflow->last_qtail))) >= 0)) {
                         tcpu = next_cpu;
                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
@@ -4665,7 +4665,7 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
                 rflow = &flow_table->flows[flow_id];
                 cpu = READ_ONCE(rflow->cpu);
                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
-                    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
+                    ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) -
                            READ_ONCE(rflow->last_qtail)) <
                      (int)(10 * flow_table->mask)))
                         expire = false;
@@ -6045,9 +6045,10 @@ static int process_backlog(struct napi_struct *napi, int quota)
                         rcu_read_lock();
                         __netif_receive_skb(skb);
                         rcu_read_unlock();
-                        rps_input_queue_head_incr(sd);
-                        if (++work >= quota)
+                        if (++work >= quota) {
+                                rps_input_queue_head_add(sd, work);
                                 return work;
+                        }
 
                 }
 
@@ -6070,6 +6071,8 @@ static int process_backlog(struct napi_struct *napi, int quota)
                 backlog_unlock_irq_enable(sd);
         }
 
+        if (work)
+                rps_input_queue_head_add(sd, work);
         return work;
 }
 
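
The batching helper replaces one shared-counter update per packet with a single update per
polling round; the counter is still read locklessly elsewhere, hence the READ_ONCE() and
WRITE_ONCE() annotations in the diff. A condensed user-space illustration of the batching
idea (process_batch, quota and the plain global are stand-ins, not kernel code):

#include <stdio.h>

/* In the kernel this is sd->input_queue_head, read locklessly by RFS. */
static unsigned int input_queue_head;

/* Count locally while draining a batch, then fold the total back into the
 * shared counter once, instead of touching it for every packet.
 */
static int process_batch(int quota, int backlog)
{
        int work = 0;

        while (backlog-- > 0) {
                /* ... handle one packet ... */
                if (++work >= quota)
                        break;
        }
        if (work)
                input_queue_head += work;       /* one update per batch */
        return work;
}

int main(void)
{
        int done = process_batch(64, 10);

        printf("processed %d, head now %u\n", done, input_queue_head);
        return 0;
}

The saving is purely in memory traffic: the result is identical, but the shared cache line
holding the counter is dirtied once per batch instead of once per packet.
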
From d3ae5f4632c107d3c2eeb97a60fecc6a6f9d6fbe Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 29 Mar 2024 15:42:25 +0000
Subject: [PATCH 8/8] net: rps: move received_rps field to a better location

Commit 14d898f3c1b3 ("dev: Move received_rps counter next to RPS members
in softnet data") was unfortunate:

received_rps is dirtied by a cpu and never read by other cpus in fast path.

Its presence in the hot RPS cache line (shared by many cpus) is hurting
RPS/RFS performance.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2c11c295cf64a..7d12b5a9380f4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3204,6 +3204,7 @@ struct softnet_data {
         struct softnet_data     *rps_ipi_list;
 #endif
 
+        unsigned int            received_rps;
         bool                    in_net_rx_action;
         bool                    in_napi_threaded_poll;
 
@@ -3236,7 +3237,6 @@ struct softnet_data {
         unsigned int            cpu;
         unsigned int            input_queue_tail;
 #endif
-        unsigned int            received_rps;
         struct sk_buff_head     input_pkt_queue;
         struct napi_struct      backlog;
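
This last patch, like the rest of the series, is about placing fields according to who
touches them: a statistic written only by its owning CPU should not share a cache line
with data that remote CPUs read or write in the RPS/RFS fast path. A self-contained
sketch of that layout rule, with invented field names and an assumed 64-byte line size:

#include <stdalign.h>
#include <stddef.h>
#include <stdio.h>

#define CACHE_LINE 64   /* assumed cache-line size */

/* Illustrative layout: fields written only by the owning thread are kept
 * apart from fields that remote threads also touch, so bumping a local
 * statistic never invalidates the shared line on other CPUs.
 */
struct per_cpu_stats {
        /* local-only, updated on every packet by the owning CPU */
        unsigned int processed;
        unsigned int received_rps;

        /* line shared with other CPUs (remote wakeups, flow steering) */
        alignas(CACHE_LINE) unsigned int remote_kicks;
        unsigned int flow_last_seen;
};

int main(void)
{
        printf("local block at offset %zu, shared block at offset %zu\n",
               offsetof(struct per_cpu_stats, processed),
               offsetof(struct per_cpu_stats, remote_kicks));
        return 0;
}

The kernel expresses the same intent with ____cacheline_aligned_in_smp inside
struct softnet_data; the patch simply moves received_rps back into the group of fields
that only the local CPU dirties.
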