From 1add90738cf58def972356ecf402e36397187db7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 20 Sep 2023 20:17:11 +0000
Subject: [PATCH 1/5] net_sched: constify qdisc_priv()

In order to propagate const qualifiers, we change qdisc_priv()
to accept a possibly const argument.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/pkt_sched.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 15960564e0c36..9fa1d0794dfa5 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -20,10 +20,10 @@ struct qdisc_walker {
 	int	(*fn)(struct Qdisc *, unsigned long cl, struct qdisc_walker *);
 };
 
-static inline void *qdisc_priv(struct Qdisc *q)
-{
-	return &q->privdata;
-}
+#define qdisc_priv(q)							\
+	_Generic(q,							\
+		 const struct Qdisc * : (const void *)&q->privdata,	\
+		 struct Qdisc * : (void *)&q->privdata)
 
 static inline struct Qdisc *qdisc_from_priv(void *priv)
 {
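For readers unfamiliar with C11 generic selection, the following stand-alone
sketch shows the same const-propagation trick outside the kernel. The struct
and macro names here are hypothetical stand-ins, not the kernel definitions:

#include <stdio.h>

/* Hypothetical stand-in for struct Qdisc; not the kernel definition. */
struct qd {
	int privdata;
};

/* Same pattern as the patch: a const pointer in yields a const void *,
 * a mutable pointer in yields a plain void *.
 */
#define qd_priv(q)							\
	_Generic(q,							\
		 const struct qd * : (const void *)&(q)->privdata,	\
		 struct qd * : (void *)&(q)->privdata)

int main(void)
{
	struct qd a = { 42 };
	const struct qd *cp = &a;

	int *w = qd_priv(&a);		/* non-const in, non-const out */
	const int *r = qd_priv(cp);	/* const in, const out */
	/* int *bad = qd_priv(cp);	   would now warn: discards const */

	printf("%d %d\n", *w, *r);
	return 0;
}

The cast in each _Generic arm is what lets a single accessor serve both const
and non-const callers, which the old inline function could not do without a
second, const-qualified helper.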
From 54ff8ad69c6e93c0767451ae170b41c000e565dd Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 20 Sep 2023 20:17:12 +0000
Subject: [PATCH 2/5] net_sched: sch_fq: struct sched_data reorg

q->flows is often modified, while q->timer_slack is read-mostly.

Exchange the two fields, so that the cache line containing quantum,
initial_quantum, and other critical parameters stays clean (read-mostly).

Move q->watchdog next to q->stat_throttled.

Add comments explaining how the structure is split into three parts.

pahole output before the patch:

struct fq_sched_data {
	struct fq_flow_head new_flows;              /*     0  0x10 */
	struct fq_flow_head old_flows;              /*  0x10  0x10 */
	struct rb_root      delayed;                /*  0x20   0x8 */
	u64                 time_next_delayed_flow; /*  0x28   0x8 */
	u64                 ktime_cache;            /*  0x30   0x8 */
	unsigned long       unthrottle_latency_ns;  /*  0x38   0x8 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	struct fq_flow      internal __attribute__((__aligned__(64))); /*  0x40  0x80 */

	/* XXX last struct has 16 bytes of padding */

	/* --- cacheline 3 boundary (192 bytes) --- */
	u32                 quantum;                /*  0xc0   0x4 */
	u32                 initial_quantum;        /*  0xc4   0x4 */
	u32                 flow_refill_delay;      /*  0xc8   0x4 */
	u32                 flow_plimit;            /*  0xcc   0x4 */
	unsigned long       flow_max_rate;          /*  0xd0   0x8 */
	u64                 ce_threshold;           /*  0xd8   0x8 */
	u64                 horizon;                /*  0xe0   0x8 */
	u32                 orphan_mask;            /*  0xe8   0x4 */
	u32                 low_rate_threshold;     /*  0xec   0x4 */
	struct rb_root *    fq_root;                /*  0xf0   0x8 */
	u8                  rate_enable;            /*  0xf8   0x1 */
	u8                  fq_trees_log;           /*  0xf9   0x1 */
	u8                  horizon_drop;           /*  0xfa   0x1 */

	/* XXX 1 byte hole, try to pack */

	u32                 flows;                  /*  0xfc   0x4 */
	/* --- cacheline 4 boundary (256 bytes) --- */
	u32                 inactive_flows;         /* 0x100   0x4 */
	u32                 throttled_flows;        /* 0x104   0x4 */
	u64                 stat_gc_flows;          /* 0x108   0x8 */
	u64                 stat_internal_packets;  /* 0x110   0x8 */
	u64                 stat_throttled;         /* 0x118   0x8 */
	u64                 stat_ce_mark;           /* 0x120   0x8 */
	u64                 stat_horizon_drops;     /* 0x128   0x8 */
	u64                 stat_horizon_caps;      /* 0x130   0x8 */
	u64                 stat_flows_plimit;      /* 0x138   0x8 */
	/* --- cacheline 5 boundary (320 bytes) --- */
	u64                 stat_pkts_too_long;     /* 0x140   0x8 */
	u64                 stat_allocation_errors; /* 0x148   0x8 */
	u32                 timer_slack;            /* 0x150   0x4 */

	/* XXX 4 bytes hole, try to pack */

	struct qdisc_watchdog watchdog;             /* 0x158  0x48 */

	/* size: 448, cachelines: 7, members: 34 */
	/* sum members: 411, holes: 2, sum holes: 5 */
	/* padding: 32 */
	/* paddings: 1, sum paddings: 16 */
	/* forced alignments: 1 */
};

pahole output after the patch:

struct fq_sched_data {
	struct fq_flow_head new_flows;              /*     0  0x10 */
	struct fq_flow_head old_flows;              /*  0x10  0x10 */
	struct rb_root      delayed;                /*  0x20   0x8 */
	u64                 time_next_delayed_flow; /*  0x28   0x8 */
	u64                 ktime_cache;            /*  0x30   0x8 */
	unsigned long       unthrottle_latency_ns;  /*  0x38   0x8 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	struct fq_flow      internal __attribute__((__aligned__(64))); /*  0x40  0x80 */

	/* XXX last struct has 16 bytes of padding */

	/* --- cacheline 3 boundary (192 bytes) --- */
	u32                 quantum;                /*  0xc0   0x4 */
	u32                 initial_quantum;        /*  0xc4   0x4 */
	u32                 flow_refill_delay;      /*  0xc8   0x4 */
	u32                 flow_plimit;            /*  0xcc   0x4 */
	unsigned long       flow_max_rate;          /*  0xd0   0x8 */
	u64                 ce_threshold;           /*  0xd8   0x8 */
	u64                 horizon;                /*  0xe0   0x8 */
	u32                 orphan_mask;            /*  0xe8   0x4 */
	u32                 low_rate_threshold;     /*  0xec   0x4 */
	struct rb_root *    fq_root;                /*  0xf0   0x8 */
	u8                  rate_enable;            /*  0xf8   0x1 */
	u8                  fq_trees_log;           /*  0xf9   0x1 */
	u8                  horizon_drop;           /*  0xfa   0x1 */

	/* XXX 1 byte hole, try to pack */

	u32                 timer_slack;            /*  0xfc   0x4 */
	/* --- cacheline 4 boundary (256 bytes) --- */
	u32                 flows;                  /* 0x100   0x4 */
	u32                 inactive_flows;         /* 0x104   0x4 */
	u32                 throttled_flows;        /* 0x108   0x4 */

	/* XXX 4 bytes hole, try to pack */

	u64                 stat_throttled;         /* 0x110   0x8 */
	struct qdisc_watchdog watchdog;             /* 0x118  0x48 */
	/* --- cacheline 5 boundary (320 bytes) was 32 bytes ago --- */
	u64                 stat_gc_flows;          /* 0x160   0x8 */
	u64                 stat_internal_packets;  /* 0x168   0x8 */
	u64                 stat_ce_mark;           /* 0x170   0x8 */
	u64                 stat_horizon_drops;     /* 0x178   0x8 */
	/* --- cacheline 6 boundary (384 bytes) --- */
	u64                 stat_horizon_caps;      /* 0x180   0x8 */
	u64                 stat_flows_plimit;      /* 0x188   0x8 */
	u64                 stat_pkts_too_long;     /* 0x190   0x8 */
	u64                 stat_allocation_errors; /* 0x198   0x8 */

	/* Force padding: */
	u64                 :64;
	u64                 :64;
	u64                 :64;
	u64                 :64;

	/* size: 448, cachelines: 7, members: 34 */
	/* sum members: 411, holes: 2, sum holes: 5 */
	/* padding: 32 */
	/* paddings: 1, sum paddings: 16 */
	/* forced alignments: 1 */
};

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/sched/sch_fq.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index f59a2cb2c803d..230300aac3ed1 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -104,6 +104,9 @@ struct fq_sched_data {
 	unsigned long	unthrottle_latency_ns;
 
 	struct fq_flow	internal;	/* for non classified or high prio packets */
+
+/* Read mostly cache line */
+
 	u32		quantum;
 	u32		initial_quantum;
 	u32		flow_refill_delay;
@@ -117,22 +120,27 @@ struct fq_sched_data {
 	u8		rate_enable;
 	u8		fq_trees_log;
 	u8		horizon_drop;
+	u32		timer_slack; /* hrtimer slack in ns */
+
+/* Read/Write fields. */
+
 	u32		flows;
 	u32		inactive_flows;
 	u32		throttled_flows;
+	u64		stat_throttled;
+	struct qdisc_watchdog watchdog;
 	u64		stat_gc_flows;
+
+/* Seldom used fields. */
+
 	u64		stat_internal_packets;
-	u64		stat_throttled;
 	u64		stat_ce_mark;
 	u64		stat_horizon_drops;
 	u64		stat_horizon_caps;
 	u64		stat_flows_plimit;
 	u64		stat_pkts_too_long;
 	u64		stat_allocation_errors;
-
-	u32		timer_slack; /* hrtimer slack in ns */
-	struct qdisc_watchdog watchdog;
 };
 
 /*
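One way to lock in this kind of layout work, without rerunning pahole by hand,
is a compile-time check on member offsets. Below is a minimal userland sketch
under the assumption of 64-byte cache lines; the struct is a toy stand-in, and
kernel code would express the same idea with offsetof() plus BUILD_BUG_ON():

#include <stddef.h>
#include <assert.h>

#define CACHELINE 64	/* assumed cache-line size, e.g. x86-64 */

/* Toy layout mimicking the read-mostly vs read/write split. */
struct toy_sched_data {
	/* Read mostly fields. */
	unsigned int quantum;
	unsigned int initial_quantum;
	unsigned int timer_slack;

	/* Read/Write fields, forced onto the next cache line. */
	_Alignas(CACHELINE) unsigned int flows;
	unsigned int inactive_flows;
	unsigned int throttled_flows;
};

/* Fail the build if a hot read/write field shares a line with
 * the read-mostly parameters.
 */
static_assert(offsetof(struct toy_sched_data, flows) % CACHELINE == 0,
	      "flows must start a new cache line");
static_assert(offsetof(struct toy_sched_data, quantum) / CACHELINE !=
	      offsetof(struct toy_sched_data, flows) / CACHELINE,
	      "read-mostly and read/write fields must not share a line");

int main(void) { return 0; }

The patch itself achieves the separation by field ordering alone (plus the
trailing force-padding bitfields), rather than by alignment attributes, so the
struct size stays at 448 bytes.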
From ee9af4e14d166c34ec78896bb1ba545249445df0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 20 Sep 2023 20:17:13 +0000
Subject: [PATCH 3/5] net_sched: sch_fq: change how @inactive is tracked

Currently, when an fq qdisc has no more packets to send, it can still
have some flows stored in its RR lists (q->new_flows & q->old_flows).

This was a design choice, but it is a bit disturbing that the
inactive_flows counter does not include the count of empty flows in
the RR lists.

As the next patch needs to know precisely whether there are active
flows, this change makes inactive_flows exact.

Before the patch, the following command on an empty qdisc could have
returned:

lpaa17:~# tc -s -d qd sh dev eth1 | grep inactive
  flows 1322 (inactive 1316 throttled 0)
  flows 1330 (inactive 1325 throttled 0)
  flows 1193 (inactive 1190 throttled 0)
  flows 1208 (inactive 1202 throttled 0)

After the patch, we now have:

lpaa17:~# tc -s -d qd sh dev eth1 | grep inactive
  flows 1322 (inactive 1322 throttled 0)
  flows 1330 (inactive 1330 throttled 0)
  flows 1193 (inactive 1193 throttled 0)
  flows 1208 (inactive 1208 throttled 0)

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/sched/sch_fq.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 230300aac3ed1..4af43a401dbb4 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -125,7 +125,7 @@ struct fq_sched_data {
 /* Read/Write fields. */
 
 	u32		flows;
-	u32		inactive_flows;
+	u32		inactive_flows; /* Flows with no packet to send. */
 	u32		throttled_flows;
 	u64		stat_throttled;
 	struct qdisc_watchdog watchdog;
@@ -402,9 +402,12 @@ static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
 static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
 			   struct sk_buff *skb)
 {
+	struct fq_sched_data *q = qdisc_priv(sch);
+
 	fq_erase_head(sch, flow, skb);
 	skb_mark_not_on_list(skb);
-	flow->qlen--;
+	if (--flow->qlen == 0)
+		q->inactive_flows++;
 	qdisc_qstats_backlog_dec(sch, skb);
 	sch->q.qlen--;
 }
@@ -484,13 +487,13 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		return qdisc_drop(skb, sch, to_free);
 	}
 
-	f->qlen++;
+	if (f->qlen++ == 0)
+		q->inactive_flows--;
 	qdisc_qstats_backlog_inc(sch, skb);
 	if (fq_flow_is_detached(f)) {
 		fq_flow_add_tail(&q->new_flows, f);
 		if (time_after(jiffies, f->age + q->flow_refill_delay))
 			f->credit = max_t(u32, f->credit, q->quantum);
-		q->inactive_flows--;
 	}
 
 	/* Note: this overwrites f->age */
@@ -597,7 +600,6 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 			fq_flow_add_tail(&q->old_flows, f);
 		} else {
 			fq_flow_set_detached(f);
-			q->inactive_flows++;
 		}
 		goto begin;
 	}
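The invariant behind this patch is that inactive_flows now moves only on
0 <-> 1 qlen transitions, regardless of whether the flow sits in an RR list.
A stand-alone sketch of that counting pattern (toy types, not the kernel
structures):

#include <stdio.h>

struct flow {
	int qlen;			/* packets queued in this flow */
};

struct sched {
	unsigned int flows;		/* all known flows */
	unsigned int inactive_flows;	/* flows with qlen == 0 */
};

/* The counter moves only on the 0 <-> 1 qlen transitions, so it stays
 * exact whether or not the flow is attached to an RR list.
 */
static void enqueue(struct sched *q, struct flow *f)
{
	if (f->qlen++ == 0)	/* 0 -> 1: flow becomes active */
		q->inactive_flows--;
}

static void dequeue(struct sched *q, struct flow *f)
{
	if (--f->qlen == 0)	/* 1 -> 0: flow becomes inactive */
		q->inactive_flows++;
}

int main(void)
{
	struct sched q = { .flows = 1, .inactive_flows = 1 };
	struct flow f = { 0 };

	enqueue(&q, &f);
	printf("inactive=%u\n", q.inactive_flows);	/* 0 */
	dequeue(&q, &f);
	printf("inactive=%u\n", q.inactive_flows);	/* 1 */
	return 0;
}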
From 076433bd78d719b34d465c1e69eef512036b534c Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 20 Sep 2023 20:17:14 +0000
Subject: [PATCH 4/5] net_sched: sch_fq: add fast path for mostly idle qdisc

TCQ_F_CAN_BYPASS can be used by only a few qdiscs. The idea is that if
we queue a packet to an empty qdisc, the following dequeue() will pick
it immediately.

FQ can not use the generic TCQ_F_CAN_BYPASS code, because some
additional checks need to be performed.

This patch adds a similar fast path to FQ. Most of the time, the qdisc
is not throttled, and many packets can avoid bringing/touching at
least four cache lines, and consuming 128 bytes of memory to store the
state of a flow.

After this patch, netperf can send UDP packets about 13% faster, and
pktgen goes 30% faster (when FQ is in the way), on a fast NIC.

TCP traffic is also improved, thanks to a reduction of cache line
misses. I have measured a 5% throughput increase on a tcp_rr
intensive workload.

tc -s -d qd sh dev eth1
...
qdisc fq 8004: parent 1:2 limit 10000p flow_limit 100p buckets 1024
   orphan_mask 1023 quantum 3028b initial_quantum 15140b
   low_rate_threshold 550Kbit refill_delay 40ms timer_slack 10us
   horizon 10s horizon_drop
 Sent 5646784384 bytes 1985161 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
  flows 122 (inactive 122 throttled 0)
  gc 0 highprio 0 fastpath 659990 throttled 27762 latency 8.57us

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/uapi/linux/pkt_sched.h |   1 +
 net/sched/sch_fq.c             | 128 +++++++++++++++++++++++----------
 2 files changed, 92 insertions(+), 37 deletions(-)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 3f85ae5780563..579f641846b87 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -962,6 +962,7 @@ struct tc_fq_qd_stats {
 	__u64	ce_mark;		/* packets above ce_threshold */
 	__u64	horizon_drops;
 	__u64	horizon_caps;
+	__u64	fastpath_packets;
 };
 
 /* Heavy-Hitter Filter */
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 4af43a401dbb4..5cf3b50a24d58 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -2,7 +2,7 @@
 /*
  * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
  *
- *  Copyright (C) 2013-2015 Eric Dumazet
+ *  Copyright (C) 2013-2023 Eric Dumazet
  *
  * Meant to be mostly used for locally generated traffic :
  * Fast classification depends on skb->sk being set before reaching us.
@@ -73,7 +73,13 @@ struct fq_flow {
 		struct sk_buff *tail;	/* last skb in the list */
 		unsigned long  age;	/* (jiffies | 1UL) when flow was emptied, for gc */
 	};
-	struct rb_node	fq_node;	/* anchor in fq_root[] trees */
+	union {
+		struct rb_node	fq_node;	/* anchor in fq_root[] trees */
+		/* Following field is only used for q->internal,
+		 * because q->internal is not hashed in fq_root[]
+		 */
+		u64		stat_fastpath_packets;
+	};
 	struct sock	*sk;
 	u32		socket_hash;	/* sk_hash */
 	int		qlen;		/* number of packets in flow queue */
@@ -134,7 +140,7 @@ struct fq_sched_data {
 
 /* Seldom used fields. */
 
-	u64		stat_internal_packets;
+	u64		stat_internal_packets; /* aka highprio */
 	u64		stat_ce_mark;
 	u64		stat_horizon_drops;
 	u64		stat_horizon_caps;
@@ -266,17 +272,64 @@ static void fq_gc(struct fq_sched_data *q,
 	kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
 }
 
-static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
+/* Fast path can be used if :
+ * 1) Packet tstamp is in the past.
+ * 2) FQ qlen == 0 OR
+ *   (no flow is currently eligible for transmit,
+ *    AND fast path queue has less than 8 packets)
+ * 3) No SO_MAX_PACING_RATE on the socket (if any).
+ * 4) No @maxrate attribute on this qdisc,
+ *
+ * FQ can not use generic TCQ_F_CAN_BYPASS infrastructure.
+ */
+static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb)
+{
+	const struct fq_sched_data *q = qdisc_priv(sch);
+	const struct sock *sk;
+
+	if (fq_skb_cb(skb)->time_to_send > q->ktime_cache)
+		return false;
+
+	if (sch->q.qlen != 0) {
+		/* Even if some packets are stored in this qdisc,
+		 * we can still enable fast path if all of them are
+		 * scheduled in the future (ie no flows are eligible)
+		 * or in the fast path queue.
+		 */
+		if (q->flows != q->inactive_flows + q->throttled_flows)
+			return false;
+
+		/* Do not allow fast path queue to explode, we want Fair Queue mode
+		 * under pressure.
+		 */
+		if (q->internal.qlen >= 8)
+			return false;
+	}
+
+	sk = skb->sk;
+	if (sk && sk_fullsock(sk) && !sk_is_tcp(sk) &&
+	    sk->sk_max_pacing_rate != ~0UL)
+		return false;
+
+	if (q->flow_max_rate != ~0UL)
+		return false;
+
+	return true;
+}
+
+static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb)
 {
+	struct fq_sched_data *q = qdisc_priv(sch);
 	struct rb_node **p, *parent;
 	struct sock *sk = skb->sk;
 	struct rb_root *root;
 	struct fq_flow *f;
 
 	/* warning: no starvation prevention... */
-	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
+	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) {
+		q->stat_internal_packets++; /* highprio packet */
 		return &q->internal;
-
+	}
 	/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
 	 * or a listener (SYNCOOKIE mode)
 	 * 1) request sockets are not full blown,
@@ -307,6 +360,11 @@ static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb)
 		sk = (struct sock *)((hash << 1) | 1UL);
 	}
 
+	if (fq_fastpath_check(sch, skb)) {
+		q->internal.stat_fastpath_packets++;
+		return &q->internal;
+	}
+
 	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
 
 	if (q->flows >= (2U << q->fq_trees_log) &&
@@ -402,12 +460,8 @@ static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
 static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
 			   struct sk_buff *skb)
 {
-	struct fq_sched_data *q = qdisc_priv(sch);
-
 	fq_erase_head(sch, flow, skb);
 	skb_mark_not_on_list(skb);
-	if (--flow->qlen == 0)
-		q->inactive_flows++;
 	qdisc_qstats_backlog_dec(sch, skb);
 	sch->q.qlen--;
 }
@@ -459,49 +513,45 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	if (unlikely(sch->q.qlen >= sch->limit))
 		return qdisc_drop(skb, sch, to_free);
 
+	q->ktime_cache = ktime_get_ns();
 	if (!skb->tstamp) {
-		fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns();
+		fq_skb_cb(skb)->time_to_send = q->ktime_cache;
 	} else {
-		/* Check if packet timestamp is too far in the future.
-		 * Try first if our cached value, to avoid ktime_get_ns()
-		 * cost in most cases.
-		 */
+		/* Check if packet timestamp is too far in the future. */
 		if (fq_packet_beyond_horizon(skb, q)) {
-			/* Refresh our cache and check another time */
-			q->ktime_cache = ktime_get_ns();
-			if (fq_packet_beyond_horizon(skb, q)) {
-				if (q->horizon_drop) {
+			if (q->horizon_drop) {
 					q->stat_horizon_drops++;
 					return qdisc_drop(skb, sch, to_free);
-				}
-				q->stat_horizon_caps++;
-				skb->tstamp = q->ktime_cache + q->horizon;
 			}
+			q->stat_horizon_caps++;
+			skb->tstamp = q->ktime_cache + q->horizon;
 		}
 		fq_skb_cb(skb)->time_to_send = skb->tstamp;
 	}
 
-	f = fq_classify(skb, q);
-	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
-		q->stat_flows_plimit++;
-		return qdisc_drop(skb, sch, to_free);
-	}
+	f = fq_classify(sch, skb);
 
-	if (f->qlen++ == 0)
-		q->inactive_flows--;
-	qdisc_qstats_backlog_inc(sch, skb);
-	if (fq_flow_is_detached(f)) {
-		fq_flow_add_tail(&q->new_flows, f);
-		if (time_after(jiffies, f->age + q->flow_refill_delay))
-			f->credit = max_t(u32, f->credit, q->quantum);
+	if (f != &q->internal) {
+		if (unlikely(f->qlen >= q->flow_plimit)) {
+			q->stat_flows_plimit++;
+			return qdisc_drop(skb, sch, to_free);
+		}
+
+		if (fq_flow_is_detached(f)) {
+			fq_flow_add_tail(&q->new_flows, f);
+			if (time_after(jiffies, f->age + q->flow_refill_delay))
+				f->credit = max_t(u32, f->credit, q->quantum);
+		}
+
+		if (f->qlen == 0)
+			q->inactive_flows--;
 	}
+	f->qlen++;
 
 	/* Note: this overwrites f->age */
 	flow_queue_add(f, skb);
 
-	if (unlikely(f == &q->internal)) {
-		q->stat_internal_packets++;
-	}
+	qdisc_qstats_backlog_inc(sch, skb);
 	sch->q.qlen++;
 
 	return NET_XMIT_SUCCESS;
@@ -549,6 +599,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 
 	skb = fq_peek(&q->internal);
 	if (unlikely(skb)) {
+		q->internal.qlen--;
 		fq_dequeue_skb(sch, &q->internal, skb);
 		goto out;
 	}
@@ -592,6 +643,8 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 				INET_ECN_set_ce(skb);
 			q->stat_ce_mark++;
 		}
+		if (--f->qlen == 0)
+			q->inactive_flows++;
 		fq_dequeue_skb(sch, f, skb);
 	} else {
 		head->first = f->next;
@@ -1024,6 +1077,7 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 
 	st.gc_flows		= q->stat_gc_flows;
 	st.highprio_packets	= q->stat_internal_packets;
+	st.fastpath_packets	= q->internal.stat_fastpath_packets;
 	st.tcp_retrans		= 0;
 	st.throttled		= q->stat_throttled;
 	st.flows_plimit		= q->stat_flows_plimit;
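To make the four fast-path conditions easier to experiment with, here is a
stand-alone sketch of the same decision over plain counters. The types and
field names are toy stand-ins, the socket check is simplified (it drops the
fullsock/TCP tests), and ~0UL plays its kernel role of "no rate limit
configured":

#include <stdbool.h>
#include <stdio.h>

/* Toy state mirroring the fields fq_fastpath_check() reads. */
struct toy_fq {
	unsigned long now;		/* cached ktime, ns */
	unsigned int qlen;		/* total packets in qdisc */
	unsigned int flows, inactive_flows, throttled_flows;
	unsigned int internal_qlen;	/* fast path queue length */
	unsigned long flow_max_rate;	/* ~0UL means "no maxrate" */
};

static bool fastpath_ok(const struct toy_fq *q, unsigned long time_to_send,
			unsigned long sk_max_pacing_rate)
{
	if (time_to_send > q->now)	/* 1) would need a pacing delay */
		return false;

	if (q->qlen != 0) {
		/* 2) no flow may be eligible to transmit now... */
		if (q->flows != q->inactive_flows + q->throttled_flows)
			return false;
		/* ...and the fast path queue must stay small */
		if (q->internal_qlen >= 8)
			return false;
	}

	if (sk_max_pacing_rate != ~0UL)	/* 3) SO_MAX_PACING_RATE set */
		return false;
	if (q->flow_max_rate != ~0UL)	/* 4) qdisc maxrate set */
		return false;

	return true;
}

int main(void)
{
	struct toy_fq q = { .now = 1000, .qlen = 0, .flow_max_rate = ~0UL };

	printf("%d\n", fastpath_ok(&q, 500, ~0UL));	/* 1: eligible */
	printf("%d\n", fastpath_ok(&q, 2000, ~0UL));	/* 0: future tstamp */
	return 0;
}

Note how condition 2 is where the exact inactive_flows count from the previous
patch pays off: "every flow is inactive or throttled" is precisely "no flow is
eligible", so an FQ under load can still use the fast path between pacing
releases.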
From 8f6c4ff9e0522da9313fbff5295ae208af679fed Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 20 Sep 2023 20:17:15 +0000
Subject: [PATCH 5/5] net_sched: sch_fq: always garbage collect

FQ performs garbage collection at enqueue time, and only if the number
of flows is above a given threshold, which is hit only after the qdisc
has been used a bit.

Since an RB-tree traversal is needed to locate a flow anyway, it makes
sense to perform gc all the time, to keep the rb-trees smaller.

This reduces average storage costs in FQ by 50%, and avoids one cache
line miss at enqueue time when the fast path added in the prior patch
can not be used.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/sched/sch_fq.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 5cf3b50a24d58..681bbf34b7076 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -367,9 +367,7 @@ static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb)
 
 	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
 
-	if (q->flows >= (2U << q->fq_trees_log) &&
-	    q->inactive_flows > q->flows/2)
-		fq_gc(q, root, sk);
+	fq_gc(q, root, sk);
 
 	p = &root->rb_node;
 	parent = NULL;
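The idea of this last patch, gc piggybacked on a lookup that must walk the
structure anyway, can be sketched stand-alone. A sorted singly-linked list
stands in for the rb-tree, and a stale flag stands in for fq_gc_candidate();
none of these names are from the kernel:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_flow {
	unsigned long key;	/* stands in for the socket hash */
	bool stale;		/* stands in for fq_gc_candidate() */
	struct toy_flow *next;
};

/* Look up @key; reap every stale flow met on the search path.
 * Doing this on every lookup keeps the structure small, instead of
 * waiting for a "too many flows" threshold as the old code did.
 */
static struct toy_flow *lookup_and_gc(struct toy_flow **head, unsigned long key)
{
	struct toy_flow **pp = head, *f;

	while ((f = *pp) != NULL && f->key <= key) {
		if (f->stale) {		/* reap in passing */
			*pp = f->next;
			free(f);
			continue;
		}
		if (f->key == key)
			return f;
		pp = &f->next;
	}
	return NULL;	/* caller would allocate a new flow here */
}

int main(void)
{
	struct toy_flow *a = calloc(1, sizeof(*a));
	struct toy_flow *b = calloc(1, sizeof(*b));
	struct toy_flow *head;

	a->key = 1; a->stale = true; a->next = b;	/* stale(1) -> live(2) */
	b->key = 2;
	head = a;

	printf("%s\n", lookup_and_gc(&head, 2) ? "found" : "miss");
	/* prints "found"; the stale node 1 was reaped along the way */
	free(b);
	return 0;
}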