From dde0a648fc00e2156a3358600c5fbfb3f53256ac Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sat, 2 May 2020 19:54:18 -0700
Subject: [PATCH 1/5] net_sched: sch_fq: avoid touching f->next from fq_gc()

A significant number of cpu cycles is spent in fq_gc().

When fq_gc() does its lookup in the rb-tree, it needs the following
fields from struct fq_flow:

  f->sk      (lookup key in the rb-tree)
  f->fq_node (anchor in the rb-tree)
  f->next    (used to determine if the flow is detached)
  f->age     (used to determine if the flow is a candidate for gc)

This unfortunately spans two cache lines (assuming 64-byte cache lines).

We can avoid using f->next if we use the low order bit of f->{age|tail}.

This low order bit is 0 if f->tail points to an sk_buff.
We set the low order bit to 1 if the union contains a jiffies value.

Combined with the following patch, this makes sure we only need to
bring one cache line per flow into cpu caches.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/sched/sch_fq.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 4c060134c7362..bc9ca1ba507b2 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -70,14 +70,14 @@ struct fq_flow {
 	struct sk_buff *head;		/* list of skbs for this flow : first skb */
 	union {
 		struct sk_buff *tail;	/* last skb in the list */
-		unsigned long  age;	/* jiffies when flow was emptied, for gc */
+		unsigned long  age;	/* (jiffies | 1UL) when flow was emptied, for gc */
 	};
 	struct rb_node fq_node;		/* anchor in fq_root[] trees */
 	struct sock *sk;
 	int qlen;			/* number of packets in flow queue */
 	int credit;
 	u32 socket_hash;		/* sk_hash */
-	struct fq_flow *next;		/* next pointer in RR lists, or &detached */
+	struct fq_flow *next;		/* next pointer in RR lists */
 
 	struct rb_node rate_node;	/* anchor in q->delayed tree */
 	u64 time_next_packet;
@@ -126,20 +126,25 @@ struct fq_sched_data {
 	struct qdisc_watchdog watchdog;
 };
 
-/* special value to mark a detached flow (not on old/new list) */
-static struct fq_flow detached, throttled;
-
+/*
+ * f->tail and f->age share the same location.
+ * We can use the low order bit to differentiate if this location points
+ * to a sk_buff or contains a jiffies value, if we force this value to be odd.
+ * This assumes f->tail low order bit must be 0 since alignof(struct sk_buff) >= 2
+ */
 static void fq_flow_set_detached(struct fq_flow *f)
 {
-	f->next = &detached;
-	f->age = jiffies;
+	f->age = jiffies | 1UL;
 }
 
 static bool fq_flow_is_detached(const struct fq_flow *f)
 {
-	return f->next == &detached;
+	return !!(f->age & 1UL);
 }
 
+/* special value to mark a throttled flow (not on old/new list) */
+static struct fq_flow throttled;
+
 static bool fq_flow_is_throttled(const struct fq_flow *f)
 {
 	return f->next == &throttled;
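
The low-order-bit trick above is self-contained enough to demonstrate
outside the kernel. Below is a minimal userspace sketch, not sch_fq code:
the demo_* names are invented, malloc() stands in for the flow kmem_cache,
and it relies on the same invariant the patch documents, namely that valid
pointers are at least 2-byte aligned, so their low bit is always 0 and can
carry a flag.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_flow {
	union {
		void *tail;		/* valid pointer => low bit is 0 */
		unsigned long age;	/* (timestamp | 1UL) => low bit is 1 */
	};
};

static void demo_set_detached(struct demo_flow *f, unsigned long now)
{
	f->age = now | 1UL;		/* force the low bit to 1 */
}

static bool demo_is_detached(const struct demo_flow *f)
{
	return f->age & 1UL;		/* pointer => 0, tagged timestamp => 1 */
}

int main(void)
{
	struct demo_flow f;

	f.tail = malloc(16);		/* attached: union holds a pointer */
	printf("attached:  is_detached=%d\n", demo_is_detached(&f));
	free(f.tail);

	demo_set_detached(&f, 12345);	/* detached: union holds a tagged time */
	printf("detached:  is_detached=%d\n", demo_is_detached(&f));
	return 0;
}

This is exactly why the patch writes f->age = jiffies | 1UL rather than
f->age = jiffies: the flag lives in a bit that a real f->tail pointer can
never set.
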
From 7ba0537c2b534149be288f851900b4cf5aacde48 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sat, 2 May 2020 19:54:19 -0700
Subject: [PATCH 2/5] net_sched: sch_fq: change fq_flow size/layout

sizeof(struct fq_flow) is 112 bytes on 64bit arches.

This means that half of them use two cache lines, while the other
half use three cache lines.

This patch adds cache line alignment, and makes sure that only the
first cache line is touched by fq_enqueue(), which is more expensive
than fq_dequeue() in general.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/sched/sch_fq.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index bc9ca1ba507b2..ced1f987d7e4d 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -66,6 +66,7 @@ static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
  * in linear list (head,tail), otherwise are placed in a rbtree (t_root).
  */
 struct fq_flow {
+/* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */
 	struct rb_root t_root;
 	struct sk_buff *head;		/* list of skbs for this flow : first skb */
 	union {
@@ -74,14 +75,18 @@ struct fq_flow {
 	};
 	struct rb_node fq_node;		/* anchor in fq_root[] trees */
 	struct sock *sk;
+	u32 socket_hash;		/* sk_hash */
 	int qlen;			/* number of packets in flow queue */
+
+/* Second cache line, used in fq_dequeue() */
 	int credit;
-	u32 socket_hash;		/* sk_hash */
+	/* 32bit hole on 64bit arches */
+
 	struct fq_flow *next;		/* next pointer in RR lists */
 
 	struct rb_node rate_node;	/* anchor in q->delayed tree */
 	u64 time_next_packet;
-};
+} ____cacheline_aligned_in_smp;
 
 struct fq_flow_head {
 	struct fq_flow *first;

From 82a0aa53b520edf50e08bad347d87d898de414eb Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sat, 2 May 2020 19:54:20 -0700
Subject: [PATCH 3/5] net_sched: sch_fq: use bulk freeing in fq_gc()

fq_gc() already builds a small array of pointers, so using
kmem_cache_free_bulk() needs very little change.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/sched/sch_fq.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index ced1f987d7e4d..53ec47ff84693 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -214,9 +214,10 @@ static void fq_gc(struct fq_sched_data *q,
 		  struct rb_root *root,
 		  struct sock *sk)
 {
-	struct fq_flow *f, *tofree[FQ_GC_MAX];
 	struct rb_node **p, *parent;
-	int fcnt = 0;
+	void *tofree[FQ_GC_MAX];
+	struct fq_flow *f;
+	int i, fcnt = 0;
 
 	p = &root->rb_node;
 	parent = NULL;
@@ -239,15 +240,18 @@ static void fq_gc(struct fq_sched_data *q,
 			p = &parent->rb_left;
 	}
 
+	if (!fcnt)
+		return;
+
+	for (i = fcnt; i > 0; ) {
+		f = tofree[--i];
+		rb_erase(&f->fq_node, root);
+	}
 	q->flows -= fcnt;
 	q->inactive_flows -= fcnt;
 	q->stat_gc_flows += fcnt;
-	while (fcnt) {
-		struct fq_flow *f = tofree[--fcnt];
 
-		rb_erase(&f->fq_node, root);
-		kmem_cache_free(fq_flow_cachep, f);
-	}
+	kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
 }
 
 static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
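
The size/layout reasoning in the fq_flow patch above can be checked with a
plain C program. The struct below is a stand-in, not the kernel's struct
fq_flow: rb_root, rb_node and sock are replaced by same-size placeholders
(one pointer, 24 bytes, one pointer on 64bit), and _Alignas(64) plays the
role of ____cacheline_aligned_in_smp. On a typical LP64 toolchain it
reproduces the numbers from the changelog: the hot fields end exactly at
byte 64, a 4-byte hole sits before next, and sizeof() is padded to 128 so
no object ever straddles a third cache line.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct demo_flow {
	/* first cache line: fields used by fq_gc()/fq_enqueue()/fq_dequeue() */
	_Alignas(64) void *t_root;	/* struct rb_root is one pointer */
	void *head;
	void *tail;			/* stands in for the tail/age union */
	uint8_t fq_node[24];		/* struct rb_node: three unsigned longs */
	void *sk;
	uint32_t socket_hash;
	int qlen;

	/* second cache line: fields used only by fq_dequeue() */
	int credit;
	/* 32bit hole on 64bit arches, as noted in the patch */
	void *next;
	uint8_t rate_node[24];
	uint64_t time_next_packet;
};

int main(void)
{
	printf("first-line fields end at %zu\n",
	       offsetof(struct demo_flow, qlen) + sizeof(int));	/* 64 */
	printf("credit offset: %zu\n",
	       offsetof(struct demo_flow, credit));		/* 64 */
	printf("next offset:   %zu\n",
	       offsetof(struct demo_flow, next));		/* 72: 4-byte hole */
	printf("sizeof:        %zu\n",
	       sizeof(struct demo_flow));			/* 128 */
	return 0;
}

Without the alignment, 112-byte objects packed back to back alternate
between spanning two and three 64-byte lines, which is the "half of them"
behavior the changelog describes.
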
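
The fq_gc() change above is mostly a restructuring so that the freeing
loop operates on a plain void * array. kmem_cache_free_bulk() is a
kernel-internal slab API, so the sketch below shows only the shape in
userspace: free_bulk() and gc_pass() are invented stand-ins, free() takes
the place of the slab calls, and the rb-tree walk and age test are elided.

#include <stdlib.h>

#define GC_MAX 8

/* Stand-in for kmem_cache_free_bulk(cachep, nr, p): a single call that
 * releases a whole array of objects. The real API batches slab work;
 * plain free() obviously cannot, this only mirrors the calling pattern. */
static void free_bulk(size_t nr, void **p)
{
	while (nr)
		free(p[--nr]);
}

/* Sketch of the restructured fq_gc(): gather victims into a small array
 * during the walk, unlink them in one pass, then release them with one
 * bulk call instead of one kmem_cache_free() per object. */
static void gc_pass(void *candidates[], size_t n)
{
	void *tofree[GC_MAX];
	size_t i, fcnt = 0;

	for (i = 0; i < n && fcnt < GC_MAX; i++) {
		if (candidates[i])		/* stands in for the gc-age */
			tofree[fcnt++] = candidates[i];	/* test in the walk */
	}

	if (!fcnt)
		return;

	for (i = fcnt; i > 0; ) {
		void *f = tofree[--i];

		(void)f;	/* rb_erase(&f->fq_node, root) goes here */
	}

	free_bulk(fcnt, tofree);
}

int main(void)
{
	void *c[2] = { malloc(8), malloc(8) };

	gc_pass(c, 2);		/* both candidates gathered and bulk-freed */
	return 0;
}
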
From c288b0ca86a0a49ad450cf2d76a9a70c2ca9e43f Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sat, 2 May 2020 19:54:21 -0700
Subject: [PATCH 4/5] net_sched: sch_fq: do not call fq_peek() twice per packet

This refactors the code so that fq_dequeue_head() no longer calls
fq_peek() itself, since the caller can provide the skb.

Also rename fq_dequeue_head() to fq_dequeue_skb() because 'head' is
a bit vague, given the skb could come from the t_root rb-tree.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/sched/sch_fq.c | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 53ec47ff84693..567df8fcaf703 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -384,19 +384,17 @@ static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
 	}
 }
 
-/* remove one skb from head of flow queue */
-static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
+/* Remove one skb from flow queue.
+ * This skb must be the return value of prior fq_peek().
+ */
+static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
+			   struct sk_buff *skb)
 {
-	struct sk_buff *skb = fq_peek(flow);
-
-	if (skb) {
-		fq_erase_head(sch, flow, skb);
-		skb_mark_not_on_list(skb);
-		flow->qlen--;
-		qdisc_qstats_backlog_dec(sch, skb);
-		sch->q.qlen--;
-	}
-	return skb;
+	fq_erase_head(sch, flow, skb);
+	skb_mark_not_on_list(skb);
+	flow->qlen--;
+	qdisc_qstats_backlog_dec(sch, skb);
+	sch->q.qlen--;
 }
 
 static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
@@ -508,9 +506,11 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 	if (!sch->q.qlen)
 		return NULL;
 
-	skb = fq_dequeue_head(sch, &q->internal);
-	if (skb)
+	skb = fq_peek(&q->internal);
+	if (unlikely(skb)) {
+		fq_dequeue_skb(sch, &q->internal, skb);
 		goto out;
+	}
 
 	now = ktime_get_ns();
 	fq_check_throttled(q, now);
@@ -550,10 +550,8 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 			INET_ECN_set_ce(skb);
 			q->stat_ce_mark++;
 		}
-	}
-
-	skb = fq_dequeue_head(sch, f);
-	if (!skb) {
+		fq_dequeue_skb(sch, f, skb);
+	} else {
 		head->first = f->next;
 		/* force a pass through old_flows to prevent starvation */
 		if ((head == &q->new_flows) && q->old_flows.first) {

From 348e289b0f23451d1c47b940e759f0f3a0c5756e Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sat, 2 May 2020 19:54:22 -0700
Subject: [PATCH 5/5] net_sched: sch_fq: perform a prefetch() earlier

The prefetch() in fq_dequeue() can be issued a bit earlier, thanks to
the refactoring of the code done in the prior patch.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/sched/sch_fq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 567df8fcaf703..4f0104243cc23 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -546,6 +546,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 			fq_flow_set_throttled(q, f);
 			goto begin;
 		}
+		prefetch(&skb->end);
 		if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
 			INET_ECN_set_ce(skb);
 			q->stat_ce_mark++;
 		}
@@ -562,7 +563,6 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 		}
 		goto begin;
 	}
-	prefetch(&skb->end);
 	plen = qdisc_pkt_len(skb);
 	f->credit -= plen;
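
The pattern in the last two patches generalizes beyond qdiscs: when a
dequeue helper re-derives state the caller already obtained from a peek,
splitting it into peek + commit removes the duplicate lookup, and it lets
per-element work (the throttling check, ECN marking, the prefetch(&skb->end)
moved in patch 5) start as soon as the element is known. Below is a sketch
on a hypothetical singly-linked queue, not the sch_fq code.

#include <stddef.h>

struct node {
	struct node *next;
};

struct queue {
	struct node *head;
};

/* before: the helper repeats the head lookup and the NULL test that the
 * caller typically just performed */
static struct node *dequeue_head(struct queue *q)
{
	struct node *n = q->head;	/* second lookup */

	if (n)
		q->head = n->next;
	return n;
}

/* after: the caller peeks once, inspects, then commits */
static struct node *peek(const struct queue *q)
{
	return q->head;
}

static void dequeue_node(struct queue *q, struct node *n)
{
	/* n must be the return value of a prior peek(q) */
	q->head = n->next;
}

static struct node *pump(struct queue *q)
{
	struct node *n = peek(q);

	if (!n)
		return NULL;
	/* inspect n here: throttle checks, marking, or a prefetch of its
	 * payload can all start now, before the dequeue is committed */
	dequeue_node(q, n);
	return n;
}

int main(void)
{
	struct node a = { NULL }, b = { &a };
	struct queue q = { &b };

	dequeue_node(&q, peek(&q));	/* new style: peek once, commit */
	return dequeue_head(&q) != &a;	/* old style, kept for contrast */
}
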