From ad7c5ebead13323ac4a45e01bda0609629523076 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Thu, 21 Mar 2024 13:17:33 +0000 Subject: [PATCH 01/15] mm/slub: remove dummy slabinfo functions The SLAB implementation has been removed since 6.8, so there is no other version of slabinfo_show_stats() and slabinfo_write(), then we can remove these two dummy functions. Signed-off-by: Xiu Jianfeng Acked-by: David Rientjes Signed-off-by: Vlastimil Babka --- mm/slab.h | 3 --- mm/slab_common.c | 2 -- mm/slub.c | 10 ---------- 3 files changed, 15 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index d2bc9b1912229..78e205b46e19a 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -496,9 +496,6 @@ struct slabinfo { }; void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); -void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); -ssize_t slabinfo_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos); #ifdef CONFIG_SLUB_DEBUG #ifdef CONFIG_SLUB_DEBUG_ON diff --git a/mm/slab_common.c b/mm/slab_common.c index f5234672f03ce..67c03d6bd26cf 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1078,7 +1078,6 @@ static void cache_show(struct kmem_cache *s, struct seq_file *m) sinfo.limit, sinfo.batchcount, sinfo.shared); seq_printf(m, " : slabdata %6lu %6lu %6lu", sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); - slabinfo_show_stats(m, s); seq_putc(m, '\n'); } @@ -1155,7 +1154,6 @@ static const struct proc_ops slabinfo_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = slabinfo_open, .proc_read = seq_read, - .proc_write = slabinfo_write, .proc_lseek = seq_lseek, .proc_release = seq_release, }; diff --git a/mm/slub.c b/mm/slub.c index 1bb2a93cf7b6a..cc7e68fbdbbac 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -7099,14 +7099,4 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) sinfo->objects_per_slab = oo_objects(s->oo); sinfo->cache_order = oo_order(s->oo); } - -void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) -{ -} - -ssize_t slabinfo_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - return -EIO; -} #endif /* CONFIG_SLUB_DEBUG */ From 87654cf7a9865c0be256d67229b7354125d7498e Mon Sep 17 00:00:00 2001 From: linke li Date: Thu, 21 Mar 2024 11:48:15 +0800 Subject: [PATCH 02/15] mm/slub: mark racy accesses on slab->slabs The reads of slab->slabs are racy because it may be changed by put_cpu_partial concurrently. In slabs_cpu_partial_show() and show_slab_objects(), slab->slabs is only used for showing information. Data-racy reads from shared variables that are used only for diagnostic purposes should typically use data_race(), since it is normally not a problem if the values are off by a little. This patch is aimed at reducing the number of benign races reported by KCSAN in order to focus future debugging effort on harmful races. 
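For illustration, a minimal sketch of the annotation pattern being applied here (the variable and helper names are invented for this example and are not part of the diff below):

    #include <linux/compiler.h>

    static int shared_counter;	/* written concurrently elsewhere */

    /*
     * Diagnostic-only read: a stale or torn value only affects reported
     * statistics, so data_race() documents the race and silences KCSAN.
     */
    static int counter_for_stats_only(void)
    {
            return data_race(shared_counter);
    }

    /* A read whose result feeds back into program logic keeps READ_ONCE(). */
    static int counter_for_logic(void)
    {
            return READ_ONCE(shared_counter);
    }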
Signed-off-by: linke li Reviewed-by: Chengming Zhou Signed-off-by: Vlastimil Babka --- mm/slub.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index cc7e68fbdbbac..d35d918776250 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -6036,7 +6036,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, else if (flags & SO_OBJECTS) WARN_ON_ONCE(1); else - x = slab->slabs; + x = data_race(slab->slabs); total += x; nodes[node] += x; } @@ -6241,7 +6241,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (slab) - slabs += slab->slabs; + slabs += data_race(slab->slabs); } #endif @@ -6255,7 +6255,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (slab) { - slabs = READ_ONCE(slab->slabs); + slabs = data_race(slab->slabs); objects = (slabs * oo_objects(s->oo)) / 2; len += sysfs_emit_at(buf, len, " C%d=%d(%d)", cpu, objects, slabs); From 9198ffbd2b494daae3a67cac1d59c3a2754e64cd Mon Sep 17 00:00:00 2001 From: Chen Jun Date: Sat, 30 Mar 2024 16:23:35 +0800 Subject: [PATCH 03/15] mm/slub: Reduce memory consumption in extreme scenarios When kmalloc_node() is called without __GFP_THISNODE and the target node lacks sufficient memory, SLUB allocates a folio from a node other than the requested one, instead of taking a partial slab from it. However, since the allocated folio does not belong to the requested node, it is deactivated on the following allocation and added to the partial slab list of the node it does belong to. This behavior can result in excessive memory usage when the requested node has insufficient memory, as SLUB will repeatedly allocate folios from other nodes without reusing the previously allocated ones. To prevent memory wastage, when a preferred node is indicated (not NUMA_NO_NODE) but without a prior __GFP_THISNODE constraint: 1) try to get a partial slab from the target node only, by having __GFP_THISNODE in pc.flags for get_partial() 2) if 1) failed, try to allocate a new slab from the target node with GFP_NOWAIT | __GFP_THISNODE opportunistically. 3) if 2) failed, retry with the original gfpflags, which allows get_partial() to try the partial lists of other nodes before potentially allocating a new page from other nodes Without a preferred node, or with a __GFP_THISNODE constraint, the behavior remains unchanged. On qemu with 4 NUMA nodes, each with 1G of memory, a test module (sketched below) calling kmalloc_node(196, GFP_KERNEL, 3) (4 * 1024 + 4) * 1024 times gives the following results. Before this patch, cat /proc/slabinfo shows: kmalloc-256 4200530 13519712 256 32 2 : tunables.. After this patch, cat /proc/slabinfo shows: kmalloc-256 4200558 4200768 256 32 2 : tunables..
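A rough sketch of such a test module (the module and function names are assumptions for illustration, not taken from the original report):

    #include <linux/module.h>
    #include <linux/slab.h>

    /*
     * Allocate roughly 1GB worth of 196-byte objects (served by kmalloc-256)
     * with a preferred node that is low on memory; the objects are
     * deliberately never freed so the growth is visible in /proc/slabinfo.
     */
    static int __init kmalloc_node_repro_init(void)
    {
            long i;

            for (i = 0; i < (4L * 1024 + 4) * 1024; i++)
                    kmalloc_node(196, GFP_KERNEL, 3);
            return 0;
    }
    module_init(kmalloc_node_repro_init);

    MODULE_LICENSE("GPL");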
Signed-off-by: Chen Jun Signed-off-by: Kefeng Wang Signed-off-by: Vlastimil Babka --- mm/slub.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index d35d918776250..f152b5df8ab2e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2699,7 +2699,7 @@ static struct slab *get_partial(struct kmem_cache *s, int node, searchnode = numa_mem_id(); slab = get_partial_node(s, get_node(s, searchnode), pc); - if (slab || node != NUMA_NO_NODE) + if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) return slab; return get_any_partial(s, pc); @@ -3375,6 +3375,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, struct slab *slab; unsigned long flags; struct partial_context pc; + bool try_thisnode = true; stat(s, ALLOC_SLOWPATH); @@ -3501,6 +3502,21 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, new_objects: pc.flags = gfpflags; + /* + * When a preferred node is indicated but no __GFP_THISNODE + * + * 1) try to get a partial slab from target node only by having + * __GFP_THISNODE in pc.flags for get_partial() + * 2) if 1) failed, try to allocate a new slab from target node with + * GFP_NOWAIT | __GFP_THISNODE opportunistically + * 3) if 2) failed, retry with original gfpflags which will allow + * get_partial() try partial lists of other nodes before potentially + * allocating new page from other nodes + */ + if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) + && try_thisnode)) + pc.flags = GFP_NOWAIT | __GFP_THISNODE; + pc.orig_size = orig_size; slab = get_partial(s, node, &pc); if (slab) { @@ -3522,10 +3538,15 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, } slub_put_cpu_ptr(s->cpu_slab); - slab = new_slab(s, gfpflags, node); + slab = new_slab(s, pc.flags, node); c = slub_get_cpu_ptr(s->cpu_slab); if (unlikely(!slab)) { + if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) + && try_thisnode) { + try_thisnode = false; + goto new_objects; + } slab_out_of_memory(s, gfpflags, node); return NULL; } From acc8f4dbf1b3293dc0b5a51eadde086123806c0f Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Thu, 4 Apr 2024 13:58:24 +0800 Subject: [PATCH 04/15] mm/slub: remove the check of !kmem_cache_has_cpu_partial() With CONFIG_SLUB_CPU_PARTIAL enabled, the check of !kmem_cache_has_cpu_partial(s) here is always false: kmem_cache_debug() has already been checked earlier, and if it was true we either continued or broke from the loop, so this code cannot be reached in that case and there is no need to check kmem_cache_debug() again as part of kmem_cache_has_cpu_partial(). Remove the check. Signed-off-by: Xiongwei Song Signed-off-by: Vlastimil Babka --- mm/slub.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index f152b5df8ab2e..2e0351066c5d8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2610,8 +2610,7 @@ static struct slab *get_partial_node(struct kmem_cache *s, partial_slabs++; } #ifdef CONFIG_SLUB_CPU_PARTIAL - if (!kmem_cache_has_cpu_partial(s) - || partial_slabs > s->cpu_partial_slabs / 2) + if (partial_slabs > s->cpu_partial_slabs / 2) break; #else break; #endif From 721a2f8be134f9bb61f4358cbb7ae394eaf74573 Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Thu, 4 Apr 2024 13:58:25 +0800 Subject: [PATCH 05/15] mm/slub: add slub_get_cpu_partial() helper Add slub_get_cpu_partial() and a dummy function to help improve get_partial_node().
It can help remove the #ifdef CONFIG_SLUB_CPU_PARTIAL blocks and improve the logic for filling cpu partial slabs. Signed-off-by: Xiongwei Song Signed-off-by: Vlastimil Babka --- mm/slub.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mm/slub.c b/mm/slub.c index 2e0351066c5d8..936f2b13a78e1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -604,11 +604,21 @@ static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo)); s->cpu_partial_slabs = nr_slabs; } + +static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) +{ + return s->cpu_partial_slabs; +} #else static inline void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) { } + +static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) +{ + return 0; +} #endif /* CONFIG_SLUB_CPU_PARTIAL */ /* From ff99b18fee793826dd5604da72d6259a531b45e9 Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Thu, 4 Apr 2024 13:58:26 +0800 Subject: [PATCH 06/15] mm/slub: simplify get_partial_node() The break conditions for filling cpu partial slabs can be made more readable and simpler. If slub_get_cpu_partial() returns 0, we know we do not need to fill cpu partial slabs at all, so we should break from the loop. On the other hand, we should also break from the loop once we have added enough cpu partial slabs. Meanwhile, the logic above gets rid of the #ifdef and also fixes a weird corner case: if we set cpu_partial_slabs to 0 from sysfs, we still allocate at least one slab here. Signed-off-by: Xiongwei Song Signed-off-by: Vlastimil Babka --- mm/slub.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 936f2b13a78e1..a9b1337e81c2f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2614,18 +2614,18 @@ static struct slab *get_partial_node(struct kmem_cache *s, if (!partial) { partial = slab; stat(s, ALLOC_FROM_PARTIAL); + + if ((slub_get_cpu_partial(s) == 0)) { + break; + } } else { put_cpu_partial(s, slab, 0); stat(s, CPU_PARTIAL_NODE); - partial_slabs++; - } -#ifdef CONFIG_SLUB_CPU_PARTIAL - if (partial_slabs > s->cpu_partial_slabs / 2) - break; -#else - break; -#endif + if (++partial_slabs > slub_get_cpu_partial(s) / 2) { + break; + } + } } spin_unlock_irqrestore(&n->list_lock, flags); return partial; From b1080c667b3b2c8c38a7fa83ca5567124887abae Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 2 Apr 2024 06:38:39 -0700 Subject: [PATCH 07/15] mm/slub, kunit: Use inverted data to corrupt kmem cache Two failure patterns are seen randomly when running slub_kunit tests with CONFIG_SLAB_FREELIST_RANDOM and CONFIG_SLAB_FREELIST_HARDENED enabled. Pattern 1: # test_clobber_zone: pass:1 fail:0 skip:0 total:1 ok 1 test_clobber_zone # test_next_pointer: EXPECTATION FAILED at lib/slub_kunit.c:72 Expected 3 == slab_errors, but slab_errors == 0 (0x0) # test_next_pointer: EXPECTATION FAILED at lib/slub_kunit.c:84 Expected 2 == slab_errors, but slab_errors == 0 (0x0) # test_next_pointer: pass:0 fail:1 skip:0 total:1 not ok 2 test_next_pointer In this case, test_next_pointer() overwrites p[s->offset], but the data at p[s->offset] is already 0x12. Pattern 2: ok 1 test_clobber_zone # test_next_pointer: EXPECTATION FAILED at lib/slub_kunit.c:72 Expected 3 == slab_errors, but slab_errors == 2 (0x2) # test_next_pointer: pass:0 fail:1 skip:0 total:1 not ok 2 test_next_pointer In this case, p[s->offset] has a value other than 0x12, but one of the expected failures is nevertheless missing.
Invert data instead of writing a fixed value to corrupt the cache data structures to fix the problem. Fixes: 1f9f78b1b376 ("mm/slub, kunit: add a KUnit test for SLUB debugging functionality") Cc: Oliver Glitta Cc: Vlastimil Babka CC: Daniel Latypov Cc: Marco Elver Signed-off-by: Guenter Roeck Signed-off-by: Vlastimil Babka --- lib/slub_kunit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/slub_kunit.c b/lib/slub_kunit.c index d4a3730b08fa7..4ce9604388069 100644 --- a/lib/slub_kunit.c +++ b/lib/slub_kunit.c @@ -55,7 +55,7 @@ static void test_next_pointer(struct kunit *test) ptr_addr = (unsigned long *)(p + s->offset); tmp = *ptr_addr; - p[s->offset] = 0x12; + p[s->offset] = ~p[s->offset]; /* * Expecting three errors. From b062539c4e2f31ff346795ce32012e9e6300d212 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Mon, 25 Mar 2024 12:22:17 +0000 Subject: [PATCH 08/15] mm/slub: correct comment in do_slab_free() slab_alloc_node() should be __slab_alloc_node(). Signed-off-by: Xiu Jianfeng Signed-off-by: Vlastimil Babka --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index a9b1337e81c2f..f4fd460854fd5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4256,7 +4256,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s, c = raw_cpu_ptr(s->cpu_slab); tid = READ_ONCE(c->tid); - /* Same with comment on barrier() in slab_alloc_node() */ + /* Same with comment on barrier() in __slab_alloc_node() */ barrier(); if (unlikely(slab != c->slab)) { From 5aa5c7b9a09dfce2761c46579cc421708492e890 Mon Sep 17 00:00:00 2001 From: Sangyun Kim Date: Sat, 6 Apr 2024 16:44:49 +0900 Subject: [PATCH 09/15] mm/slub: remove duplicate initialization for early_kmem_cache_node_alloc() The struct track for every object in a new slab is already set up by new_slab(), so remove the duplicate initialization in early_kmem_cache_node_alloc(). Co-developed-by: Hyunmin Lee Signed-off-by: Hyunmin Lee Co-developed-by: Jeungwoo Yoo Signed-off-by: Jeungwoo Yoo Signed-off-by: Sangyun Kim Cc: Gwan-gyeong Mun Signed-off-by: Vlastimil Babka --- mm/slub.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index f4fd460854fd5..a30bd2157c07a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4877,7 +4877,6 @@ static void early_kmem_cache_node_alloc(int node) BUG_ON(!n); #ifdef CONFIG_SLUB_DEBUG init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); - init_tracking(kmem_cache_node, n); #endif n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); slab->freelist = get_freepointer(kmem_cache_node, n); From 5b15f3fb89fc23b52c3cf33e76a1ada83108b438 Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Sat, 13 Apr 2024 15:56:03 +0000 Subject: [PATCH 10/15] slub: Set __GFP_COMP in kmem_cache by default Now the __GFP_COMP is set only if the higher-order is not 0. However, __GFP_COMP flag can be set unconditionally because compound page can not be created in the order-0 case. And this can also simplify the code a bit (no need to check the order is 0 or not). 
Signed-off-by: Haifeng Xu Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Vlastimil Babka --- mm/slub.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index a30bd2157c07a..62a0bb2c3147d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5089,9 +5089,7 @@ static int calculate_sizes(struct kmem_cache *s) if ((int)order < 0) return 0; - s->allocflags = 0; - if (order) - s->allocflags |= __GFP_COMP; + s->allocflags = __GFP_COMP; if (s->flags & SLAB_CACHE_DMA) s->allocflags |= GFP_DMA; From 046f4c69090c120a51aa4767628afa900aac8e28 Mon Sep 17 00:00:00 2001 From: Jianfeng Wang Date: Mon, 22 Apr 2024 21:55:53 -0700 Subject: [PATCH 11/15] slub: introduce count_partial_free_approx() When reading "/proc/slabinfo", the kernel needs to report the number of free objects for each kmem_cache. The current implementation uses count_partial() to get it by scanning each kmem_cache_node's partial slab list and summing free objects from every partial slab. This process must hold per-kmem_cache_node spinlock and disable IRQ, and may take a long time. Consequently, it can block slab allocations on other CPUs and cause timeouts for network devices, when the partial list is long. In production, even NMI watchdog can be triggered due to this matter: e.g., for "buffer_head", the number of partial slabs was observed to be ~1M in one kmem_cache_node. This problem was also confirmed by others [1-3]. Iterating a partial list to get the exact count of objects can cause soft lockups for a long list with or without the lock (e.g., if preemption is disabled), and may not be very useful: the object count can change after the lock is released. The approach of maintaining free-object counters requires atomic operations on the fast path [3]. So, the fix is to introduce count_partial_free_approx(). This function can be used for getting the free object count in a kmem_cache_node's partial list. It limits the number of slabs to scan and avoids scanning the whole list by giving an approximation for a long list. Suppose the limit is N. If the list's length is not greater than N, output the exact count by traversing the list; if its length is greater than N, output an approximated count by traversing a subset of the list. The proposed method is to scan N/2 slabs from the list's head and N/2 slabs from the tail. For a partial list with ~280K slabs, benchmarks show that it performs better than just counting from the list's head, after slabs get sorted by kmem_cache_shrink(). Default the limit to 10000, as it produces an approximation within 1% of the exact count for both scenarios. Then, use count_partial_free_approx() in get_slabinfo(). 
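As a purely illustrative example with invented numbers: for a list with nr_partial = 280,000 slabs, if the 5,000 slabs scanned from the head and the 5,000 scanned from the tail together hold 62,000 free objects, the function reports mult_frac(62000, 280000, 10000) = 1,736,000 free objects, clamped to the node's total object count.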
Benchmarks: Diff = (exact - approximated) / exact * Normal case (w/o kmem_cache_shrink()): | MAX_TO_SCAN | Diff (count from head)| Diff (count head+tail)| | 1000 | 0.43 % | 1.09 % | | 5000 | 0.06 % | 0.37 % | | 10000 | 0.02 % | 0.16 % | | 20000 | 0.009 % | -0.003 % | * Skewed case (w/ kmem_cache_shrink()): | MAX_TO_SCAN | Diff (count from head)| Diff (count head+tail)| | 1000 | 12.46 % | 6.75 % | | 5000 | 5.38 % | 1.27 % | | 10000 | 4.99 % | 0.22 % | | 20000 | 4.86 % | -0.06 % | [1] https://lore.kernel.org/linux-mm/alpine.DEB.2.21.2003031602460.1537@www.lameter.com/T/ [2] https://lore.kernel.org/lkml/alpine.DEB.2.22.394.2008071258020.55871@www.lameter.com/T/ [3] https://lore.kernel.org/lkml/1e01092b-140d-2bab-aeba-321a74a194ee@linux.com/T/ Signed-off-by: Jianfeng Wang Acked-by: David Rientjes Signed-off-by: Vlastimil Babka --- mm/slub.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 62a0bb2c3147d..7e2a208134c31 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3238,6 +3238,43 @@ static unsigned long count_partial(struct kmem_cache_node *n, #endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */ #ifdef CONFIG_SLUB_DEBUG +#define MAX_PARTIAL_TO_SCAN 10000 + +static unsigned long count_partial_free_approx(struct kmem_cache_node *n) +{ + unsigned long flags; + unsigned long x = 0; + struct slab *slab; + + spin_lock_irqsave(&n->list_lock, flags); + if (n->nr_partial <= MAX_PARTIAL_TO_SCAN) { + list_for_each_entry(slab, &n->partial, slab_list) + x += slab->objects - slab->inuse; + } else { + /* + * For a long list, approximate the total count of objects in + * it to meet the limit on the number of slabs to scan. + * Scan from both the list's head and tail for better accuracy. + */ + unsigned long scanned = 0; + + list_for_each_entry(slab, &n->partial, slab_list) { + x += slab->objects - slab->inuse; + if (++scanned == MAX_PARTIAL_TO_SCAN / 2) + break; + } + list_for_each_entry_reverse(slab, &n->partial, slab_list) { + x += slab->objects - slab->inuse; + if (++scanned == MAX_PARTIAL_TO_SCAN) + break; + } + x = mult_frac(x, n->nr_partial, scanned); + x = min(x, node_nr_objs(n)); + } + spin_unlock_irqrestore(&n->list_lock, flags); + return x; +} + static noinline void slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { @@ -7116,7 +7153,7 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) for_each_kmem_cache_node(s, node, n) { nr_slabs += node_nr_slabs(n); nr_objs += node_nr_objs(n); - nr_free += count_partial(n, count_free); + nr_free += count_partial_free_approx(n); } sinfo->active_objs = nr_objs - nr_free; From b3d8a8e870144369fdbcbb1a78878ce98532265a Mon Sep 17 00:00:00 2001 From: Jianfeng Wang Date: Mon, 22 Apr 2024 21:55:54 -0700 Subject: [PATCH 12/15] slub: use count_partial_free_approx() in slab_out_of_memory() slab_out_of_memory() uses count_partial() to get the exact count of free objects for each node. As it may get called in the slab allocation path, count_partial_free_approx() can be used to avoid the risk and overhead of traversing a long partial slab list. At the same time, show_slab_objects() still uses count_partial(). Thus, slub users can still have the option to access the exact count of objects via sysfs if the overhead is acceptable to them. 
Signed-off-by: Jianfeng Wang Acked-by: David Rientjes Signed-off-by: Vlastimil Babka --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 7e2a208134c31..3aa12b9b323d9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3301,7 +3301,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) unsigned long nr_objs; unsigned long nr_free; - nr_free = count_partial(n, count_free); + nr_free = count_partial_free_approx(n); nr_slabs = node_nr_slabs(n); nr_objs = node_nr_objs(n); From 844776cb65a77ef27bfba2220e285940b714ae4e Mon Sep 17 00:00:00 2001 From: linke li Date: Sat, 27 Apr 2024 16:51:54 +0800 Subject: [PATCH 13/15] mm/slub: mark racy access on slab->freelist In deactivate_slab(), slab->freelist can be changed concurrently. Mark data race on slab->freelist as benign using READ_ONCE. This patch is aimed at reducing the number of benign races reported by KCSAN in order to focus future debugging effort on harmful races. Signed-off-by: linke li Signed-off-by: Vlastimil Babka --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 3aa12b9b323d9..88d3dcdfa45d8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2806,7 +2806,7 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, struct slab new; struct slab old; - if (slab->freelist) { + if (READ_ONCE(slab->freelist)) { stat(s, DEACTIVATE_REMOTE_FREES); tail = DEACTIVATE_TO_TAIL; } From 306c4ac9896b07b8872293eb224058ff83f81fac Mon Sep 17 00:00:00 2001 From: Hyunmin Lee Date: Wed, 24 Apr 2024 23:04:21 +0900 Subject: [PATCH 14/15] mm/slub: create kmalloc 96 and 192 caches regardless cache size order For SLAB the kmalloc caches needed to be created in ascending sizes in order. However, the constraint is not necessary anymore because SLAB has been removed and SLUB doesn't need to comply with the constraint. Thus, kmalloc 96 and 192 caches can be created after the other size kmalloc caches are created instead of checking every time to find their order to be created. Also, this change could prevent engineers from being confused by the removed constraint. Signed-off-by: Hyunmin Lee Co-developed-by: Jeungwoo Yoo Signed-off-by: Jeungwoo Yoo Co-developed-by: Sangyun Kim Signed-off-by: Sangyun Kim Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Gwan-gyeong Mun Reviewed-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 67c03d6bd26cf..7cfdcc8cbf5f8 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -916,21 +916,16 @@ void __init create_kmalloc_caches(void) * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined */ for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) { + /* Caches that are NOT of the two-to-the-power-of size. */ + if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[type][1]) + new_kmalloc_cache(1, type); + if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[type][2]) + new_kmalloc_cache(2, type); + + /* Caches that are of the two-to-the-power-of size. */ for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { if (!kmalloc_caches[type][i]) new_kmalloc_cache(i, type); - - /* - * Caches that are not of the two-to-the-power-of size. 
* These have to be created immediately after the - * earlier power of two caches - */ - if (KMALLOC_MIN_SIZE <= 32 && i == 6 && - !kmalloc_caches[type][1]) - new_kmalloc_cache(1, type); - if (KMALLOC_MIN_SIZE <= 64 && i == 7 && - !kmalloc_caches[type][2]) - new_kmalloc_cache(2, type); } } #ifdef CONFIG_RANDOM_KMALLOC_CACHES From 7338999ca3468404f547b1540211114cbdb26d06 Mon Sep 17 00:00:00 2001 From: Hyunmin Lee Date: Wed, 24 Apr 2024 23:04:22 +0900 Subject: [PATCH 15/15] mm/slub: remove the check for NULL kmalloc_caches If a kmalloc cache of the same size already exists, it should not be created again, which is why there is a check for NULL kmalloc_caches before calling the cache creation function. However, new_kmalloc_cache() itself checks for NULL kmalloc_caches before cache creation. Therefore, the NULL check is not necessary in create_kmalloc_caches(). Signed-off-by: Hyunmin Lee Co-developed-by: Jeungwoo Yoo Signed-off-by: Jeungwoo Yoo Co-developed-by: Sangyun Kim Signed-off-by: Sangyun Kim Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Gwan-gyeong Mun Reviewed-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 7cfdcc8cbf5f8..c37f8c41ffb00 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -917,16 +917,14 @@ void __init create_kmalloc_caches(void) */ for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) { /* Caches that are NOT of the two-to-the-power-of size. */ - if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[type][1]) + if (KMALLOC_MIN_SIZE <= 32) new_kmalloc_cache(1, type); - if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[type][2]) + if (KMALLOC_MIN_SIZE <= 64) new_kmalloc_cache(2, type); /* Caches that are of the two-to-the-power-of size. */ - for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { - if (!kmalloc_caches[type][i]) - new_kmalloc_cache(i, type); - } + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) + new_kmalloc_cache(i, type); } #ifdef CONFIG_RANDOM_KMALLOC_CACHES random_kmalloc_seed = get_random_u64();