From ad7c5ebead13323ac4a45e01bda0609629523076 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Thu, 21 Mar 2024 13:17:33 +0000 Subject: [PATCH 01/15] mm/slub: remove dummy slabinfo functions The SLAB implementation has been removed since 6.8, so there is no other version of slabinfo_show_stats() and slabinfo_write(), then we can remove these two dummy functions. Signed-off-by: Xiu Jianfeng Acked-by: David Rientjes Signed-off-by: Vlastimil Babka --- mm/slab.h | 3 --- mm/slab_common.c | 2 -- mm/slub.c | 10 ---------- 3 files changed, 15 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index d2bc9b1912229..78e205b46e19a 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -496,9 +496,6 @@ struct slabinfo { }; void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); -void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); -ssize_t slabinfo_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos); #ifdef CONFIG_SLUB_DEBUG #ifdef CONFIG_SLUB_DEBUG_ON diff --git a/mm/slab_common.c b/mm/slab_common.c index f5234672f03ce..67c03d6bd26cf 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1078,7 +1078,6 @@ static void cache_show(struct kmem_cache *s, struct seq_file *m) sinfo.limit, sinfo.batchcount, sinfo.shared); seq_printf(m, " : slabdata %6lu %6lu %6lu", sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); - slabinfo_show_stats(m, s); seq_putc(m, '\n'); } @@ -1155,7 +1154,6 @@ static const struct proc_ops slabinfo_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = slabinfo_open, .proc_read = seq_read, - .proc_write = slabinfo_write, .proc_lseek = seq_lseek, .proc_release = seq_release, }; diff --git a/mm/slub.c b/mm/slub.c index 1bb2a93cf7b6a..cc7e68fbdbbac 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -7099,14 +7099,4 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) sinfo->objects_per_slab = oo_objects(s->oo); sinfo->cache_order = oo_order(s->oo); } - -void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) -{ -} - -ssize_t slabinfo_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - return -EIO; -} #endif /* CONFIG_SLUB_DEBUG */ From 87654cf7a9865c0be256d67229b7354125d7498e Mon Sep 17 00:00:00 2001 From: linke li Date: Thu, 21 Mar 2024 11:48:15 +0800 Subject: [PATCH 02/15] mm/slub: mark racy accesses on slab->slabs The reads of slab->slabs are racy because it may be changed by put_cpu_partial concurrently. In slabs_cpu_partial_show() and show_slab_objects(), slab->slabs is only used for showing information. Data-racy reads from shared variables that are used only for diagnostic purposes should typically use data_race(), since it is normally not a problem if the values are off by a little. This patch is aimed at reducing the number of benign races reported by KCSAN in order to focus future debugging effort on harmful races. 
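For illustration, a minimal sketch of the annotation pattern being applied here (the variable and helper names are invented for this example and are not part of the diff below):

    #include <linux/compiler.h>

    static int shared_counter;	/* written concurrently elsewhere */

    /*
     * Diagnostic-only read: a stale or torn value only affects reported
     * statistics, so data_race() documents the race and silences KCSAN.
     */
    static int counter_for_stats_only(void)
    {
            return data_race(shared_counter);
    }

    /* A read whose result feeds back into program logic keeps READ_ONCE(). */
    static int counter_for_logic(void)
    {
            return READ_ONCE(shared_counter);
    }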
Signed-off-by: linke li Reviewed-by: Chengming Zhou Signed-off-by: Vlastimil Babka --- mm/slub.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index cc7e68fbdbbac..d35d918776250 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -6036,7 +6036,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, else if (flags & SO_OBJECTS) WARN_ON_ONCE(1); else - x = slab->slabs; + x = data_race(slab->slabs); total += x; nodes[node] += x; } @@ -6241,7 +6241,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (slab) - slabs += slab->slabs; + slabs += data_race(slab->slabs); } #endif @@ -6255,7 +6255,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (slab) { - slabs = READ_ONCE(slab->slabs); + slabs = data_race(slab->slabs); objects = (slabs * oo_objects(s->oo)) / 2; len += sysfs_emit_at(buf, len, " C%d=%d(%d)", cpu, objects, slabs); From 9198ffbd2b494daae3a67cac1d59c3a2754e64cd Mon Sep 17 00:00:00 2001 From: Chen Jun Date: Sat, 30 Mar 2024 16:23:35 +0800 Subject: [PATCH 03/15] mm/slub: Reduce memory consumption in extreme scenarios When kmalloc_node() is called without __GFP_THISNODE and the target node lacks sufficient memory, SLUB allocates a folio from a node other than the requested one, instead of taking a partial slab from it. However, since the allocated folio does not belong to the requested node, it is deactivated on the following allocation and added to the partial slab list of the node it does belong to. This behavior can result in excessive memory usage when the requested node has insufficient memory, as SLUB will repeatedly allocate folios from other nodes without reusing the previously allocated ones. To prevent memory wastage, when a preferred node is indicated (not NUMA_NO_NODE) but without a prior __GFP_THISNODE constraint: 1) try to get a partial slab from the target node only, by having __GFP_THISNODE in pc.flags for get_partial() 2) if 1) failed, try to allocate a new slab from the target node with GFP_NOWAIT | __GFP_THISNODE opportunistically. 3) if 2) failed, retry with the original gfpflags, which allows get_partial() to try the partial lists of other nodes before potentially allocating a new page from other nodes Without a preferred node, or with a __GFP_THISNODE constraint, the behavior remains unchanged. On qemu with 4 NUMA nodes, each with 1G of memory, a test module (sketched below) calling kmalloc_node(196, GFP_KERNEL, 3) (4 * 1024 + 4) * 1024 times gives the following results. Before this patch, cat /proc/slabinfo shows: kmalloc-256 4200530 13519712 256 32 2 : tunables.. After this patch, cat /proc/slabinfo shows: kmalloc-256 4200558 4200768 256 32 2 : tunables..
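A rough sketch of such a test module (the module and function names are assumptions for illustration, not taken from the original report):

    #include <linux/module.h>
    #include <linux/slab.h>

    /*
     * Allocate roughly 1GB worth of 196-byte objects (served by kmalloc-256)
     * with a preferred node that is low on memory; the objects are
     * deliberately never freed so the growth is visible in /proc/slabinfo.
     */
    static int __init kmalloc_node_repro_init(void)
    {
            long i;

            for (i = 0; i < (4L * 1024 + 4) * 1024; i++)
                    kmalloc_node(196, GFP_KERNEL, 3);
            return 0;
    }
    module_init(kmalloc_node_repro_init);

    MODULE_LICENSE("GPL");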
Signed-off-by: Chen Jun Signed-off-by: Kefeng Wang Signed-off-by: Vlastimil Babka --- mm/slub.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index d35d918776250..f152b5df8ab2e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2699,7 +2699,7 @@ static struct slab *get_partial(struct kmem_cache *s, int node, searchnode = numa_mem_id(); slab = get_partial_node(s, get_node(s, searchnode), pc); - if (slab || node != NUMA_NO_NODE) + if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) return slab; return get_any_partial(s, pc); @@ -3375,6 +3375,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, struct slab *slab; unsigned long flags; struct partial_context pc; + bool try_thisnode = true; stat(s, ALLOC_SLOWPATH); @@ -3501,6 +3502,21 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, new_objects: pc.flags = gfpflags; + /* + * When a preferred node is indicated but no __GFP_THISNODE + * + * 1) try to get a partial slab from target node only by having + * __GFP_THISNODE in pc.flags for get_partial() + * 2) if 1) failed, try to allocate a new slab from target node with + * GFP_NOWAIT | __GFP_THISNODE opportunistically + * 3) if 2) failed, retry with original gfpflags which will allow + * get_partial() try partial lists of other nodes before potentially + * allocating new page from other nodes + */ + if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) + && try_thisnode)) + pc.flags = GFP_NOWAIT | __GFP_THISNODE; + pc.orig_size = orig_size; slab = get_partial(s, node, &pc); if (slab) { @@ -3522,10 +3538,15 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, } slub_put_cpu_ptr(s->cpu_slab); - slab = new_slab(s, gfpflags, node); + slab = new_slab(s, pc.flags, node); c = slub_get_cpu_ptr(s->cpu_slab); if (unlikely(!slab)) { + if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) + && try_thisnode) { + try_thisnode = false; + goto new_objects; + } slab_out_of_memory(s, gfpflags, node); return NULL; } From acc8f4dbf1b3293dc0b5a51eadde086123806c0f Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Thu, 4 Apr 2024 13:58:24 +0800 Subject: [PATCH 04/15] mm/slub: remove the check of !kmem_cache_has_cpu_partial() With CONFIG_SLUB_CPU_PARTIAL enabled, the check of !kmem_cache_has_cpu_partial(s) here is always false: kmem_cache_debug() has already been checked earlier, and if it was true we either continued or broke from the loop, so this code cannot be reached in that case and there is no need to check kmem_cache_debug() again as part of kmem_cache_has_cpu_partial(). Remove the check. Signed-off-by: Xiongwei Song Signed-off-by: Vlastimil Babka --- mm/slub.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index f152b5df8ab2e..2e0351066c5d8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2610,8 +2610,7 @@ static struct slab *get_partial_node(struct kmem_cache *s, partial_slabs++; } #ifdef CONFIG_SLUB_CPU_PARTIAL - if (!kmem_cache_has_cpu_partial(s) - || partial_slabs > s->cpu_partial_slabs / 2) + if (partial_slabs > s->cpu_partial_slabs / 2) break; #else break; #endif From 721a2f8be134f9bb61f4358cbb7ae394eaf74573 Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Thu, 4 Apr 2024 13:58:25 +0800 Subject: [PATCH 05/15] mm/slub: add slub_get_cpu_partial() helper Add slub_get_cpu_partial() and a dummy function to help improve get_partial_node().
It can help remove the #ifdef CONFIG_SLUB_CPU_PARTIAL blocks and improve the logic for filling cpu partial slabs. Signed-off-by: Xiongwei Song Signed-off-by: Vlastimil Babka --- mm/slub.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mm/slub.c b/mm/slub.c index 2e0351066c5d8..936f2b13a78e1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -604,11 +604,21 @@ static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo)); s->cpu_partial_slabs = nr_slabs; } + +static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) +{ + return s->cpu_partial_slabs; +} #else static inline void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) { } + +static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) +{ + return 0; +} #endif /* CONFIG_SLUB_CPU_PARTIAL */ /* From ff99b18fee793826dd5604da72d6259a531b45e9 Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Thu, 4 Apr 2024 13:58:26 +0800 Subject: [PATCH 06/15] mm/slub: simplify get_partial_node() The break conditions for filling cpu partial slabs can be made more readable and simpler. If slub_get_cpu_partial() returns 0, we know we do not need to fill cpu partial slabs at all, so we should break from the loop. On the other hand, we should also break from the loop once we have added enough cpu partial slabs. Meanwhile, the logic above gets rid of the #ifdef and also fixes a weird corner case: if we set cpu_partial_slabs to 0 from sysfs, we still allocate at least one slab here. Signed-off-by: Xiongwei Song Signed-off-by: Vlastimil Babka --- mm/slub.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 936f2b13a78e1..a9b1337e81c2f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2614,18 +2614,18 @@ static struct slab *get_partial_node(struct kmem_cache *s, if (!partial) { partial = slab; stat(s, ALLOC_FROM_PARTIAL); + + if ((slub_get_cpu_partial(s) == 0)) { + break; + } } else { put_cpu_partial(s, slab, 0); stat(s, CPU_PARTIAL_NODE); - partial_slabs++; - } -#ifdef CONFIG_SLUB_CPU_PARTIAL - if (partial_slabs > s->cpu_partial_slabs / 2) - break; -#else - break; -#endif + if (++partial_slabs > slub_get_cpu_partial(s) / 2) { + break; + } + } } spin_unlock_irqrestore(&n->list_lock, flags); return partial; From b1080c667b3b2c8c38a7fa83ca5567124887abae Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 2 Apr 2024 06:38:39 -0700 Subject: [PATCH 07/15] mm/slub, kunit: Use inverted data to corrupt kmem cache Two failure patterns are seen randomly when running slub_kunit tests with CONFIG_SLAB_FREELIST_RANDOM and CONFIG_SLAB_FREELIST_HARDENED enabled. Pattern 1: # test_clobber_zone: pass:1 fail:0 skip:0 total:1 ok 1 test_clobber_zone # test_next_pointer: EXPECTATION FAILED at lib/slub_kunit.c:72 Expected 3 == slab_errors, but slab_errors == 0 (0x0) # test_next_pointer: EXPECTATION FAILED at lib/slub_kunit.c:84 Expected 2 == slab_errors, but slab_errors == 0 (0x0) # test_next_pointer: pass:0 fail:1 skip:0 total:1 not ok 2 test_next_pointer In this case, test_next_pointer() overwrites p[s->offset], but the data at p[s->offset] is already 0x12. Pattern 2: ok 1 test_clobber_zone # test_next_pointer: EXPECTATION FAILED at lib/slub_kunit.c:72 Expected 3 == slab_errors, but slab_errors == 2 (0x2) # test_next_pointer: pass:0 fail:1 skip:0 total:1 not ok 2 test_next_pointer In this case, p[s->offset] has a value other than 0x12, but one of the expected failures is nevertheless missing.
Invert data instead of writing a fixed value to corrupt the cache data structures to fix the problem. Fixes: 1f9f78b1b376 ("mm/slub, kunit: add a KUnit test for SLUB debugging functionality") Cc: Oliver Glitta Cc: Vlastimil Babka CC: Daniel Latypov Cc: Marco Elver Signed-off-by: Guenter Roeck Signed-off-by: Vlastimil Babka --- lib/slub_kunit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/slub_kunit.c b/lib/slub_kunit.c index d4a3730b08fa7..4ce9604388069 100644 --- a/lib/slub_kunit.c +++ b/lib/slub_kunit.c @@ -55,7 +55,7 @@ static void test_next_pointer(struct kunit *test) ptr_addr = (unsigned long *)(p + s->offset); tmp = *ptr_addr; - p[s->offset] = 0x12; + p[s->offset] = ~p[s->offset]; /* * Expecting three errors. From b062539c4e2f31ff346795ce32012e9e6300d212 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Mon, 25 Mar 2024 12:22:17 +0000 Subject: [PATCH 08/15] mm/slub: correct comment in do_slab_free() slab_alloc_node() should be __slab_alloc_node(). Signed-off-by: Xiu Jianfeng Signed-off-by: Vlastimil Babka --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index a9b1337e81c2f..f4fd460854fd5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4256,7 +4256,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s, c = raw_cpu_ptr(s->cpu_slab); tid = READ_ONCE(c->tid); - /* Same with comment on barrier() in slab_alloc_node() */ + /* Same with comment on barrier() in __slab_alloc_node() */ barrier(); if (unlikely(slab != c->slab)) { From 5aa5c7b9a09dfce2761c46579cc421708492e890 Mon Sep 17 00:00:00 2001 From: Sangyun Kim Date: Sat, 6 Apr 2024 16:44:49 +0900 Subject: [PATCH 09/15] mm/slub: remove duplicate initialization for early_kmem_cache_node_alloc() The struct track for every object in a new slab is already set up by new_slab(), so remove the duplicate initialization in early_kmem_cache_node_alloc(). Co-developed-by: Hyunmin Lee Signed-off-by: Hyunmin Lee Co-developed-by: Jeungwoo Yoo Signed-off-by: Jeungwoo Yoo Signed-off-by: Sangyun Kim Cc: Gwan-gyeong Mun Signed-off-by: Vlastimil Babka --- mm/slub.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index f4fd460854fd5..a30bd2157c07a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4877,7 +4877,6 @@ static void early_kmem_cache_node_alloc(int node) BUG_ON(!n); #ifdef CONFIG_SLUB_DEBUG init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); - init_tracking(kmem_cache_node, n); #endif n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); slab->freelist = get_freepointer(kmem_cache_node, n); From 5b15f3fb89fc23b52c3cf33e76a1ada83108b438 Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Sat, 13 Apr 2024 15:56:03 +0000 Subject: [PATCH 10/15] slub: Set __GFP_COMP in kmem_cache by default Now the __GFP_COMP is set only if the higher-order is not 0. However, __GFP_COMP flag can be set unconditionally because compound page can not be created in the order-0 case. And this can also simplify the code a bit (no need to check the order is 0 or not). 
Signed-off-by: Haifeng Xu Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Vlastimil Babka --- mm/slub.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index a30bd2157c07a..62a0bb2c3147d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5089,9 +5089,7 @@ static int calculate_sizes(struct kmem_cache *s) if ((int)order < 0) return 0; - s->allocflags = 0; - if (order) - s->allocflags |= __GFP_COMP; + s->allocflags = __GFP_COMP; if (s->flags & SLAB_CACHE_DMA) s->allocflags |= GFP_DMA; From 046f4c69090c120a51aa4767628afa900aac8e28 Mon Sep 17 00:00:00 2001 From: Jianfeng Wang Date: Mon, 22 Apr 2024 21:55:53 -0700 Subject: [PATCH 11/15] slub: introduce count_partial_free_approx() When reading "/proc/slabinfo", the kernel needs to report the number of free objects for each kmem_cache. The current implementation uses count_partial() to get it by scanning each kmem_cache_node's partial slab list and summing free objects from every partial slab. This process must hold per-kmem_cache_node spinlock and disable IRQ, and may take a long time. Consequently, it can block slab allocations on other CPUs and cause timeouts for network devices, when the partial list is long. In production, even NMI watchdog can be triggered due to this matter: e.g., for "buffer_head", the number of partial slabs was observed to be ~1M in one kmem_cache_node. This problem was also confirmed by others [1-3]. Iterating a partial list to get the exact count of objects can cause soft lockups for a long list with or without the lock (e.g., if preemption is disabled), and may not be very useful: the object count can change after the lock is released. The approach of maintaining free-object counters requires atomic operations on the fast path [3]. So, the fix is to introduce count_partial_free_approx(). This function can be used for getting the free object count in a kmem_cache_node's partial list. It limits the number of slabs to scan and avoids scanning the whole list by giving an approximation for a long list. Suppose the limit is N. If the list's length is not greater than N, output the exact count by traversing the list; if its length is greater than N, output an approximated count by traversing a subset of the list. The proposed method is to scan N/2 slabs from the list's head and N/2 slabs from the tail. For a partial list with ~280K slabs, benchmarks show that it performs better than just counting from the list's head, after slabs get sorted by kmem_cache_shrink(). Default the limit to 10000, as it produces an approximation within 1% of the exact count for both scenarios. Then, use count_partial_free_approx() in get_slabinfo(). 
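As a purely illustrative example with invented numbers: for a list with nr_partial = 280,000 slabs, if the 5,000 slabs scanned from the head and the 5,000 scanned from the tail together hold 62,000 free objects, the function reports mult_frac(62000, 280000, 10000) = 1,736,000 free objects, clamped to the node's total object count.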
Benchmarks: Diff = (exact - approximated) / exact * Normal case (w/o kmem_cache_shrink()): | MAX_TO_SCAN | Diff (count from head)| Diff (count head+tail)| | 1000 | 0.43 % | 1.09 % | | 5000 | 0.06 % | 0.37 % | | 10000 | 0.02 % | 0.16 % | | 20000 | 0.009 % | -0.003 % | * Skewed case (w/ kmem_cache_shrink()): | MAX_TO_SCAN | Diff (count from head)| Diff (count head+tail)| | 1000 | 12.46 % | 6.75 % | | 5000 | 5.38 % | 1.27 % | | 10000 | 4.99 % | 0.22 % | | 20000 | 4.86 % | -0.06 % | [1] https://lore.kernel.org/linux-mm/alpine.DEB.2.21.2003031602460.1537@www.lameter.com/T/ [2] https://lore.kernel.org/lkml/alpine.DEB.2.22.394.2008071258020.55871@www.lameter.com/T/ [3] https://lore.kernel.org/lkml/1e01092b-140d-2bab-aeba-321a74a194ee@linux.com/T/ Signed-off-by: Jianfeng Wang Acked-by: David Rientjes Signed-off-by: Vlastimil Babka --- mm/slub.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 62a0bb2c3147d..7e2a208134c31 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3238,6 +3238,43 @@ static unsigned long count_partial(struct kmem_cache_node *n, #endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */ #ifdef CONFIG_SLUB_DEBUG +#define MAX_PARTIAL_TO_SCAN 10000 + +static unsigned long count_partial_free_approx(struct kmem_cache_node *n) +{ + unsigned long flags; + unsigned long x = 0; + struct slab *slab; + + spin_lock_irqsave(&n->list_lock, flags); + if (n->nr_partial <= MAX_PARTIAL_TO_SCAN) { + list_for_each_entry(slab, &n->partial, slab_list) + x += slab->objects - slab->inuse; + } else { + /* + * For a long list, approximate the total count of objects in + * it to meet the limit on the number of slabs to scan. + * Scan from both the list's head and tail for better accuracy. + */ + unsigned long scanned = 0; + + list_for_each_entry(slab, &n->partial, slab_list) { + x += slab->objects - slab->inuse; + if (++scanned == MAX_PARTIAL_TO_SCAN / 2) + break; + } + list_for_each_entry_reverse(slab, &n->partial, slab_list) { + x += slab->objects - slab->inuse; + if (++scanned == MAX_PARTIAL_TO_SCAN) + break; + } + x = mult_frac(x, n->nr_partial, scanned); + x = min(x, node_nr_objs(n)); + } + spin_unlock_irqrestore(&n->list_lock, flags); + return x; +} + static noinline void slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { @@ -7116,7 +7153,7 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) for_each_kmem_cache_node(s, node, n) { nr_slabs += node_nr_slabs(n); nr_objs += node_nr_objs(n); - nr_free += count_partial(n, count_free); + nr_free += count_partial_free_approx(n); } sinfo->active_objs = nr_objs - nr_free; From b3d8a8e870144369fdbcbb1a78878ce98532265a Mon Sep 17 00:00:00 2001 From: Jianfeng Wang Date: Mon, 22 Apr 2024 21:55:54 -0700 Subject: [PATCH 12/15] slub: use count_partial_free_approx() in slab_out_of_memory() slab_out_of_memory() uses count_partial() to get the exact count of free objects for each node. As it may get called in the slab allocation path, count_partial_free_approx() can be used to avoid the risk and overhead of traversing a long partial slab list. At the same time, show_slab_objects() still uses count_partial(). Thus, slub users can still have the option to access the exact count of objects via sysfs if the overhead is acceptable to them. 
Signed-off-by: Jianfeng Wang Acked-by: David Rientjes Signed-off-by: Vlastimil Babka --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 7e2a208134c31..3aa12b9b323d9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3301,7 +3301,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) unsigned long nr_objs; unsigned long nr_free; - nr_free = count_partial(n, count_free); + nr_free = count_partial_free_approx(n); nr_slabs = node_nr_slabs(n); nr_objs = node_nr_objs(n); From 844776cb65a77ef27bfba2220e285940b714ae4e Mon Sep 17 00:00:00 2001 From: linke li Date: Sat, 27 Apr 2024 16:51:54 +0800 Subject: [PATCH 13/15] mm/slub: mark racy access on slab->freelist In deactivate_slab(), slab->freelist can be changed concurrently. Mark data race on slab->freelist as benign using READ_ONCE. This patch is aimed at reducing the number of benign races reported by KCSAN in order to focus future debugging effort on harmful races. Signed-off-by: linke li Signed-off-by: Vlastimil Babka --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 3aa12b9b323d9..88d3dcdfa45d8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2806,7 +2806,7 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, struct slab new; struct slab old; - if (slab->freelist) { + if (READ_ONCE(slab->freelist)) { stat(s, DEACTIVATE_REMOTE_FREES); tail = DEACTIVATE_TO_TAIL; } From 306c4ac9896b07b8872293eb224058ff83f81fac Mon Sep 17 00:00:00 2001 From: Hyunmin Lee Date: Wed, 24 Apr 2024 23:04:21 +0900 Subject: [PATCH 14/15] mm/slub: create kmalloc 96 and 192 caches regardless cache size order For SLAB the kmalloc caches needed to be created in ascending sizes in order. However, the constraint is not necessary anymore because SLAB has been removed and SLUB doesn't need to comply with the constraint. Thus, kmalloc 96 and 192 caches can be created after the other size kmalloc caches are created instead of checking every time to find their order to be created. Also, this change could prevent engineers from being confused by the removed constraint. Signed-off-by: Hyunmin Lee Co-developed-by: Jeungwoo Yoo Signed-off-by: Jeungwoo Yoo Co-developed-by: Sangyun Kim Signed-off-by: Sangyun Kim Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Gwan-gyeong Mun Reviewed-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 67c03d6bd26cf..7cfdcc8cbf5f8 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -916,21 +916,16 @@ void __init create_kmalloc_caches(void) * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined */ for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) { + /* Caches that are NOT of the two-to-the-power-of size. */ + if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[type][1]) + new_kmalloc_cache(1, type); + if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[type][2]) + new_kmalloc_cache(2, type); + + /* Caches that are of the two-to-the-power-of size. */ for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { if (!kmalloc_caches[type][i]) new_kmalloc_cache(i, type); - - /* - * Caches that are not of the two-to-the-power-of size. 
* These have to be created immediately after the - * earlier power of two caches - */ - if (KMALLOC_MIN_SIZE <= 32 && i == 6 && - !kmalloc_caches[type][1]) - new_kmalloc_cache(1, type); - if (KMALLOC_MIN_SIZE <= 64 && i == 7 && - !kmalloc_caches[type][2]) - new_kmalloc_cache(2, type); } } #ifdef CONFIG_RANDOM_KMALLOC_CACHES From 7338999ca3468404f547b1540211114cbdb26d06 Mon Sep 17 00:00:00 2001 From: Hyunmin Lee Date: Wed, 24 Apr 2024 23:04:22 +0900 Subject: [PATCH 15/15] mm/slub: remove the check for NULL kmalloc_caches If a kmalloc cache of the same size already exists, it should not be created again, which is why there is a check for NULL kmalloc_caches before calling the cache creation function. However, new_kmalloc_cache() itself checks for NULL kmalloc_caches before cache creation. Therefore, the NULL check is not necessary in create_kmalloc_caches(). Signed-off-by: Hyunmin Lee Co-developed-by: Jeungwoo Yoo Signed-off-by: Jeungwoo Yoo Co-developed-by: Sangyun Kim Signed-off-by: Sangyun Kim Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Gwan-gyeong Mun Reviewed-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 7cfdcc8cbf5f8..c37f8c41ffb00 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -917,16 +917,14 @@ void __init create_kmalloc_caches(void) */ for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) { /* Caches that are NOT of the two-to-the-power-of size. */ - if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[type][1]) + if (KMALLOC_MIN_SIZE <= 32) new_kmalloc_cache(1, type); - if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[type][2]) + if (KMALLOC_MIN_SIZE <= 64) new_kmalloc_cache(2, type); /* Caches that are of the two-to-the-power-of size. */ - for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { - if (!kmalloc_caches[type][i]) - new_kmalloc_cache(i, type); - } + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) + new_kmalloc_cache(i, type); } #ifdef CONFIG_RANDOM_KMALLOC_CACHES random_kmalloc_seed = get_random_u64();